!echo $(date)

Tue May 31 01:36:34 MDT 2022


%load_ext autoreload
%autoreload 2
# picks up changes in repo (code edits, switching branches etc.)


import numpy as np
import pandas as pd
pd.options.display.max_colwidth = 0
pd.options.display.max_columns = 96
pd.options.display.max_rows = 96

import logging
import os
import sys

formatter = logging.Formatter(
    fmt='%(asctime)s.%(msecs)03d %(levelname)s [%(name)s] %(message)s',
    datefmt='%y%m%d@%H:%M:%S',
)

logger = logging.getLogger('misosoup')
logger.setLevel(logging.DEBUG)
handler_stdout = logging.StreamHandler(stream=sys.stdout)
handler_stdout.setFormatter(formatter)

if not logger.hasHandlers():
    logger.addHandler(handler_stdout)  # log to STDOUT or Jupyter

sys.path.append('../../misosoup-preview')
import misosoup

220531@01:36:35.147 DEBUG [misosoup] misosoup package (re)loaded


from misosoup.viz import add_zoom, apply_style, plot_xit
apply_style()

import altair as alt
# import panel as pn
# pn.extension('vega')


os.listdir()  # default working directory is set to `data`

['frame', 'ms1', 'ms2', 'msms', 'peak', 'peak_msms', 'run', 'xic']


peaks = pd.read_parquet('peak')
peaks.sort_values('intensity', ascending=False).head(10)


from misosoup.sql import MisoQuery as MSQ


msq = MSQ("SELECT 'forty-two' AS answer_to_life_universe_and_everything")
msq.run()

220531@01:36:35.662 DEBUG [misosoup.sql] query returned 1 rows


MSQ('PRAGMA show_tables').run()

220531@01:36:35.696 DEBUG [misosoup.sql] query returned 8 rows


msq = MSQ('''
SELECT table_name 
FROM information_schema.tables

/* also works in DuckDB:
PRAGMA show_tables
*/
''')
msq.count.run()

220531@01:36:35.730 DEBUG [misosoup.sql] query returned 1 rows


msq.run()

220531@01:36:35.759 DEBUG [misosoup.sql] query returned 8 rows


msq = MSQ('''
DESCRIBE SELECT * FROM run
''')
msq.run()

220531@01:36:35.790 DEBUG [misosoup.sql] query returned 10 rows


runs = MSQ('SELECT * FROM run').run()
runs.drop(columns=['extra_info'])

220531@01:36:35.833 DEBUG [misosoup.sql] query returned 1 rows


runs.loc[0, 'extra_info']

{'analysis_info': {'processed_at': '220524@20:34:45UTC',
  'tof_to_mz_polynomial': [-1.0256627815483128e-24,
   1.3651354742787636e-18,
   6.3186824399878066e-09,
   0.001066560347108718,
   44.997722832120516]},
 'metadata': {'AcquisitionDateTime': '2019-04-29T19:28:59.393+01:00',
  'AcquisitionFirmwareVersion': '<unknown>',
  'AcquisitionSoftware': 'Bruker otofControl',
  'AcquisitionSoftwareVendor': 'Bruker',
  'AcquisitionSoftwareVersion': '6.0.106.192-14501-vc141',
  'AnalysisId': '00000000-0000-0000-0000-000000000000',
  'ClosedProperly': '1',
  'DenoisingEnabled': '0',
  'Description': '',
  'DigitizerNumSamples': '419587',
  'InstrumentFamily': '9',
  'InstrumentName': 'timsTOF Pro',
  'InstrumentRevision': '1',
  'InstrumentSerialNumber': '1838271.11',
  'InstrumentSourceType': '1',
  'InstrumentVendor': 'Bruker',
  'MaxNumPeaksPerScan': '267',
  'MethodName': 'MS_PASEF_2cycles_Int100cts_Targ4k.m',
  'MzAcqRangeLower': '50.000000',
  'MzAcqRangeUpper': '1600.000000',
  'OneOverK0AcqRangeLower': '0.602912',
  'OneOverK0AcqRangeUpper': '1.951418',
  'OperatorName': 'Demo User',
  'PeakListIndexScaleFactor': '1',
  'PeakWidthEstimateType': None,
  'PeakWidthEstimateValue': None,
  'SampleName': 'SRM1950_20min',
  'SchemaType': 'TDF',
  'SchemaVersionMajor': '3',
  'SchemaVersionMinor': '1',
  'TimsCompressionType': '2'},
 'method_lc': None,
 'method_ms': None,
 'plate_id': None}


msq = MSQ('''
SELECT * FROM frame
WHERE msrun_id = 'LIPID6950'
''')
msq.count.run()

220531@01:36:35.915 DEBUG [misosoup.sql] query returned 1 rows


chrom_df = msq.run()  # both ms_level 1 and 2
chrom_df

220531@01:36:35.955 DEBUG [misosoup.sql] query returned 11106 rows


# shortcut: 
msq = misosoup.sql.get_chromatograms('LIPID6950', ms_level=1)
print(msq)
chrom_df = msq.run()

/* misosoup/duckdb/get_chromatograms.sql

Get chromatogram (frame-by-frame overview) by msrun_id

*/

SELECT
    f.rt,
    f.tic,
    f.top_intensity AS bpc,
    f.num_signals,
    f.msrun_id
FROM frame f
WHERE 1=1
    AND f.ms_level = 1
ORDER BY msrun_id, rt

220531@01:36:36.019 DEBUG [misosoup.sql] query returned 3794 rows


def plot_chromatograms(df, y1='bpc', y2='tic', height=240, width=960):
    wheel_zoom_x = alt.selection_interval(
        bind='scales',
        encodings=['x'],
        zoom="wheel!"
    )
    chromatogram = (
        alt.layer(
            alt.Chart(df, title=f"{y1.upper()} (red) and {y2.upper()} (blue)")
            .mark_line(color='red', strokeWidth=0.75).encode(x='rt', y=y1),
            alt.Chart(df)
            .mark_line(color='blue', strokeWidth=0.5).encode(x='rt', y=y2)
        ).resolve_scale(y='independent')
        .add_selection(wheel_zoom_x)
        .configure_axisLeft(titleColor='red')
        .configure_axisRight(titleColor='blue')
        .configure_view(continuousHeight=height, continuousWidth=width)
    )
    return chromatogram
chrom = plot_chromatograms(chrom_df, y1='tic', y2='bpc')
chrom


msq = MSQ('''
SELECT
    f.rt, ms1.*
FROM ms1
JOIN frame f
    ON f.msrun_id = ms1.msrun_id
    AND f.frame = ms1.frame
WHERE f.msrun_id = 'LIPID6950'
''')
msq.count.run()

220531@01:36:36.472 DEBUG [misosoup.sql] query returned 1 rows


msq.run(footer="USING SAMPLE 3")  # or LIMIT 3

220531@01:36:37.124 DEBUG [misosoup.sql] query returned 3 rows


msq.count.run(footer="WHERE intensity >= 1000")

220531@01:36:37.298 DEBUG [misosoup.sql] query returned 1 rows


msq.count.run(footer="WHERE mz BETWEEN 430.90 AND 430.93")

220531@01:36:37.412 DEBUG [misosoup.sql] query returned 1 rows


msq = MSQ('''
SELECT
    f.rt, ms1.*
FROM ms1
JOIN frame f
    ON f.msrun_id = ms1.msrun_id
    AND f.frame = ms1.frame
WHERE f.msrun_id = 'LIPID6950'
    AND f.rt < 30
    AND ms1.intensity >= 2000
''')
ms1 = msq.run()

220531@01:36:37.939 DEBUG [misosoup.sql] query returned 20176 rows


plot_xit(ms1, x='rt', y='mz', title='Background ions used for mass calibration', figsize=(600, 480))


biggest_early_clusters = ms1.groupby(ms1.mz.round(2)).intensity.sum()[lambda x: x > 5e6]
biggest_early_clusters

mz
226.95    38153849
362.93    16746598
430.91    30811059
498.90    8157733 
566.89    9480526 
634.88    5219263 
Name: intensity, dtype: int64


# we see repeating units of ~67.99, which matches sodium formate
np.ediff1d(biggest_early_clusters.index)

array([135.98,  67.98,  67.99,  67.99,  67.99])


msq = MSQ('''
SELECT
    msrun_id
    ,mz_group
    ,SUM(intensity) AS sum_intensity
    ,COUNT(1) AS signal_count
    ,AVG(mz) AS simple_avg_mz
    --,SUM(mz*intensity) / SUM(intensity) - AVG(mz) AS weighted_unweighted_difference
    ,SUM(mz*intensity) / SUM(intensity) AS weighted_avg_mz
FROM ms1
GROUP BY msrun_id, mz_group
ORDER BY SUM(intensity) DESC
LIMIT 5
''')
top5 = msq.run()

220531@01:36:38.869 DEBUG [misosoup.sql] query returned 5 rows


top5


top5_theor_masses = [369.35158, 758.56943, 786.60073, 760.58508, 759.57280]
top5_annotations = [
    'C27H45 (cholesterol + H+ –H2O)',
    'C42H81NO8P (PLPC, 34:2 PC, M0 isotope)',
    'C44H85NO8P (DOPC, 36:2 PC)',
    'C42H83NO8P (POPC, 34:1 PC)',
    'C42H81NO8P (PLPC, 34:2 PC, M1 isotope)',
]
top5['theor_mz'] = top5_theor_masses
top5['error_ppm'] = ((1 - top5['weighted_avg_mz']/top5['theor_mz']) * 1e6).round(1)
top5['annotation'] = top5_annotations
top5


# misosoup.mol.lrms('C42H82NO10P', charge=1).head(3)
# [misosoup.mol.lrms(x, charge=1).head(2) for x in ['C27H45', 'C42H81NO8P', 'C42H83NO8P', 'C44H85NO8P']]
# [misosoup.mol.hrms(x, charge=1).iloc[0,0] for x in ['C27H45', 'C42H81NO8P', 'C42H83NO8P', 'C44H85NO8P']]


mz_group = 257
msq = MSQ(f'''
SELECT *
FROM xic
WHERE msrun_id = 'LIPID6950'
    AND mz_group = {mz_group}
''')
xic = msq.run()

msq = MSQ(f'''
SELECT
    f.rt, ms1.*, p.msms_id
FROM ms1
JOIN frame f
    ON f.msrun_id = ms1.msrun_id
    AND f.frame = ms1.frame
LEFT JOIN peak_msms p
    ON p.msrun_id = ms1.msrun_id
    AND p.ms1_id = ms1.id
WHERE f.msrun_id = 'LIPID6950'
    AND ms1.mz_group = {mz_group}
    AND ms1.intensity > 99
''')
ms1 = msq.run()

220531@01:36:39.062 DEBUG [misosoup.sql] query returned 417 rows
220531@01:36:40.844 DEBUG [misosoup.sql] query returned 10351 rows

xic


%%time
msq = MSQ('''
SELECT neighbors.*
FROM ms1 self, ms1 neighbors
WHERE self.msrun_id = 'LIPID6950'
    AND self.msrun_id = neighbors.msrun_id
    AND self.mz_group = neighbors.mz_group
    AND self.frame = neighbors.frame
    AND self.tof - neighbors.tof BETWEEN -3 AND 3
    AND self.spectrum - neighbors.spectrum IN (-3, -2, -1, 1, 2, 3)
''')
msq.count.run()

220531@01:36:49.266 DEBUG [misosoup.sql] query returned 1 rows
CPU times: user 26.7 s, sys: 1.68 s, total: 28.4 s
Wall time: 8.31 s


msq = MSQ('''
SELECT *
FROM peak
WHERE msrun_id = 'LIPID6950'
''')
peaks = msq.run()

220531@01:36:49.308 DEBUG [misosoup.sql] query returned 9620 rows


misosoup.viz.plot_fims_with_rt_filter(peaks.query("intensity >= 1e5"), rt_window=120)


peaks[peaks.feature_id.isin(peaks[peaks.peak_num == 4].feature_id.unique())].sort_values(['feature_id','peak_num'])


%%time
features = misosoup.sql.get_ms1_features('LIPID6950', feature_id=6391, rt_window=15, spectrum_window=25).run()

220531@01:36:50.991 DEBUG [misosoup.sql] query returned 1166 rows
CPU times: user 2.07 s, sys: 279 ms, total: 2.35 s
Wall time: 1.44 s


plot_xit(
    features,
    x=alt.X('retention_time:Q', scale=alt.Scale(domain=(807.5, 814.5), nice=False)),
    y=alt.Y('y:Q', scale=alt.Scale(domain=(333, 364), nice=False)),
    tooltip=['peak_id', 'peak_num', 'peak_mz', 'peak_intensity', 'intensity'],
    title='XIT view of a feature (monoisotopic mz=813.6879, up to five isotopes are seen)',
    figsize=(960, 360)
).transform_calculate(
    retention_time='datum.rt + datum.peak_num*.04',
    y='datum.spectrum + datum.peak_num*.1',
)  # this is to highlight isotopes


msms = MSQ('''
SELECT *
FROM msms
WHERE msrun_id = 'LIPID6950'
''').run()
msms.sample(3, random_state=420)

220531@01:36:51.214 DEBUG [misosoup.sql] query returned 83780 rows


pmsms = MSQ('''
SELECT *
FROM peak_msms
WHERE msrun_id = 'LIPID6950'
''').run()

220531@01:36:51.298 DEBUG [misosoup.sql] query returned 62017 rows


# False, True: unfragmented peaks
# True, False: MSMS events not assigned to a peak
# True, True: MSMS event matched to a called peak
pmsms.groupby(['msrun_id', pmsms.msms_id.notna(), pmsms.peak_id.notna()]).agg({
    'msms_id': lambda x: x.nunique(), # number of unique MSMS events
    'peak_id': lambda x: x.nunique(), # number of unique peaks
})


pmsms


def xit_pasef_plot(ms1, pmsms, mz_group=None, figsize=(960, 360)):
    x_scale = alt.Scale(domain=(ms1.rt.min(), ms1.rt.max()))
    if mz_group is None:
        _ms1_rows = ms1.mz_group > 0
        _pmsms_rows = pmsms.mz_group.notna()
    else:
        _ms1_rows = ms1.mz_group == mz_group
        _pmsms_rows = pmsms.mz_group == mz_group
    xit_pasef = alt.layer(
        plot_xit(
            ms1[_ms1_rows],
            x=alt.X('rt', scale=x_scale),
            title=f'XIT: mzgroup {mz_group}, mz={ms1[_ms1_rows].mz.mean():.4f}; PASEF windows (yellow)'
        ),
        alt.Chart(pmsms[_pmsms_rows]).mark_rect(strokeWidth=2.5, stroke='yellow').encode(
            x='msms_rt', x2='msms_rt', y='spectrum_min', y2='spectrum_max', tooltip=['msms_id','msms_rt','mz_min','mz_max']
        )
    ).properties(width=figsize[0], height=figsize[1])
    return xit_pasef

# xit_pasef_plot(ms1, pmsms, mz_group=257)


def plot_xic(xic, x='rt', mz_group=None, title="XIC", figsize=(960,240)):
    if mz_group is None:
        _rows = xic.mz_group.notna()
    else:
        _rows = xic.mz_group == mz_group
        
    data = xic.loc[
        _rows,
        ['mz_group','rt','frame','mz','n_spectra','xic_peak','auc','intensity']
    ].copy()
    y_scale = alt.Scale(type="sqrt", domain=(0, data.intensity.max()*1.1))
    chart = (
        alt.Chart(data)
        .mark_line() #point=alt.OverlayMarkDef())
        .encode(
            x='rt',
            y=alt.Y('intensity', scale=y_scale),
        )
        +
        alt.Chart(data, title=title)
        .mark_point(stroke="black", strokeWidth=0.5)
        .encode(
            x=x,
            y=alt.Y('intensity', scale=y_scale),
            fill=alt.Color('mz', scale=alt.Scale(scheme='turbo')),
            size=alt.condition("datum.auc > 0", alt.value(100), alt.value(25)),
            tooltip=[
                'mz_group', 'frame', 'rt',
                alt.Tooltip('mz', format='.4f'),
                'intensity', 'n_spectra', 'xic_peak',
                alt.Tooltip('auc', format=',d')
            ]

        )
    ).properties(width=figsize[0], height=figsize[1])
    return chart

def xic_xit_plot(xic, ms1, pmsms, mz_group=None, figsize=(800, (200, 360))):
    x_scale = alt.Scale(domain=(ms1.rt.min(), ms1.rt.max()))
    xic_plot = plot_xic(
        xic,
        x=alt.X('rt', scale=x_scale),
        figsize=(figsize[0], figsize[1][0]),
        title="Extracted Ion Chromatogram (XIC)"
    )

    if mz_group is None:
        _ms1_rows = ms1.mz_group > 0
        _pmsms_rows = pmsms.mz_group.notna()
        mz_group = ms1.mz_group.value_counts().index[0]
    else:
        _ms1_rows = ms1.mz_group == mz_group
        _pmsms_rows = pmsms.mz_group == mz_group
        
    xic_xit = add_zoom(
        xic_plot
        & (
            plot_xit(
                ms1[_ms1_rows],
                x=alt.X('rt', scale=x_scale),
                figsize=(figsize[0], figsize[1][1]),
                title=f'Extracted Ion Trace (XIT)' #': mzgroup {mz_group}, mz={ms1[_ms1_rows].mz.mean():.4f}'
            )
            +
            plot_xit(
                ms1[ms1.msms_id.notna()],
                x=alt.X('rt', scale=x_scale), lc="magenta", shape='diamond', lw=5, size=100,
                tooltip=['msms_id', 'rt', 'mz'],
                figsize=(figsize[0], figsize[1][1]),
                title=f'XIT: mzgroup {mz_group}, mz={ms1[_ms1_rows].mz.mean():.4f}'
            )
            +
            alt.Chart(pmsms[_pmsms_rows]).mark_rect(strokeWidth=5, stroke='black').encode(
                x='msms_rt', x2='msms_rt', y='spectrum_min', y2='spectrum_max', tooltip=['msms_id','msms_rt','mz_min','mz_max']
            )
        )
    ).resolve_scale(y='independent', fill='independent')    
    return xic_xit

xic_xit_plot(xic, ms1, pmsms, mz_group=257, figsize=(800, (200, 350)))


msq = MSQ('''
WITH ms2_by_msms_id AS (
    SELECT
        msms_id
        ,mz_group
        ,tof
        ,mz
        ,SUM(intensity) AS intensity
        ,msrun_id
    FROM ms2
    WHERE msrun_id = 'LIPID6950'
        AND mz_group > 0
        AND msms_id IN (9620, 9653, 9908, 9626, 9736, 9983)
    GROUP BY msms_id, mz_group, tof, mz, msrun_id
)
,centroided_via_mzgroup AS (
    SELECT
        msms_id
        ,mz_group
        ,ROUND(SUM(mz*intensity) / SUM(intensity), 6) AS mz_cent
        ,SUM(intensity) AS intensity
        ,msrun_id
    FROM ms2_by_msms_id
    GROUP BY msms_id, mz_group, msrun_id
)
,ms2_peak AS (
    SELECT
        ms2.mz_cent AS mz
        ,ms2.intensity
        ,1000*ms2.intensity / MAX(ms2.intensity) OVER(
                PARTITION BY ms2.msrun_id, ms2.msms_id
        ) AS normalized_intensity  -- comes back as integer because the numerator is integer
        ,pmsms.*
        ,RANK() OVER(
            PARTITION BY pmsms.msrun_id, pmsms.msms_id
            ORDER BY peak_intensity DESC
        ) AS peak_rank
    FROM centroided_via_mzgroup ms2
    JOIN peak_msms pmsms
        ON pmsms.msrun_id = ms2.msrun_id
        AND pmsms.msms_id = ms2.msms_id
)
SELECT *
FROM ms2_peak
WHERE peak_rank = 1
    AND normalized_intensity >= 1  --discard low-intensity MS2 signals
ORDER BY mz
''')
ms2peak = msq.run()

220531@01:36:52.185 DEBUG [misosoup.sql] query returned 42 rows


# ms2peak.loc[ms2peak.spectrum_max > 730, 'intensity'] *= -1
ms2peak.loc[ms2peak.spectrum_max < 730, 'normalized_intensity'] *= -1
# ms2peak.sort_values(['msms_id', 'mz'])


def ms2plot(df, x='mz:Q', tooltip=None, figsize=(960, 240), title='MS2'):
    ms2plot = (
        alt.Chart(df, title=title)
        .mark_bar(width=4)
        .encode(
            x=x,
            y=alt.Y('normalized_intensity', scale=alt.Scale(type='linear')),
#             y=alt.Y('intensity', scale=alt.Scale(type='linear')),
            color=alt.Color('msms_id:N'),
#             color=alt.Color('spectrum_max'),
            tooltip=tooltip or ['msms_id','mz_group','mz','intensity'],
        )
        .properties(width=figsize[0], height=figsize[1])
    )
    return ms2plot
add_zoom(ms2plot(ms2peak, title='MS2 spectra from different fragmentation events within a region of interest'))


help(misosoup.mol.hrms)

Help on function hrms in module misosoup.mol:

hrms(formula, charge=1, precision=6, th=0.001) -> pandas.core.frame.DataFrame
    Simulates a high-resolution MS with fine isotopic distribution.
    
    Parameters
    ----------
    formula : str
        Molecular formula as a string.
    charge : int (default: +1)
        Charge of the molecule
    precision : int, default 6
        Rounds m/z values to this number of significant digits
    th : float, default 1e-3
        A parameter for IsoSpecPy that signifies a threshold
        of probability (i.e. rel. isotope abundance) that remains unassigned
        to a particular mass.  By default, when .999 of probability is covered,
        no additional isotopes are considered
    
    Returns
    -------
    Pandas dataframe with standard autoincrementing index and columns:
    - tmz: theoretical m/z; mass of gained/lost electrons is accounted for
    - tri: theoretical relative intensity, normalized to 1 million
    
    Notes
    -----
    The formula is passed through the molmass interpretor, but IsoSpecPy
    won't handle inputs with specified isotope labels that are permitted
    in molmass (e.g. [13C]O2).  Method ``lrms`` uses molmass only, and can
    handle compounds with unnatural isotopic composition.   If fine
    isotopic patterns are needed, check IsoSpecPy documentation:
    https://github.com/MatteoLacki/IsoSpec/blob/master/Examples/Python


misosoup.mol.hrms('C42H80NO8P + H', charge=1)


misosoup.mol.lrms('C42H80NO8P + H', charge=1)


misosoup.mol.mw_range('C42H80NO8P + H', charge=1, tol_ppm=5)

array([758.565639, 758.573225])


%%time
con2 = misosoup.sql.register_parquet_data('../moredata/')  # 100 runs here
con2.execute('SELECT COUNT (DISTINCT msrun_id) AS n_runs, COUNT(*) AS n_ms1_signals FROM ms1').df()

CPU times: user 12.3 s, sys: 3.67 s, total: 16 s
Wall time: 14.2 s


!echo $(date)

Tue May 31 01:37:22 MDT 2022


from nbconvert import HTMLExporter

html_exporter = HTMLExporter()
html_exporter.template_name = 'classic'

with open('../notebooks/MisoSoup-Preview.ipynb', 'r') as f_input:
    (body, resources) = html_exporter.from_file(f_input)

with open('../MisoSoup-Preview.html', 'w') as f_output:
    f_output.write(body)


!aws s3 cp ./MisoSoup-Preview.html s3://misosoup/MisoSoup-Preview.html --acl public-read

upload failed: ./MisoSoup-Preview.html to s3://misosoup/MisoSoup-Preview.html An error occurred (AccessDenied) when calling the CreateMultipartUpload operation: Access Denied

	peak_id	ms1_id	mz_group	xic_peak	frame	rt	spectrum	tof	mz	feature_id	peak_num	delta_mz	intensity	n_signals	msrun_id
6428	6429	29306510	2007	7828	4217	457.446	418	262086	758.57290	5282	0	0.00000	270528988	315	LIPID6950
338	339	50103169	386	561	9473	1025.158	404	157377	369.35292	273	0	0.00000	253825387	315	LIPID6950
336	337	49935426	386	560	9389	1016.074	390	157378	369.35289	271	0	0.00000	147664238	315	LIPID6950
6474	6475	29306528	2015	7886	4217	457.446	418	262315	759.57685	5282	1	1.00395	147403940	315	LIPID6950
6905	6906	29144293	2126	8400	4070	441.660	408	267524	782.57314	5672	0	0.00000	141372070	315	LIPID6950
7049	7050	31452389	2163	8578	5543	599.838	396	268429	786.60458	5795	0	0.00000	133112148	315	LIPID6950
340	341	50600498	386	562	9650	1044.275	403	157377	369.35292	275	0	0.00000	121445541	315	LIPID6950
6526	6527	30951942	2025	7952	5156	558.203	408	262546	760.58868	5370	0	0.00000	119913218	315	LIPID6950
8756	8757	49912174	2546	10743	9374	1014.482	302	287681	874.79003	7199	0	0.00000	117542775	315	LIPID6950
1802	1803	7903690	881	2282	944	102.912	607	195873	496.34118	1450	0	0.00000	97482575	313	LIPID6950

	column_name	column_type	null	key	default	extra
0	name	VARCHAR	YES	NaN	NaN	NaN
1	uri	VARCHAR	YES	NaN	NaN	NaN
2	sample_type	VARCHAR	YES	NaN	NaN	NaN
3	ionization_mode	VARCHAR	YES	NaN	NaN	NaN
4	extra_info	STRUCT(analysis_info STRUCT(processed_at VARCHAR, tof_to_mz_polynomial DOUBLE[]), metadata STRUCT(AcquisitionDateTime VARCHAR, AcquisitionFirmwareVersion VARCHAR, AcquisitionSoftware VARCHAR, AcquisitionSoftwareVendor VARCHAR, AcquisitionSoftwareVersion VARCHAR, AnalysisId VARCHAR, ClosedProperly VARCHAR, DenoisingEnabled VARCHAR, Description VARCHAR, DigitizerNumSamples VARCHAR, InstrumentFamily VARCHAR, InstrumentName VARCHAR, InstrumentRevision VARCHAR, InstrumentSerialNumber VARCHAR, InstrumentSourceType VARCHAR, InstrumentVendor VARCHAR, MaxNumPeaksPerScan VARCHAR, MethodName VARCHAR, MzAcqRangeLower VARCHAR, MzAcqRangeUpper VARCHAR, OneOverK0AcqRangeLower VARCHAR, OneOverK0AcqRangeUpper VARCHAR, OperatorName VARCHAR, PeakListIndexScaleFactor VARCHAR, PeakWidthEstimateType VARCHAR, PeakWidthEstimateValue VARCHAR, SampleName VARCHAR, SchemaType VARCHAR, SchemaVersionMajor VARCHAR, SchemaVersionMinor VARCHAR, TimsCompressionType VARCHAR), method_lc VARCHAR, method_ms VARCHAR, plate_id VARCHAR)	YES	NaN	NaN	NaN
5	batch_name	VARCHAR	YES	NaN	NaN	NaN
6	batch_eln_id	VARCHAR	YES	NaN	NaN	NaN
7	batch_id	VARCHAR	YES	NaN	NaN	NaN
8	vendor_id	BIGINT	YES	NaN	NaN	NaN
9	id	VARCHAR	YES	NaN	NaN	NaN

	frame	ms_level	rt	num_spectra	num_signals	accum_time	tic	top_intensity	msrun_id
0	1	1	0.772	953	24611	99.96	1317429	357	LIPID6950
1	2	1	0.873	953	23922	99.96	1275343	325	LIPID6950
2	3	1	0.979	953	24715	99.96	1315576	272	LIPID6950
3	4	1	1.083	953	24513	99.96	1313304	354	LIPID6950
4	5	1	1.188	953	25174	99.96	1345134	333	LIPID6950
...	...	...	...	...	...	...	...	...	...
11101	11102	1	1200.460	953	24584	99.96	1313494	388	LIPID6950
11102	11103	2	1200.579	953	10	99.96	508	113	LIPID6950
11103	11104	2	1200.680	953	18	99.96	804	91	LIPID6950
11104	11105	1	1200.787	953	24635	99.96	1314192	227	LIPID6950
11105	11106	2	1200.897	953	15	99.96	584	75	LIPID6950

	rt	id	frame	spectrum	tof	mz	mz_group	raw_intensity	intensity	msrun_id
0	742.529	37116696	6860	665	248777	701.413726	1778	30	70	LIPID6950
1	799.829	41075905	7385	377	276194	821.606931	2355	35	124	LIPID6950
2	928.738	45773239	8579	156	216063	570.429928	1236	47	150	LIPID6950

	msrun_id	mz_group	sum_intensity	signal_count	simple_avg_mz	weighted_avg_mz
0	LIPID6950	386	1.258454e+09	52621	369.355004	369.354520
1	LIPID6950	2007	8.743499e+08	29498	758.568052	758.573391
2	LIPID6950	2163	5.793982e+08	58802	786.608457	786.604380
3	LIPID6950	2025	5.752966e+08	42895	760.587664	760.587282
4	LIPID6950	2015	4.555507e+08	33596	759.589593	759.578361

MisoSoup Preview¶

Imports & Settings¶

Viz¶

Data Model¶

Working directly with Parquet files¶

MisoQuery: Why Use SQL on Mass-Spec Data?¶

Using MisoQuery¶

One-line queries¶

How many tables to we have?¶

What are they called?¶

Table Metadata¶

Run Metadata¶

How do I get the raw data? Where did it come from?¶

Frames: Chromatographic Overview of a Run¶

MS1 Data¶

MS1 Signal table¶

All data¶

Optimizing the amount of data to pull¶

Highlighting the calibrant ions¶

MZ groups¶

XIC¶

Peaks & Features¶

The SELF range JOIN¶

Peaks¶

FIMS Plot¶

Features with many isotopes¶

MS2 Data¶

MSMS¶

PeakMSMS¶

Compare MS2 for several MSMS events¶

Molecular Weights Calculations¶

Scaling DuckDB¶

Publish¶

	ms1_frame_number	frame	rt	mz_group	mz	mz_min	mz_max	n_tof_bins	spectrum_min	spectrum_med	spectrum_max	n_spectra	n_signals	intensity	xic_peak	auc	msrun_id
0	596	1514	164.435	257	338.321487	338.311739	338.338058	3	701	726	734	6	6	270	0	0	LIPID6950
1	699	1823	197.476	257	338.341348	338.338058	338.343906	3	698	708	732	8	8	578	0	0	LIPID6950
2	711	1859	201.368	257	338.344272	338.340982	338.346831	3	699	709	730	8	8	379	0	0	LIPID6950
3	720	1886	204.328	257	338.342931	338.340982	338.343906	2	701	704	729	6	6	379	0	0	LIPID6950
4	721	1889	204.650	257	338.343581	338.340982	338.349755	3	688	703	736	8	9	693	0	0	LIPID6950
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
412	2633	7625	825.920	257	338.343723	338.340982	338.346831	3	699	709	737	15	16	3969	374	13220	LIPID6950
413	2634	7628	826.246	257	338.344491	338.338058	338.355604	5	701	708	739	12	15	2439	0	0	LIPID6950
414	2635	7631	826.574	257	338.344742	338.343906	338.346831	2	705	709	735	7	7	1593	0	0	LIPID6950
415	2636	7634	826.901	257	338.342444	338.340982	338.343906	2	708	713	719	8	8	1773	0	0	LIPID6950
416	2638	7640	827.567	257	338.344638	338.340982	338.346831	3	697	709	733	8	8	1402	0	0	LIPID6950

	peak_id	ms1_id	mz_group	xic_peak	frame	rt	spectrum	tof	mz	feature_id	peak_num	delta_mz	intensity	n_signals	msrun_id
7778	7779	41629582	2312	9471	7484	810.581	351	274451	813.68788	6391	0	0.00000	34129144	316	LIPID6950
7819	7820	41629353	2320	9520	7484	810.581	348	274674	814.69104	6391	1	1.00316	17309762	316	LIPID6950
7846	7847	41629606	2325	9558	7484	810.581	351	274894	815.69350	6391	2	2.00562	4976400	317	LIPID6950
7868	7869	41629292	2332	9589	7484	810.581	347	275115	816.69619	6391	3	3.00831	1029110	280	LIPID6950
7890	7891	41639926	2337	9613	7490	811.230	347	275336	817.69877	6391	4	4.01089	140879	105	LIPID6950
7850	7851	43096254	2325	9562	7859	851.341	340	274896	815.70455	6447	0	0.00000	17155632	315	LIPID6950
7873	7874	43096268	2332	9593	7859	851.341	340	275117	816.70731	6447	1	1.00276	8943724	315	LIPID6950
7893	7894	43096210	2337	9616	7859	851.341	339	275337	817.70973	6447	2	2.00518	2498386	318	LIPID6950
7917	7918	43078430	2340	9646	7853	850.690	341	275558	818.71265	6447	3	3.00810	526234	319	LIPID6950
7936	7937	43066327	2346	9672	7850	850.363	340	275780	819.71736	6447	4	4.01281	51889	97	LIPID6950
8461	8462	49530509	2440	10339	9200	995.729	323	281672	846.75875	6961	0	0.00000	26086502	313	LIPID6950
8473	8474	49530354	2446	10356	9200	995.729	321	281889	847.76098	6961	1	1.00223	15159894	315	LIPID6950
8487	8488	49524946	2451	10381	9197	995.407	323	282104	848.76400	6961	2	2.00525	4727607	324	LIPID6950
8509	8510	49535759	2457	10412	9203	996.054	322	282321	849.76713	6961	3	3.00838	1226790	304	LIPID6950
8536	8537	49535682	2461	10451	9203	996.054	321	282538	850.76931	6961	4	4.01056	182559	98	LIPID6950

	id	parent_frame	child_frame	spectrum_min	spectrum_max	isolation_mz	isolation_width	collision_energy	uncorrected_precursor_mz	precursor_mz	precursor_charge	precursor_intensity	msrun_id
58442	34500	8183	8185	112	138	701.683227	2.0	30.0	701.403839	701.402542	1	1480.0	LIPID6950
9284	5597	1592	1594	740	766	349.355381	2.0	30.0	349.210580	349.209517	1	1001.0	LIPID6950
14066	8469	2264	2266	606	632	429.424546	2.0	30.0	429.238584	429.237342	1	7292.0	LIPID6950

			msms_id	peak_id
msrun_id	msms_id	peak_id
LIPID6950	False	True	0	1590
	True	False	39945	0
	True	True	9238	8030

	feature_id	peak_num	peak_id	xic_peak	mz_group	ms1_id	peak_frame	peak_spectrum	peak_intensity	precursor_mz	msms_id	mz_min	mz_max	msms_rt	msms_frame	spectrum_min	spectrum_max	instrument_precursor_mz	collision_energy	msrun_id
0	0.0	0.0	1.0	1.0	1.0	18941272.0	2267.0	677.0	61732.0	149.02315	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	LIPID6950
1	1.0	0.0	2.0	2.0	4.0	1197909.0	45.0	797.0	124324.0	158.96439	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	LIPID6950
2	2.0	0.0	3.0	3.0	10.0	9941222.0	1334.0	678.0	47898.0	171.13834	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	LIPID6950
3	3.0	0.0	4.0	6.0	13.0	7457949.0	782.0	609.0	244113.0	184.07338	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	LIPID6950
4	4.0	0.0	5.0	7.0	13.0	7777723.0	890.0	603.0	1323317.0	184.07368	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	LIPID6950
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
62012	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	26856.0	512.587200	514.587200	731.453	6758.0	590.0	616.0	513.369283	30.0	LIPID6950
62013	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	26861.0	759.045908	761.045908	731.453	6758.0	379.0	405.0	759.637560	30.0	LIPID6950
62014	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	26862.0	684.697335	686.697335	731.453	6758.0	432.0	458.0	685.430491	30.0	LIPID6950
62015	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	26860.0	894.020795	896.020795	731.453	6758.0	335.0	361.0	894.557759	30.0	LIPID6950
62016	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	26859.0	363.807218	365.807218	731.453	6758.0	745.0	771.0	-0.001267	30.0	LIPID6950

	tmz	tri
0	758.569432	613813
12	759.566467	2243
1	759.572787	281150
8	759.573649	1875
10	759.575709	5753
13	760.569822	1027
5	760.573677	10097
2	760.576142	62856
9	760.577004	859
11	760.579064	2635
6	761.577032	4625
3	761.579497	9139
7	762.580387	1034
4	762.582851	971

	tmz	tri
758	758.56943	616160
759	759.57280	289766
760	760.57584	76720
761	761.57875	14566
762	762.58197	1070
763	763.58531	9