# config/config.yaml

# This file should contain everything to configure the workflow on a global scale.
# For sample-based data, it should be complemented by a samples.tsv file that contains
# one row per sample. It can be parsed easily via pandas.
# PEP settings: specification version plus the sample and subsample tables.
pep_version: "2.1.0"
sample_table: dataset.tsv
subsample_table:
  - samples.tsv

# One entry per project: its name, its sample tables, and the path stored under `rules`.
projects:
  - name: Test
    sample_table: dataset.tsv
    subsample_table:
      - samples.tsv
    rules: config/config.yaml

#### RULE CONFIGURATION ####
# rules: set a value to true if you want to run that analysis, or to false if you don't.
rules:
  # true only for *.raw files from Thermo
  fileconversion: false
  # From raw data to a table of features
  preprocessing: true
  # true for files with common features
  requantification: false
  # All the files necessary for FBMN
  GNPS_export: true
  # Annotate the feature matrix with predictions for chemical formula,
  # structure (CSI:FingerID) and chemical classes (CANOPUS)
  SIRIUS: false

  # The following rules require GNPS_export: true

  # Spectral matching with in-house or any downloaded MSMS library
  # & feature matrix annotation (MSI level 2 annotations)
  spectralmatcher: false
  # Machine learning tool for spectral matching and analogue annotation
  # (spec2vec and MS2DeepScore)
  MS2Query: false
  # After FBMN is finished: integration of formula and structural predictions
  # to the GraphML network file. Optionally, annotate with the MSMS library
  # matches from GNPS also (MSI level 2)
  fbmn_integration: false

#### PARAMETER CONFIGURATION ####
# Set values for the most important parameters of your run.
# 1) System requirements:
system:
  # Memory limit that snakemake is allowed to use
  memory: 300000
  # Maximum number of threads per rule
  threads: 16

# 2) Pre-processing parameters that are instrument and method/data dependent:
preprocess:
  # Noise threshold. Instrument dependent (example for Orbitrap IDX)
  noise_thr: "1.0e04"
  # Mass accuracy related (in ppm)
  mass_error: "10.0"
  # Set to "false" if you do not want to filter out peaks without isotopic pattern
  rm_single_traces: "true"
  # Expected chromatographic peak width (in seconds)
  fwhm: "5.0"
  # Minimum expected length of a mass trace (in seconds)
  min_trace: "5.0"

# 3) Adduct annotation (set the possible adducts):
adducts:
  # Either "positive" or "negative"
  ion_mode: "positive"
  # Change according to the common adducts generated by the instrument
  adducts_pos: "H:+:0.6 Na:+:0.1 NH4:+:0.1 H-1O-1:+:0.1 H-3O-2:+:0.1"
  # Change according to the common adducts generated by the instrument
  adducts_neg: "H-1:-:1 H-2O-1:0:0.05 CH2O2:0:0.5"

# 4) Requantification:
requantification:
  # Increase depending on instrument accuracy
  mz_window: 10.0
  # Increase depending on chromatographic shifts
  RT_window: 30.0

# 5) Linking features:
featurelink:
  # Increase depending on instrument accuracy
  mz_tol: 8.0
  # Increase depending on chromatographic shifts
  rt_tol: 30.0

# 6) Map alignment:
align:
  # Instrument specific (in ppm): do not pair features whose m/z distance
  # is larger than this number
  mz_max: 10.0

# 7) SIRIUS/CSI:FingerID/CANOPUS
SIRIUS:
  # Only export input files for SIRIUS, but don't actually execute SIRIUS here
  export_only: false
  # CSI:FingerID and CANOPUS
  predict_structure_and_class: true
  # combine_annotations:
  #   true  --> combine annotations (e.g. SIRIUS_molecularFormula) from all files
  #             into a single column separated by " ## "
  #   false --> keep a separate column for each file
  #             (e.g. sample1_SIRIUS_molecularFormula, sample2_SIRIUS_molecularFormula, ...)
  combine_annotations: true
  max_mz: 300
  # Valid: 'default', 'qtof', 'orbitrap', 'fticr'
  instrument: "orbitrap"
  pos_ions_considered: "[M+H]+,[M-H2O+H]+,[M+Na]+,[M+NH4]+"
  neg_ions_considered: "[M-H]-,[M-H2O-H]-,[M-HCOOH]-"
  elements_considered: "SBrClBSe"
  elements_enforced: "CHNOP"
  ppm_max: 10
  ppm_max_ms2: 10
  # Search formulas in the union of the given databases db-name1,db-name2,db-name3.
  # If no database is given, all possible molecular formulas will be respected
  # (no database is used).
  # Possible DBs: ALL,BIO,PUBCHEM,MESH,HMDB,KNAPSACK,CHEBI,PUBMED,KEGG,HSDB,
  # MACONDA,METACYC,GNPS,ZINCBIO,UNDP,YMDB,PLANTCYC,NORMAN,ADDITIONAL,
  # PUBCHEMANNOTATIONBIO,PUBCHEMANNOTATIONDRUG,PUBCHEMANNOTATIONSAFETYANDTOXIC,
  # PUBCHEMANNOTATIONFOOD,KEGGMINE,ECOCYCMINE,YMDBMINE
  formula_database: "none"
  structure_database: "BIO"