-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathconfig.yaml
78 lines (69 loc) · 4.22 KB
/
config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# This file should contain everything needed to configure the workflow on a global scale.
# For sample-based data, it should be complemented by a samples.tsv file that contains
# one row per sample. That file can be parsed easily with pandas.
# Project description: version marker plus the sample and subsample tables.
# The version is quoted so it is always read as a string.
pep_version: "2.1.0"
sample_table: "dataset.tsv"
subsample_table: ["samples.tsv"]
# Each list entry describes one project. Its keys are nested under the entry
# so they do not collide with the top-level keys of the same name.
projects:
  - name: Test
    sample_table: dataset.tsv
    subsample_table: ["samples.tsv"]
    rules: "config/config.yaml"
#### RULE CONFIGURATION ####
# rules: set a rule to true if you want to run that analysis, or to false if you don't.
rules:
  # true only for *.raw files from Thermo.
  fileconversion: false
  # From raw data to a table of features.
  preprocessing: true
  # true for files with common features.
  requantification: false
  # All the files necessary for FBMN.
  GNPS_export: true
  # Annotate the feature matrix with predictions for chemical formula,
  # structure (CSI:FingerID) and chemical classes (CANOPUS).
  SIRIUS: false
  # The following rules require GNPS_export: true
  # Spectral matching with in-house or any downloaded MSMS library and
  # feature matrix annotation (MSI level 2 annotations).
  spectralmatcher: false
  # Machine learning tool for spectral matching and analogue annotation
  # (spec2vec and MS2DeepScore).
  MS2Query: false
  # After FBMN is finished: integration of formula and structural predictions
  # into the GraphML network file. Optionally, also annotate with the MSMS
  # library matches from GNPS (MSI level 2).
  fbmn_integration: false
#### PARAMETER CONFIGURATION ####
# Set the values of the most important parameters for your run:
# 1) Set the system requirements:
system:
  # Memory limit that snakemake is allowed to use.
  # NOTE(review): the unit is not stated in this file (presumably MB) - confirm.
  memory: 300000
  # Maximum number of threads per rule.
  threads: 16
# 2) Set parameters for pre-processing that are instrument and method/data dependent.
# Every value in this section is quoted, so it loads as a string rather than
# as a number or boolean.
# NOTE(review): presumably the consumer expects strings - confirm before unquoting.
preprocess:
  # Noise threshold. Instrument dependent (example for Orbitrap IDX).
  noise_thr: "1.0e04"
  # Mass accuracy related (in ppm).
  mass_error: "10.0"
  # Set to "false" if you do not want to filter out peaks without isotopic pattern.
  rm_single_traces: "true"
  # Expected chromatographic peak width (in seconds).
  fwhm: "5.0"
  # Minimum expected length of a mass trace (in seconds).
  min_trace: "5.0"
# 3) Adduct annotation (set possible adducts):
adducts:
  # Either "positive" or "negative".
  ion_mode: "positive"
  # Change according to the common adducts generated by the instrument.
  adducts_pos: "H:+:0.6 Na:+:0.1 NH4:+:0.1 H-1O-1:+:0.1 H-3O-2:+:0.1"
  # Change according to the common adducts generated by the instrument.
  adducts_neg: "H-1:-:1 H-2O-1:0:0.05 CH2O2:0:0.5"
# 4) Requantification:
requantification:
  # Increase depending on instrument accuracy.
  mz_window: 10.0
  # Increase depending on chromatographic shifts.
  RT_window: 30.0
# 5) Linking features:
featurelink:
  # Increase depending on instrument accuracy.
  mz_tol: 8.0
  # Increase depending on chromatographic shifts.
  rt_tol: 30.0
# 6) Map alignment:
align:
  # (in ppm) Do not pair features with an m/z distance larger than this
  # number. Instrument specific.
  mz_max: 10.0
# 7) SIRIUS/CSI:FingerID/CANOPUS
SIRIUS:
  # Only export input files for SIRIUS, but don't actually execute SIRIUS here.
  export_only: false
  # CSI:FingerID and CANOPUS.
  predict_structure_and_class: true
  # combine_annotations:
  #   true  --> combine annotations (e.g. SIRIUS_molecularFormula) from all
  #             files into a single column separated by " ## "
  #   false --> keep a separate column for each file (e.g.
  #             sample1_SIRIUS_molecularFormula, sample2_SIRIUS_molecularFormula, ...)
  combine_annotations: true
  max_mz: 300
  # Valid: 'default', 'qtof', 'orbitrap', 'fticr'.
  instrument: 'orbitrap'
  pos_ions_considered: "[M+H]+,[M-H2O+H]+,[M+Na]+,[M+NH4]+"
  neg_ions_considered: "[M-H]-,[M-H2O-H]-,[M-HCOOH]-"
  elements_considered: "SBrClBSe"
  elements_enforced: "CHNOP"
  ppm_max: 10
  ppm_max_ms2: 10
  # Search formulas in the union of the given databases
  # (db-name1,db-name2,db-name3). If no database is given, all possible
  # molecular formulas will be respected (no database is used).
  # Possible DBs:
  #   ALL,BIO,PUBCHEM,MESH,HMDB,KNAPSACK,CHEBI,PUBMED,KEGG,HSDB,MACONDA,
  #   METACYC,GNPS,ZINCBIO,UNDP,YMDB,PLANTCYC,NORMAN,ADDITIONAL,
  #   PUBCHEMANNOTATIONBIO,PUBCHEMANNOTATIONDRUG,
  #   PUBCHEMANNOTATIONSAFETYANDTOXIC,PUBCHEMANNOTATIONFOOD,KEGGMINE,
  #   ECOCYCMINE,YMDBMINE
  # Quoted so the value is unambiguously the string "none" and not a YAML null.
  formula_database: "none"
  structure_database: "BIO"