diff --git a/README.md b/README.md
index 0a6b362..7199737 100644
--- a/README.md
+++ b/README.md
@@ -35,7 +35,7 @@ You can perform the pipeline deploy defining a directory `my_dest_dir` for analysis
 ```bash
 snakedeploy deploy-workflow https://github.com/GeneBANGS/RNASeq.git my_dest_dir
-    --tag v1.0
+    --tag v1.1
 ```
 To run the pipeline, go inside the deployed pipeline folder and use the command:
 ```bash
diff --git a/workflow/Snakefile b/workflow/Snakefile
index dd1511d..ba98726 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -16,9 +16,11 @@
 include: "rules/mapping.smk"
 include: "rules/rseqc.smk"
 include: "rules/qc.smk"
+
 ##### local rules #####
 localrules:
-    all
+    all,
+
 
 ##### target rules #####
 rule all:
@@ -45,9 +47,10 @@
             sample=samples.reset_index().itertuples(),
         ),
         qc=resolve_results_filepath(
-            config.get("paths").get("results_dir"), "qc/multiqc.html"),
-
-    threads: conservative_cpu_count(reserve_cores=2,max_cores=99)
+            config.get("paths").get("results_dir"), "qc/multiqc.html"
+        ),
+    threads: conservative_cpu_count(reserve_cores=2, max_cores=99)
     resources:
-        tmpdir=config.get("paths").get("tmp_dir")
-    message: "Concluding the workflow!"
+        tmpdir=config.get("paths").get("tmp_dir"),
+    message:
+        "Concluding the workflow!"
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index def3ded..9e411d0 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -9,7 +9,7 @@ from snakemake.utils import validate
 report: "../report/workflow.rst"
 
 
-#validate(config, schema="../schemas/config.schema.yaml")
+# validate(config, schema="../schemas/config.schema.yaml")
 
 
 samples = pd.read_table(config.get("samples"), index_col="sample")
@@ -17,10 +17,13 @@ units = pd.read_table(config.get("units"), index_col=["unit"], dtype=str)
 
 
 ## pipeline-related functions
-def get_unit_fastqs(wildcards, samples, label='units',read_pair='fq'):
-    for unit_set in samples.loc[wildcards.sample,[label]]:
+def get_unit_fastqs(wildcards, samples, label="units", read_pair="fq"):
+    for unit_set in samples.loc[wildcards.sample, [label]]:
         wildcards.sample
-        return [expand_filepath(units.loc[x,[read_pair]].dropna()[0]) for x in unit_set.split(',')]
+        return [
+            expand_filepath(units.loc[x, [read_pair]].dropna()[0])
+            for x in unit_set.split(",")
+        ]
 
 
 def get_odp(wildcards, samples, optical_dup="odp"):
@@ -65,7 +68,7 @@ def conservative_cpu_count(reserve_cores=1, max_cores=5):
     return max(cores - reserve_cores, 1)
 
 
-def threads_calculator(read_type="pe",max_cores=99):
+def threads_calculator(read_type="pe", max_cores=99):
     if read_type == "se":
         if conservative_cpu_count(reserve_cores=1, max_cores=max_cores) > 2:
             return conservative_cpu_count(reserve_cores=1, max_cores=max_cores) / 2
@@ -75,4 +78,4 @@
         if conservative_cpu_count(reserve_cores=1, max_cores=max_cores) > 4:
             return conservative_cpu_count(reserve_cores=1, max_cores=max_cores) / 4
         else:
-            return 1
\ No newline at end of file
+            return 1
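A note on the helpers touched above: `threads_calculator` divides the result of `conservative_cpu_count` and can therefore hand Snakemake a float, which `threads:` may reject. A minimal integer-safe sketch, assuming `conservative_cpu_count` derives its core count from `multiprocessing.cpu_count()` (its body lies outside this diff's context window):

```python
import multiprocessing


def conservative_cpu_count(reserve_cores=1, max_cores=5):
    # Assumed body: cap the detected CPUs at max_cores, keep some in reserve.
    cores = min(multiprocessing.cpu_count(), max_cores)
    return max(cores - reserve_cores, 1)


def threads_calculator(read_type="pe", max_cores=99):
    # Floor division keeps the thread count an int, as Snakemake expects;
    # single-end jobs get half the cores, paired-end a quarter (as above).
    cores = conservative_cpu_count(reserve_cores=1, max_cores=max_cores)
    divisor = 2 if read_type == "se" else 4
    return max(cores // divisor, 1)
```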
config.get("paths").get("results_dir"),"kallisto/index/transcriptome.kidx.transcripts") + config.get("paths").get("results_dir"), + "kallisto/index/transcriptome.kidx.transcripts", + ), conda: resolve_single_filepath( config.get("paths").get("workdir"), "workflow/envs/kallisto.yaml" ) - threads: conservative_cpu_count(reserve_cores=1,max_cores=99) + threads: conservative_cpu_count(reserve_cores=1, max_cores=99) resources: - tmpdir=config.get("paths").get("tmp_dir") - message: "Build Transcriptome Index for Kallisto with {threads} threads on the following fasta file: {input}." + tmpdir=config.get("paths").get("tmp_dir"), + message: + "Build Transcriptome Index for Kallisto with {threads} threads on the following fasta file: {input}." log: resolve_results_filepath( - config.get("paths").get("results_dir"),"logs/kallisto/index_kallisto.log") + config.get("paths").get("results_dir"), "logs/kallisto/index_kallisto.log" + ), shell: "kallisto index " "-i {output} " @@ -25,31 +32,44 @@ rule kallisto_index: rule kallisto: input: r1=resolve_results_filepath( - config.get("paths").get("results_dir"),"reads/trimmed/{sample}-R1-trimmed.fq.gz"), + config.get("paths").get("results_dir"), + "reads/trimmed/{sample}-R1-trimmed.fq.gz", + ), r2=resolve_results_filepath( - config.get("paths").get("results_dir"),"reads/trimmed/{sample}-R2-trimmed.fq.gz"), - index=rules.kallisto_index.output + config.get("paths").get("results_dir"), + "reads/trimmed/{sample}-R2-trimmed.fq.gz", + ), + index=rules.kallisto_index.output, output: resolve_results_filepath( - config.get("paths").get("results_dir"),"kallisto/{sample}/abundance.h5"), + config.get("paths").get("results_dir"), "kallisto/{sample}/abundance.h5" + ), resolve_results_filepath( - config.get("paths").get("results_dir"),"kallisto/{sample}/abundance.tsv"), + config.get("paths").get("results_dir"), "kallisto/{sample}/abundance.tsv" + ), resolve_results_filepath( - config.get("paths").get("results_dir"),"kallisto/{sample}/run_info.json") + config.get("paths").get("results_dir"), "kallisto/{sample}/run_info.json" + ), params: outdir=lambda w, output: os.path.split(output[0])[0], - params=config.get("params").get("kallisto").get("arguments") + params=config.get("params").get("kallisto").get("arguments"), conda: resolve_single_filepath( config.get("paths").get("workdir"), "workflow/envs/kallisto.yaml" ) - threads: conservative_cpu_count(reserve_cores=1, max_cores=int(config.get("resources").get("max_cores"))) + threads: + conservative_cpu_count( + reserve_cores=1, max_cores=int(config.get("resources").get("max_cores")) + ) resources: - tmpdir=config.get("paths").get("tmp_dir") - message: "Kallisto Transcript Quantification with {threads} threads on the following fastq files: {input.r1}/{input.r2}." + tmpdir=config.get("paths").get("tmp_dir"), + message: + "Kallisto Transcript Quantification with {threads} threads on the following fastq files: {input.r1}/{input.r2}." 
diff --git a/workflow/rules/mapping.smk b/workflow/rules/mapping.smk
index 6f5c8ce..1414393 100644
--- a/workflow/rules/mapping.smk
+++ b/workflow/rules/mapping.smk
@@ -1,24 +1,36 @@
 rule star_index:
     input:
-        resolve_single_filepath(config.get("resources").get("reference_path"), config.get("resources").get("reference_fa"))
+        resolve_single_filepath(
+            config.get("resources").get("reference_path"),
+            config.get("resources").get("reference_fa"),
+        ),
     output:
-        length=ensure(resolve_results_filepath(
-            config.get("paths").get("results_dir"),"star/index/chrLength.txt"), non_empty=True)
+        length=ensure(
+            resolve_results_filepath(
+                config.get("paths").get("results_dir"), "star/index/chrLength.txt"
+            ),
+            non_empty=True,
+        ),
     params:
-        gtf=resolve_single_filepath(config.get("resources").get("reference_path"), config.get("resources").get("gtf")),
+        gtf=resolve_single_filepath(
+            config.get("resources").get("reference_path"),
+            config.get("resources").get("gtf"),
+        ),
         genomeDir=lambda w, output: os.path.split(output[0])[0],
-        overhang=config.get("params").get("star").get("overhang")
+        overhang=config.get("params").get("star").get("overhang"),
     log:
         resolve_results_filepath(
-            config.get("paths").get("results_dir"),"logs/star/star_index.log")
+            config.get("paths").get("results_dir"), "logs/star/star_index.log"
+        ),
     conda:
         resolve_single_filepath(
             config.get("paths").get("workdir"), "workflow/envs/star.yaml"
         )
     threads: conservative_cpu_count(reserve_cores=2, max_cores=99)
     resources:
-        tmpdir=config.get("paths").get("tmp_dir")
-    message: "Indexing Genome with {threads} threads on the following fasta file: {input}."
+        tmpdir=config.get("paths").get("tmp_dir"),
+    message:
+        "Indexing Genome with {threads} threads on the following fasta file: {input}."
     shell:
         "STAR "
         "--runMode genomeGenerate "
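`ensure(..., non_empty=True)` on the index output is worth flagging: Snakemake fails the job if `chrLength.txt` is created but empty, catching truncated STAR indexes that a plain existence check would pass. Conceptually the post-job validation behaves like this sketch (not Snakemake's actual internals):

```python
import os


def assert_non_empty(path):
    # Roughly what ensure(..., non_empty=True) adds after the job finishes:
    # the declared output must exist *and* contain at least one byte.
    if os.path.getsize(path) == 0:
        raise RuntimeError(f"output {path} is empty")
```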
shell: "STAR " "--runMode genomeGenerate " @@ -33,33 +45,50 @@ rule star_index: rule star_2pass_mapping: input: - read1 = resolve_results_filepath( - config.get("paths").get("results_dir"),"reads/trimmed/{sample}-R1-trimmed.fq.gz"), - read2 = resolve_results_filepath( - config.get("paths").get("results_dir"),"reads/trimmed/{sample}-R2-trimmed.fq.gz"), - index = rules.star_index.output.length + read1=resolve_results_filepath( + config.get("paths").get("results_dir"), + "reads/trimmed/{sample}-R1-trimmed.fq.gz", + ), + read2=resolve_results_filepath( + config.get("paths").get("results_dir"), + "reads/trimmed/{sample}-R2-trimmed.fq.gz", + ), + index=rules.star_index.output.length, output: - bam = temp(resolve_results_filepath( - config.get("paths").get("results_dir"),"star/{sample}/{sample}.Aligned.sortedByCoord.out.bam")), - log = resolve_results_filepath( - config.get("paths").get("results_dir"),"star/{sample}/{sample}.Log.final.out") + bam=temp( + resolve_results_filepath( + config.get("paths").get("results_dir"), + "star/{sample}/{sample}.Aligned.sortedByCoord.out.bam", + ) + ), + log=resolve_results_filepath( + config.get("paths").get("results_dir"), + "star/{sample}/{sample}.Log.final.out", + ), params: - genomedir = lambda w, input: os.path.split(input.index)[0], - sample = "{sample}", + genomedir=lambda w, input: os.path.split(input.index)[0], + sample="{sample}", platform=config.get("params").get("star").get("platform"), center=config.get("params").get("star").get("sequencing_center"), out_basename=lambda w, output: output[0].split(".", 1), log: resolve_results_filepath( - config.get("paths").get("results_dir"),"logs/star/{sample}/{sample}_star_map.log") + config.get("paths").get("results_dir"), + "logs/star/{sample}/{sample}_star_map.log", + ), conda: resolve_single_filepath( - config.get("paths").get("workdir"),"workflow/envs/star.yaml") - threads: conservative_cpu_count(reserve_cores=1,max_cores=int(config.get("resources").get("max_cores"))) + config.get("paths").get("workdir"), "workflow/envs/star.yaml" + ) + threads: + conservative_cpu_count( + reserve_cores=1, max_cores=int(config.get("resources").get("max_cores")) + ) resources: tmpdir=config.get("paths").get("tmp_dir"), - mem_mb=30000 - message: "STAR RNA-Seq Aligner 2-pass mapping with {threads} threads and {resources.mem_mb} Mbytes for the following file: {input.read1} and {input.read2}." + mem_mb=30000, + message: + "STAR RNA-Seq Aligner 2-pass mapping with {threads} threads and {resources.mem_mb} Mbytes for the following file: {input.read1} and {input.read2}." 
shell: "STAR " "--runMode alignReads " @@ -72,8 +101,8 @@ rule star_2pass_mapping: "--outStd Log " "--outSAMtype BAM SortedByCoordinate " "--twopassMode Basic " - "--limitBAMsortRAM 30000000000 " # 30 Gb the size required for human genome - "--outSAMmapqUnique 60 " # from GATK BP + "--limitBAMsortRAM 30000000000 " + "--outSAMmapqUnique 60 " "--outTmpDir {resources.tmpdir}/STARtmp " "--outFileNamePrefix {params.out_basename} " "&> {log} " @@ -81,46 +110,65 @@ rule star_2pass_mapping: rule move_bam_files: input: - bam = rules.star_2pass_mapping.output.bam + bam=rules.star_2pass_mapping.output.bam, output: - bam = protected(resolve_results_filepath( - config.get("paths").get("results_dir"),"reads/bam/{sample}.bam")) + bam=protected( + resolve_results_filepath( + config.get("paths").get("results_dir"), "reads/bam/{sample}.bam" + ) + ), params: - sample = "{sample}" + sample="{sample}", log: resolve_results_filepath( - config.get("paths").get("results_dir"),"logs/star/{sample}/{sample}_star_map_move.log") + config.get("paths").get("results_dir"), + "logs/star/{sample}/{sample}_star_map_move.log", + ), conda: resolve_single_filepath( - config.get("paths").get("workdir"),"workflow/envs/star.yaml") - threads: conservative_cpu_count(reserve_cores=1,max_cores=int(config.get("resources").get("max_cores"))) + config.get("paths").get("workdir"), "workflow/envs/star.yaml" + ) + threads: + conservative_cpu_count( + reserve_cores=1, max_cores=int(config.get("resources").get("max_cores")) + ) resources: tmpdir=config.get("paths").get("tmp_dir"), - mem_mb=30000 - message: "Rename and move with {threads} threads and {resources.mem_mb} Mbytes for the following BAM file: {input.bam}." + mem_mb=30000, + message: + "Rename and move with {threads} threads and {resources.mem_mb} Mbytes for the following BAM file: {input.bam}." shell: "mv " "{input.bam} {output.bam} " "&> {log} " + rule index_bam: input: - bam = rules.move_bam_files.output.bam + bam=rules.move_bam_files.output.bam, output: - bai = protected(resolve_results_filepath( - config.get("paths").get("results_dir"),"reads/bam/{sample}.bam.bai")) + bai=protected( + resolve_results_filepath( + config.get("paths").get("results_dir"), "reads/bam/{sample}.bam.bai" + ) + ), conda: resolve_single_filepath( config.get("paths").get("workdir"), "workflow/envs/samtools.yaml" ) - threads: conservative_cpu_count(reserve_cores=1,max_cores=int(config.get("resources").get("max_cores"))) + threads: + conservative_cpu_count( + reserve_cores=1, max_cores=int(config.get("resources").get("max_cores")) + ) resources: - tmpdir=config.get("paths").get("tmp_dir") - message: "Build Index for STAR mapped reads with {threads} threads on the following BAM file: {input.bam}." + tmpdir=config.get("paths").get("tmp_dir"), + message: + "Build Index for STAR mapped reads with {threads} threads on the following BAM file: {input.bam}." 
diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk
index b531ed2..dc1b39a 100644
--- a/workflow/rules/qc.smk
+++ b/workflow/rules/qc.smk
@@ -2,30 +2,39 @@ rule fastqc:
     input:
         rules.merge_read1_fastq.output,
-        rules.merge_read2_fastq.output
+        rules.merge_read2_fastq.output,
     output:
-        html_r1 = resolve_results_filepath(
-            config.get("paths").get("results_dir"),"qc/fastqc/{sample}-R1.html"),
-        zip_r1 = resolve_results_filepath(
-            config.get("paths").get("results_dir"),"qc/fastqc/{sample}-R1_fastqc.zip"),
-        html_r2= resolve_results_filepath(
-            config.get("paths").get("results_dir"),"qc/fastqc/{sample}-R2.html"),
-        zip_r2= resolve_results_filepath(
-            config.get("paths").get("results_dir"),"qc/fastqc/{sample}-R2_fastqc.zip")
+        html_r1=resolve_results_filepath(
+            config.get("paths").get("results_dir"), "qc/fastqc/{sample}-R1.html"
+        ),
+        zip_r1=resolve_results_filepath(
+            config.get("paths").get("results_dir"), "qc/fastqc/{sample}-R1_fastqc.zip"
+        ),
+        html_r2=resolve_results_filepath(
+            config.get("paths").get("results_dir"), "qc/fastqc/{sample}-R2.html"
+        ),
+        zip_r2=resolve_results_filepath(
+            config.get("paths").get("results_dir"), "qc/fastqc/{sample}-R2_fastqc.zip"
+        ),
     params:
-        outdir=lambda w, output: os.path.split(output[0])[0]
+        outdir=lambda w, output: os.path.split(output[0])[0],
     log:
         resolve_results_filepath(
-            config.get("paths").get("results_dir"),"logs/fastqc/{sample}.log")
+            config.get("paths").get("results_dir"), "logs/fastqc/{sample}.log"
+        ),
     conda:
         resolve_single_filepath(
             config.get("paths").get("workdir"), "workflow/envs/fastqc.yaml"
         )
-    threads: conservative_cpu_count(reserve_cores=1, max_cores=int(config.get("resources").get("max_cores")))
+    threads:
+        conservative_cpu_count(
+            reserve_cores=1, max_cores=int(config.get("resources").get("max_cores"))
+        )
     resources:
         tmpdir=config.get("paths").get("tmp_dir"),
-        mem_mb=6000
-    message: "."
+        mem_mb=6000,
+    message:
+        "."
     shell:
         "fastqc "
         "{input} "
shell: "fastqc " "{input} " @@ -37,30 +46,43 @@ rule fastqc: rule fastqc_trimmed: input: rules.rename_trimmed_fastqs.output.read1, - rules.rename_trimmed_fastqs.output.read2 + rules.rename_trimmed_fastqs.output.read2, output: - html_r1 = resolve_results_filepath( - config.get("paths").get("results_dir"),"qc/fastqc_trimmed/{sample}-R1.html"), - zip_r1 = resolve_results_filepath( - config.get("paths").get("results_dir"),"qc/fastqc_trimmed/{sample}-R1_fastqc.zip"), - html_r2= resolve_results_filepath( - config.get("paths").get("results_dir"),"qc/fastqc_trimmed/{sample}-R2.html"), - zip_r2= resolve_results_filepath( - config.get("paths").get("results_dir"),"qc/fastqc_trimmed/{sample}-R2_fastqc.zip") + html_r1=resolve_results_filepath( + config.get("paths").get("results_dir"), + "qc/fastqc_trimmed/{sample}-R1.html", + ), + zip_r1=resolve_results_filepath( + config.get("paths").get("results_dir"), + "qc/fastqc_trimmed/{sample}-R1_fastqc.zip", + ), + html_r2=resolve_results_filepath( + config.get("paths").get("results_dir"), + "qc/fastqc_trimmed/{sample}-R2.html", + ), + zip_r2=resolve_results_filepath( + config.get("paths").get("results_dir"), + "qc/fastqc_trimmed/{sample}-R2_fastqc.zip", + ), params: - outdir=lambda w, output: os.path.split(output[0])[0] + outdir=lambda w, output: os.path.split(output[0])[0], log: resolve_results_filepath( - config.get("paths").get("results_dir"),"logs/fastqc_trimmed/{sample}.log") + config.get("paths").get("results_dir"), "logs/fastqc_trimmed/{sample}.log" + ), conda: resolve_single_filepath( config.get("paths").get("workdir"), "workflow/envs/fastqc.yaml" ) - threads: conservative_cpu_count(reserve_cores=1, max_cores=int(config.get("resources").get("max_cores"))) + threads: + conservative_cpu_count( + reserve_cores=1, max_cores=int(config.get("resources").get("max_cores")) + ) resources: tmpdir=config.get("paths").get("tmp_dir"), - mem_mb=6000 - message: "." + mem_mb=6000, + message: + "." shell: "fastqc " "{input} " @@ -122,7 +144,8 @@ rule multiqc: ), output: resolve_results_filepath( - config.get("paths").get("results_dir"),"qc/multiqc.html") + config.get("paths").get("results_dir"), "qc/multiqc.html" + ), params: params=config.get("params").get("multiqc").get("arguments"), outdir="qc", @@ -134,15 +157,21 @@ rule multiqc: kallisto=lambda w, input: os.path.split(input.kallisto[0])[0], conda: resolve_single_filepath( - config.get("paths").get("workdir"), "workflow/envs/multiqc.yaml") + config.get("paths").get("workdir"), "workflow/envs/multiqc.yaml" + ) log: resolve_results_filepath( - config.get("paths").get("results_dir"),"logs/multiqc/multiqc.log") - threads: conservative_cpu_count(reserve_cores=1,max_cores=int(config.get("resources").get("max_cores"))) + config.get("paths").get("results_dir"), "logs/multiqc/multiqc.log" + ), + threads: + conservative_cpu_count( + reserve_cores=1, max_cores=int(config.get("resources").get("max_cores")) + ) resources: tmpdir=config.get("paths").get("tmp_dir"), - mem_mb=6000 - message: "Generate MultiQC Report with {threads} threads and {resources.mem_mb} Mbytes." + mem_mb=6000, + message: + "Generate MultiQC Report with {threads} threads and {resources.mem_mb} Mbytes." 
shell: "multiqc " "{params.fastqc} " @@ -154,7 +183,3 @@ rule multiqc: "-o {params.outdir} " "-n {params.outname} " ">& {log}" - - - - diff --git a/workflow/rules/rseqc.smk b/workflow/rules/rseqc.smk index cebd88f..7ee7aca 100644 --- a/workflow/rules/rseqc.smk +++ b/workflow/rules/rseqc.smk @@ -1,26 +1,38 @@ rule bam_stat: input: bam=resolve_results_filepath( - config.get("paths").get("results_dir"),"reads/bam/{sample}.bam"), + config.get("paths").get("results_dir"), "reads/bam/{sample}.bam" + ), bai=resolve_results_filepath( - config.get("paths").get("results_dir"),"reads/bam/{sample}.bam.bai") + config.get("paths").get("results_dir"), "reads/bam/{sample}.bam.bai" + ), output: resolve_results_filepath( - config.get("paths").get("results_dir"),"qc/rseqc/{sample}.bam_stat.txt") + config.get("paths").get("results_dir"), "qc/rseqc/{sample}.bam_stat.txt" + ), params: - min_map_qual=config.get("params").get("rseqc").get("bamstat").get("min_map_qual") + min_map_qual=config.get("params") + .get("rseqc") + .get("bamstat") + .get("min_map_qual"), log: resolve_results_filepath( - config.get("paths").get("results_dir"),"logs/rseqc/bamstat/{sample}_bamstat.log") + config.get("paths").get("results_dir"), + "logs/rseqc/bamstat/{sample}_bamstat.log", + ), conda: resolve_single_filepath( config.get("paths").get("workdir"), "workflow/envs/rseqc.yaml" ) - threads: conservative_cpu_count(reserve_cores=1, max_cores=int(config.get("resources").get("max_cores"))) + threads: + conservative_cpu_count( + reserve_cores=1, max_cores=int(config.get("resources").get("max_cores")) + ) resources: tmpdir=config.get("paths").get("tmp_dir"), - mem_mb=3000 - message: "Executing RSeQC BAMSTAT with {threads} threads on the following files {input.bam}." + mem_mb=3000, + message: + "Executing RSeQC BAMSTAT with {threads} threads on the following files {input.bam}." shell: "bam_stat.py " "-i {input.bam} " @@ -31,24 +43,33 @@ rule bam_stat: rule smatools_flagstat: input: bam=resolve_results_filepath( - config.get("paths").get("results_dir"),"reads/bam/{sample}.bam"), + config.get("paths").get("results_dir"), "reads/bam/{sample}.bam" + ), bai=resolve_results_filepath( - config.get("paths").get("results_dir"),"reads/bam/{sample}.bam.bai") + config.get("paths").get("results_dir"), "reads/bam/{sample}.bam.bai" + ), output: resolve_results_filepath( - config.get("paths").get("results_dir"),"qc/samtools/{sample}.flagstat.txt") + config.get("paths").get("results_dir"), "qc/samtools/{sample}.flagstat.txt" + ), log: resolve_results_filepath( - config.get("paths").get("results_dir"),"logs/samtools/flagstat/{sample}.log") + config.get("paths").get("results_dir"), + "logs/samtools/flagstat/{sample}.log", + ), conda: resolve_single_filepath( config.get("paths").get("workdir"), "workflow/envs/samtools.yaml" ) - threads: conservative_cpu_count(reserve_cores=1, max_cores=int(config.get("resources").get("max_cores"))) + threads: + conservative_cpu_count( + reserve_cores=1, max_cores=int(config.get("resources").get("max_cores")) + ) resources: tmpdir=config.get("paths").get("tmp_dir"), - mem_mb=3000 - message: "Executing SAMTOOLS FLAGSTAT with {threads} threads on the following files {input.bam}." + mem_mb=3000, + message: + "Executing SAMTOOLS FLAGSTAT with {threads} threads on the following files {input.bam}." 
shell: "samtools flagstat " "-i {input.bam} " @@ -58,27 +79,40 @@ rule smatools_flagstat: rule rseqc_read_distribution: input: bam=resolve_results_filepath( - config.get("paths").get("results_dir"),"reads/bam/{sample}.bam"), + config.get("paths").get("results_dir"), "reads/bam/{sample}.bam" + ), bai=resolve_results_filepath( - config.get("paths").get("results_dir"),"reads/bam/{sample}.bam.bai") + config.get("paths").get("results_dir"), "reads/bam/{sample}.bam.bai" + ), output: resolve_results_filepath( - config.get("paths").get("results_dir"),"qc/rseqc/{sample}.read_distribution.txt") + config.get("paths").get("results_dir"), + "qc/rseqc/{sample}.read_distribution.txt", + ), params: -# ref=resolve_single_filepath(*references_abs_path(ref="rseqc_reference"), config.get("refseq")), - ref=resolve_single_filepath(config.get("resources").get("rseqc_data"), config.get("resources").get("refseq_bed")), + # ref=resolve_single_filepath(*references_abs_path(ref="rseqc_reference"), config.get("refseq")), + ref=resolve_single_filepath( + config.get("resources").get("rseqc_data"), + config.get("resources").get("refseq_bed"), + ), log: resolve_results_filepath( - config.get("paths").get("results_dir"),"logs/rseqc/read_distribution/{sample}_readdistribution.log") + config.get("paths").get("results_dir"), + "logs/rseqc/read_distribution/{sample}_readdistribution.log", + ), conda: resolve_single_filepath( config.get("paths").get("workdir"), "workflow/envs/rseqc.yaml" ) - threads: conservative_cpu_count(reserve_cores=1, max_cores=int(config.get("resources").get("max_cores"))) + threads: + conservative_cpu_count( + reserve_cores=1, max_cores=int(config.get("resources").get("max_cores")) + ) resources: tmpdir=config.get("paths").get("tmp_dir"), - mem_mb=3000 - message: "Executing RSeQC READ DISTRIBUTION with {threads} threads on the following files {input.bam}." + mem_mb=3000, + message: + "Executing RSeQC READ DISTRIBUTION with {threads} threads on the following files {input.bam}." 
shell: "read_distribution.py " "-r {params.ref} " @@ -89,26 +123,40 @@ rule rseqc_read_distribution: rule genebody_coverage: input: bam=resolve_results_filepath( - config.get("paths").get("results_dir"),"reads/bam/{sample}.bam"), + config.get("paths").get("results_dir"), "reads/bam/{sample}.bam" + ), bai=resolve_results_filepath( - config.get("paths").get("results_dir"),"reads/bam/{sample}.bam.bai") + config.get("paths").get("results_dir"), "reads/bam/{sample}.bam.bai" + ), output: resolve_results_filepath( - config.get("paths").get("results_dir"),"qc/rseqc/{sample}.geneBodyCoverage.txt") + config.get("paths").get("results_dir"), + "qc/rseqc/{sample}.geneBodyCoverage.txt", + ), params: out_basename=lambda w, output: output[0].split(".", 1), - ref=resolve_single_filepath(config.get("resources").get("rseqc_data"), config.get("resources").get("housekeeping_genes")) + ref=resolve_single_filepath( + config.get("resources").get("rseqc_data"), + config.get("resources").get("housekeeping_genes"), + ), log: resolve_results_filepath( - config.get("paths").get("results_dir"),"logs/rseqc/genebody_coverage/{sample}_genebodycoverage.log") + config.get("paths").get("results_dir"), + "logs/rseqc/genebody_coverage/{sample}_genebodycoverage.log", + ), conda: resolve_single_filepath( - config.get("paths").get("workdir"), "workflow/envs/rseqc.yaml") - threads: conservative_cpu_count(reserve_cores=1, max_cores=int(config.get("resources").get("max_cores"))) + config.get("paths").get("workdir"), "workflow/envs/rseqc.yaml" + ) + threads: + conservative_cpu_count( + reserve_cores=1, max_cores=int(config.get("resources").get("max_cores")) + ) resources: tmpdir=config.get("paths").get("tmp_dir"), - mem_mb=3000 - message: "Executing RSeQC Gene Body Coverage with {threads} threads on the following files {input.bam}." + mem_mb=3000, + message: + "Executing RSeQC Gene Body Coverage with {threads} threads on the following files {input.bam}." 
shell: "geneBody_coverage.py " "-r {params.ref} " @@ -116,29 +164,43 @@ rule genebody_coverage: "-o {params.out_basename} " ">& {log} " + rule rseqc_junction_annotation: input: bam=resolve_results_filepath( - config.get("paths").get("results_dir"),"reads/bam/{sample}.bam"), + config.get("paths").get("results_dir"), "reads/bam/{sample}.bam" + ), bai=resolve_results_filepath( - config.get("paths").get("results_dir"),"reads/bam/{sample}.bam.bai") + config.get("paths").get("results_dir"), "reads/bam/{sample}.bam.bai" + ), output: out=resolve_results_filepath( - config.get("paths").get("results_dir"),"qc/rseqc/{sample}.junction.txt") + config.get("paths").get("results_dir"), "qc/rseqc/{sample}.junction.txt" + ), params: out_basename=lambda w, output: output[0].split(".", 1), - ref=resolve_single_filepath(config.get("resources").get("rseqc_data"), config.get("resources").get("refseq_bed")) + ref=resolve_single_filepath( + config.get("resources").get("rseqc_data"), + config.get("resources").get("refseq_bed"), + ), log: resolve_results_filepath( - config.get("paths").get("results_dir"),"qc/rseqc/{sample}.junctionAnnotation.txt") + config.get("paths").get("results_dir"), + "qc/rseqc/{sample}.junctionAnnotation.txt", + ), conda: resolve_single_filepath( - config.get("paths").get("workdir"), "workflow/envs/rseqc.yaml") - threads: conservative_cpu_count(reserve_cores=1,max_cores=int(config.get("resources").get("max_cores"))) + config.get("paths").get("workdir"), "workflow/envs/rseqc.yaml" + ) + threads: + conservative_cpu_count( + reserve_cores=1, max_cores=int(config.get("resources").get("max_cores")) + ) resources: tmpdir=config.get("paths").get("tmp_dir"), - mem_mb=3000 - message: "." + mem_mb=3000, + message: + "." shell: "junction_annotation.py " "-r {params.ref} " @@ -146,29 +208,44 @@ rule rseqc_junction_annotation: "-o {params.out_basename} " "2> {output.out}" + rule rseqc_junction_saturation: input: bam=resolve_results_filepath( - config.get("paths").get("results_dir"),"reads/bam/{sample}.bam"), + config.get("paths").get("results_dir"), "reads/bam/{sample}.bam" + ), bai=resolve_results_filepath( - config.get("paths").get("results_dir"),"reads/bam/{sample}.bam.bai") + config.get("paths").get("results_dir"), "reads/bam/{sample}.bam.bai" + ), output: plotr=resolve_results_filepath( - config.get("paths").get("results_dir"),"qc/rseqc/{sample}.junctionSaturation_plot.r") + config.get("paths").get("results_dir"), + "qc/rseqc/{sample}.junctionSaturation_plot.r", + ), params: out_basename=lambda w, output: output[0].split(".", 1), - ref=resolve_single_filepath(config.get("resources").get("rseqc_data"), config.get("resources").get("refseq_bed")), + ref=resolve_single_filepath( + config.get("resources").get("rseqc_data"), + config.get("resources").get("refseq_bed"), + ), log: resolve_results_filepath( - config.get("paths").get("results_dir"),"qc/rseqc/{sample}.junctionSaturation.txt") + config.get("paths").get("results_dir"), + "qc/rseqc/{sample}.junctionSaturation.txt", + ), conda: resolve_single_filepath( - config.get("paths").get("workdir"), "workflow/envs/rseqc.yaml") - threads: conservative_cpu_count(reserve_cores=1,max_cores=int(config.get("resources").get("max_cores"))) + config.get("paths").get("workdir"), "workflow/envs/rseqc.yaml" + ) + threads: + conservative_cpu_count( + reserve_cores=1, max_cores=int(config.get("resources").get("max_cores")) + ) resources: tmpdir=config.get("paths").get("tmp_dir"), - mem_mb=3000 - message: "." + mem_mb=3000, + message: + "." 
shell: "junction_saturation.py " "-r {params.ref} " @@ -180,25 +257,34 @@ rule rseqc_junction_saturation: rule rseqc_GC: input: bam=resolve_results_filepath( - config.get("paths").get("results_dir"),"reads/bam/{sample}.bam"), + config.get("paths").get("results_dir"), "reads/bam/{sample}.bam" + ), bai=resolve_results_filepath( - config.get("paths").get("results_dir"),"reads/bam/{sample}.bam.bai") + config.get("paths").get("results_dir"), "reads/bam/{sample}.bam.bai" + ), output: resolve_results_filepath( - config.get("paths").get("results_dir"),"qc/rseqc/{sample}.GC.xls") + config.get("paths").get("results_dir"), "qc/rseqc/{sample}.GC.xls" + ), params: - out_basename=lambda w, output: output[0].split(".", 1) + out_basename=lambda w, output: output[0].split(".", 1), log: resolve_results_filepath( - config.get("paths").get("results_dir"),"qc/rseqc/{sample}.GC.txt") + config.get("paths").get("results_dir"), "qc/rseqc/{sample}.GC.txt" + ), conda: resolve_single_filepath( - config.get("paths").get("workdir"), "workflow/envs/rseqc.yaml") - threads: conservative_cpu_count(reserve_cores=1,max_cores=int(config.get("resources").get("max_cores"))) + config.get("paths").get("workdir"), "workflow/envs/rseqc.yaml" + ) + threads: + conservative_cpu_count( + reserve_cores=1, max_cores=int(config.get("resources").get("max_cores")) + ) resources: tmpdir=config.get("paths").get("tmp_dir"), - mem_mb=3000 - message: "." + mem_mb=3000, + message: + "." shell: "read_GC.py " "-i {input.bam} " @@ -208,25 +294,39 @@ rule rseqc_GC: rule rseqc_infer_experiment: input: bam=resolve_results_filepath( - config.get("paths").get("results_dir"),"reads/bam/{sample}.bam"), + config.get("paths").get("results_dir"), "reads/bam/{sample}.bam" + ), bai=resolve_results_filepath( - config.get("paths").get("results_dir"),"reads/bam/{sample}.bam.bai") + config.get("paths").get("results_dir"), "reads/bam/{sample}.bam.bai" + ), output: resolve_results_filepath( - config.get("paths").get("results_dir"),"qc/rseqc/{sample}.infer_experiment.txt") + config.get("paths").get("results_dir"), + "qc/rseqc/{sample}.infer_experiment.txt", + ), params: - ref=resolve_single_filepath(config.get("resources").get("rseqc_data"), config.get("resources").get("refseq_bed")) + ref=resolve_single_filepath( + config.get("resources").get("rseqc_data"), + config.get("resources").get("refseq_bed"), + ), log: resolve_results_filepath( - config.get("paths").get("results_dir"),"qc/rseqc/{sample}.inferexperiment.txt") + config.get("paths").get("results_dir"), + "qc/rseqc/{sample}.inferexperiment.txt", + ), conda: resolve_single_filepath( - config.get("paths").get("workdir"), "workflow/envs/rseqc.yaml") - threads: conservative_cpu_count(reserve_cores=1,max_cores=int(config.get("resources").get("max_cores"))) + config.get("paths").get("workdir"), "workflow/envs/rseqc.yaml" + ) + threads: + conservative_cpu_count( + reserve_cores=1, max_cores=int(config.get("resources").get("max_cores")) + ) resources: tmpdir=config.get("paths").get("tmp_dir"), - mem_mb=3000 - message: "." + mem_mb=3000, + message: + "." 
shell: "infer_experiment.py " "-r {params.ref} " @@ -237,26 +337,40 @@ rule rseqc_infer_experiment: rule rseqc_inner_distance: input: bam=resolve_results_filepath( - config.get("paths").get("results_dir"),"reads/bam/{sample}.bam"), + config.get("paths").get("results_dir"), "reads/bam/{sample}.bam" + ), bai=resolve_results_filepath( - config.get("paths").get("results_dir"),"reads/bam/{sample}.bam.bai") + config.get("paths").get("results_dir"), "reads/bam/{sample}.bam.bai" + ), output: resolve_results_filepath( - config.get("paths").get("results_dir"),"qc/rseqc/{sample}.inner_distance.txt") + config.get("paths").get("results_dir"), + "qc/rseqc/{sample}.inner_distance.txt", + ), params: - out_basename = lambda w, output: output[0].split(".", 1), - ref=resolve_single_filepath(config.get("resources").get("rseqc_data"), config.get("resources").get("refseq_bed")), + out_basename=lambda w, output: output[0].split(".", 1), + ref=resolve_single_filepath( + config.get("resources").get("rseqc_data"), + config.get("resources").get("refseq_bed"), + ), log: resolve_results_filepath( - config.get("paths").get("results_dir"),"qc/rseqc/{sample}.innerdistance.txt") + config.get("paths").get("results_dir"), + "qc/rseqc/{sample}.innerdistance.txt", + ), conda: resolve_single_filepath( - config.get("paths").get("workdir"), "workflow/envs/rseqc.yaml") - threads: conservative_cpu_count(reserve_cores=1,max_cores=int(config.get("resources").get("max_cores"))) + config.get("paths").get("workdir"), "workflow/envs/rseqc.yaml" + ) + threads: + conservative_cpu_count( + reserve_cores=1, max_cores=int(config.get("resources").get("max_cores")) + ) resources: tmpdir=config.get("paths").get("tmp_dir"), - mem_mb=3000 - message: "." + mem_mb=3000, + message: + "." shell: "inner_distance.py " "-r {params.ref} " @@ -267,25 +381,36 @@ rule rseqc_inner_distance: rule rseqc_read_duplication: input: bam=resolve_results_filepath( - config.get("paths").get("results_dir"),"reads/bam/{sample}.bam"), + config.get("paths").get("results_dir"), "reads/bam/{sample}.bam" + ), bai=resolve_results_filepath( - config.get("paths").get("results_dir"),"reads/bam/{sample}.bam.bai") + config.get("paths").get("results_dir"), "reads/bam/{sample}.bam.bai" + ), output: out1=resolve_results_filepath( - config.get("paths").get("results_dir"),"qc/rseqc/{sample}.dup.pos.DupRate.xls") + config.get("paths").get("results_dir"), + "qc/rseqc/{sample}.dup.pos.DupRate.xls", + ), params: - out_basename=lambda w, output: output[0].split(".", 1) + out_basename=lambda w, output: output[0].split(".", 1), log: resolve_results_filepath( - config.get("paths").get("results_dir"),"qc/rseqc/{sample}.read_duplication.txt") + config.get("paths").get("results_dir"), + "qc/rseqc/{sample}.read_duplication.txt", + ), conda: resolve_single_filepath( - config.get("paths").get("workdir"), "workflow/envs/rseqc.yaml") - threads: conservative_cpu_count(reserve_cores=1,max_cores=int(config.get("resources").get("max_cores"))) + config.get("paths").get("workdir"), "workflow/envs/rseqc.yaml" + ) + threads: + conservative_cpu_count( + reserve_cores=1, max_cores=int(config.get("resources").get("max_cores")) + ) resources: tmpdir=config.get("paths").get("tmp_dir"), - mem_mb=3000 - message: "." + mem_mb=3000, + message: + "." 
shell: "read_duplication.py " "-i {input.bam} " @@ -295,26 +420,39 @@ rule rseqc_read_duplication: rule rseqc_RPKM_saturation: input: bam=resolve_results_filepath( - config.get("paths").get("results_dir"),"reads/bam/{sample}.bam"), + config.get("paths").get("results_dir"), "reads/bam/{sample}.bam" + ), bai=resolve_results_filepath( - config.get("paths").get("results_dir"),"reads/bam/{sample}.bam.bai") + config.get("paths").get("results_dir"), "reads/bam/{sample}.bam.bai" + ), output: out=resolve_results_filepath( - config.get("paths").get("results_dir"),"qc/rseqc/{sample}.saturation.pdf") + config.get("paths").get("results_dir"), "qc/rseqc/{sample}.saturation.pdf" + ), params: out_basename=lambda w, output: output[0].split(".", 1), - ref=resolve_single_filepath(config.get("resources").get("rseqc_data"), config.get("resources").get("refseq_bed")), + ref=resolve_single_filepath( + config.get("resources").get("rseqc_data"), + config.get("resources").get("refseq_bed"), + ), log: resolve_results_filepath( - config.get("paths").get("results_dir"),"logs/rseqc/{sample}.RPKM_saturation.log") + config.get("paths").get("results_dir"), + "logs/rseqc/{sample}.RPKM_saturation.log", + ), conda: resolve_single_filepath( - config.get("paths").get("workdir"), "workflow/envs/rseqc.yaml") - threads: conservative_cpu_count(reserve_cores=1,max_cores=int(config.get("resources").get("max_cores"))) + config.get("paths").get("workdir"), "workflow/envs/rseqc.yaml" + ) + threads: + conservative_cpu_count( + reserve_cores=1, max_cores=int(config.get("resources").get("max_cores")) + ) resources: tmpdir=config.get("paths").get("tmp_dir"), - mem_mb=3000 - message: "." + mem_mb=3000, + message: + "." shell: "RPKM_saturation.py " "-r {params.ref} " @@ -326,25 +464,37 @@ rule rseqc_RPKM_saturation: rule rseqc_tin: input: bam=resolve_results_filepath( - config.get("paths").get("results_dir"),"reads/bam/{sample}.bam"), + config.get("paths").get("results_dir"), "reads/bam/{sample}.bam" + ), bai=resolve_results_filepath( - config.get("paths").get("results_dir"),"reads/bam/{sample}.bam.bai") + config.get("paths").get("results_dir"), "reads/bam/{sample}.bam.bai" + ), output: resolve_results_filepath( - config.get("paths").get("results_dir"),"qc/rseqc/{sample}.tin.summary.txt") + config.get("paths").get("results_dir"), "qc/rseqc/{sample}.tin.summary.txt" + ), params: - ref=resolve_single_filepath(config.get("resources").get("rseqc_data"), config.get("resources").get("refseq_bed")) + ref=resolve_single_filepath( + config.get("resources").get("rseqc_data"), + config.get("resources").get("refseq_bed"), + ), log: resolve_results_filepath( - config.get("paths").get("results_dir"),"qc/rseqc/{sample}.tin.txt") + config.get("paths").get("results_dir"), "qc/rseqc/{sample}.tin.txt" + ), conda: resolve_single_filepath( - config.get("paths").get("workdir"), "workflow/envs/rseqc.yaml") - threads: conservative_cpu_count(reserve_cores=1,max_cores=int(config.get("resources").get("max_cores"))) + config.get("paths").get("workdir"), "workflow/envs/rseqc.yaml" + ) + threads: + conservative_cpu_count( + reserve_cores=1, max_cores=int(config.get("resources").get("max_cores")) + ) resources: tmpdir=config.get("paths").get("tmp_dir"), - mem_mb=3000 - message: "." + mem_mb=3000, + message: + "." 
shell: "tin.py " "-r {params.ref} " @@ -364,20 +514,27 @@ rule check_rseqc: rules.rseqc_junction_annotation.output.out, rules.rseqc_junction_saturation.output.plotr, rules.rseqc_read_distribution.output, - rules.rseqc_read_duplication.output.out1 + rules.rseqc_read_duplication.output.out1, output: resolve_results_filepath( - config.get("paths").get("results_dir"),"qc/rseqc/{sample}.rseqc_complete") + config.get("paths").get("results_dir"), "qc/rseqc/{sample}.rseqc_complete" + ), log: resolve_results_filepath( - config.get("paths").get("results_dir"),"qc/rseqc/{sample}.check.txt") + config.get("paths").get("results_dir"), "qc/rseqc/{sample}.check.txt" + ), conda: resolve_single_filepath( - config.get("paths").get("workdir"), "workflow/envs/rseqc.yaml") - threads: conservative_cpu_count(reserve_cores=1,max_cores=int(config.get("resources").get("max_cores"))) + config.get("paths").get("workdir"), "workflow/envs/rseqc.yaml" + ) + threads: + conservative_cpu_count( + reserve_cores=1, max_cores=int(config.get("resources").get("max_cores")) + ) resources: tmpdir=config.get("paths").get("tmp_dir"), - mem_mb=3000 - message: "." + mem_mb=3000, + message: + "." shell: - "touch {output} " \ No newline at end of file + "touch {output} " diff --git a/workflow/rules/trimming.smk b/workflow/rules/trimming.smk index ff8cf4e..e6584e0 100644 --- a/workflow/rules/trimming.smk +++ b/workflow/rules/trimming.smk @@ -1,19 +1,28 @@ rule merge_read1_fastq: input: - lambda wildcards: get_unit_fastqs(wildcards, samples, read_pair='fq1') + lambda wildcards: get_unit_fastqs(wildcards, samples, read_pair="fq1"), output: - temp(resolve_results_filepath( - config.get("paths").get("results_dir"),"reads/untrimmed/merged/{sample}-R1.fq.gz")) + temp( + resolve_results_filepath( + config.get("paths").get("results_dir"), + "reads/untrimmed/merged/{sample}-R1.fq.gz", + ) + ), log: resolve_results_filepath( - config.get("paths").get("results_dir"),"logs/merge/{sample}_R1.log") + config.get("paths").get("results_dir"), "logs/merge/{sample}_R1.log" + ), conda: resolve_single_filepath( - config.get("paths").get("workdir"),"workflow/envs/python_script.yaml") - threads: conservative_cpu_count(reserve_cores=1,max_cores=int(config.get("resources").get("max_cores"))) + config.get("paths").get("workdir"), "workflow/envs/python_script.yaml" + ) + threads: + conservative_cpu_count( + reserve_cores=1, max_cores=int(config.get("resources").get("max_cores")) + ) resources: tmpdir=config.get("paths").get("tmp_dir"), - mem_mb=3500 + mem_mb=3500, script: resolve_single_filepath( config.get("paths").get("workdir"), "workflow/scripts/merge_fastq.py" @@ -22,20 +31,29 @@ rule merge_read1_fastq: rule merge_read2_fastq: input: - lambda wildcards: get_unit_fastqs(wildcards, samples, read_pair='fq2') + lambda wildcards: get_unit_fastqs(wildcards, samples, read_pair="fq2"), output: - temp(resolve_results_filepath( - config.get("paths").get("results_dir"),"reads/untrimmed/merged/{sample}-R2.fq.gz")) + temp( + resolve_results_filepath( + config.get("paths").get("results_dir"), + "reads/untrimmed/merged/{sample}-R2.fq.gz", + ) + ), log: resolve_results_filepath( - config.get("paths").get("results_dir"),"logs/merge/{sample}_R2.log") + config.get("paths").get("results_dir"), "logs/merge/{sample}_R2.log" + ), conda: resolve_single_filepath( - config.get("paths").get("workdir"),"workflow/envs/python_script.yaml") - threads: conservative_cpu_count(reserve_cores=1,max_cores=int(config.get("resources").get("max_cores"))) + config.get("paths").get("workdir"), 
"workflow/envs/python_script.yaml" + ) + threads: + conservative_cpu_count( + reserve_cores=1, max_cores=int(config.get("resources").get("max_cores")) + ) resources: tmpdir=config.get("paths").get("tmp_dir"), - mem_mb=3500 + mem_mb=3500, script: resolve_single_filepath( config.get("paths").get("workdir"), "workflow/scripts/merge_fastq.py" @@ -45,32 +63,47 @@ rule merge_read2_fastq: rule trimming: input: read1=rules.merge_read1_fastq.output, - read2=rules.merge_read2_fastq.output + read2=rules.merge_read2_fastq.output, output: - read1=temp(resolve_results_filepath( - config.get("paths").get("results_dir"),"reads/trimmed/{sample}-R1_val_1.fq.gz")), + read1=temp( + resolve_results_filepath( + config.get("paths").get("results_dir"), + "reads/trimmed/{sample}-R1_val_1.fq.gz", + ) + ), read1_trimming_report=resolve_results_filepath( - config.get("paths").get("results_dir"),"reads/trimmed/{sample}-R1.fq.gz_trimming_report.txt"), - read2=temp(resolve_results_filepath( - config.get("paths").get("results_dir"),"reads/trimmed/{sample}-R2_val_2.fq.gz")), + config.get("paths").get("results_dir"), + "reads/trimmed/{sample}-R1.fq.gz_trimming_report.txt", + ), + read2=temp( + resolve_results_filepath( + config.get("paths").get("results_dir"), + "reads/trimmed/{sample}-R2_val_2.fq.gz", + ) + ), read2_trimming_report=resolve_results_filepath( - config.get("paths").get("results_dir"),"reads/trimmed/{sample}-R2.fq.gz_trimming_report.txt") + config.get("paths").get("results_dir"), + "reads/trimmed/{sample}-R2.fq.gz_trimming_report.txt", + ), params: extra=config.get("params").get("trim_galore").get("arguments"), - outdir= lambda w, output: os.path.split(output[0])[0], + outdir=lambda w, output: os.path.split(output[0])[0], qc_dir=resolve_results_filepath( - config.get("paths").get("results_dir"),"qc/fastqc") + config.get("paths").get("results_dir"), "qc/fastqc" + ), log: resolve_results_filepath( - config.get("paths").get("results_dir"),"logs/trim_galore/{sample}.log") + config.get("paths").get("results_dir"), "logs/trim_galore/{sample}.log" + ), conda: resolve_single_filepath( config.get("paths").get("workdir"), "workflow/envs/trim_galore.yaml" ) threads: conservative_cpu_count(reserve_cores=0, max_cores=2) resources: - tmpdir=config.get("paths").get("tmp_dir") - message: "Trimming reads with TRIM_GALORE with {threads} threads for the following files {input.read1}{input.read2}." + tmpdir=config.get("paths").get("tmp_dir"), + message: + "Trimming reads with TRIM_GALORE with {threads} threads for the following files {input.read1}{input.read2}." 
shell: "mkdir -p {params.qc_dir}; " "trim_galore " @@ -84,24 +117,37 @@ rule trimming: rule rename_trimmed_fastqs: input: read1=rules.trimming.output.read1, - read2=rules.trimming.output.read2 + read2=rules.trimming.output.read2, output: - read1=protected(resolve_results_filepath( - config.get("paths").get("results_dir"),"reads/trimmed/{sample}-R1-trimmed.fq.gz")), - read2=protected(resolve_results_filepath( - config.get("paths").get("results_dir"),"reads/trimmed/{sample}-R2-trimmed.fq.gz")) + read1=protected( + resolve_results_filepath( + config.get("paths").get("results_dir"), + "reads/trimmed/{sample}-R1-trimmed.fq.gz", + ) + ), + read2=protected( + resolve_results_filepath( + config.get("paths").get("results_dir"), + "reads/trimmed/{sample}-R2-trimmed.fq.gz", + ) + ), log: resolve_results_filepath( - config.get("paths").get("results_dir"),"logs/bash/{sample}.log") + config.get("paths").get("results_dir"), "logs/bash/{sample}.log" + ), conda: resolve_single_filepath( config.get("paths").get("workdir"), "workflow/envs/bash.yaml" ) - threads: conservative_cpu_count(reserve_cores=1, max_cores=int(config.get("resources").get("max_cores"))) + threads: + conservative_cpu_count( + reserve_cores=1, max_cores=int(config.get("resources").get("max_cores")) + ) resources: - tmpdir=config.get("paths").get("tmp_dir") - message: "Rename fastq files {input.read1}{input.read2}." + tmpdir=config.get("paths").get("tmp_dir"), + message: + "Rename fastq files {input.read1}{input.read2}." shell: "mv {input.read1} {output.read1} && " "mv {input.read2} {output.read2} " - ">& {log}" \ No newline at end of file + ">& {log}"