
Commit e4b9ea6
snakefmt formatting for compliance with snakemake standards
massiddamt committed Sep 21, 2023
1 parent 511e73b commit e4b9ea6
Showing 8 changed files with 564 additions and 262 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -35,7 +35,7 @@ You can perform the pipeline deploy defining a directory `my_dest_dir` for analy
 ```bash
 snakedeploy deploy-workflow https://github.com/GeneBANGS/RNASeq.git
 my_desd_dir
---tag v1.0
+--tag v1.1
 ```
 To run the pipeline, go inside the deployed pipeline folder and use the command:
 ```bash
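The formatting in this commit comes from snakefmt, the black-based formatter for Snakemake files. As a rough sketch, the changes could be reproduced and verified locally like this (the target path is taken from this repo's layout; the install method is an assumption, nothing is pinned by this commit):

```bash
# Install the formatter (any recent release should behave similarly)
pip install snakefmt

# Preview the reformatting without modifying any file
snakefmt --diff workflow/

# Apply it in place, then verify compliance (exit code 0 means clean)
snakefmt workflow/
snakefmt --check workflow/
```

Pointing snakefmt at `workflow/` lets it pick up the `Snakefile` and the `*.smk` files under `workflow/rules/` in one pass.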
15 changes: 9 additions & 6 deletions workflow/Snakefile
@@ -16,9 +16,11 @@ include: "rules/mapping.smk"
 include: "rules/rseqc.smk"
 include: "rules/qc.smk"
 
+
 ##### local rules #####
 localrules:
-    all
+    all,
 
+
 ##### target rules #####
 rule all:
@@ -45,9 +47,10 @@ rule all:
             sample=samples.reset_index().itertuples(),
         ),
         qc=resolve_results_filepath(
-            config.get("paths").get("results_dir"), "qc/multiqc.html"),
-
-    threads: conservative_cpu_count(reserve_cores=2,max_cores=99)
+            config.get("paths").get("results_dir"), "qc/multiqc.html"
+        ),
+    threads: conservative_cpu_count(reserve_cores=2, max_cores=99)
     resources:
-        tmpdir=config.get("paths").get("tmp_dir")
-    message: "Concluding the workflow!"
+        tmpdir=config.get("paths").get("tmp_dir"),
+    message:
+        "Concluding the workflow!"
15 changes: 9 additions & 6 deletions workflow/rules/common.smk
@@ -9,18 +9,21 @@ from snakemake.utils import validate
 report: "../report/workflow.rst"
 
 
-#validate(config, schema="../schemas/config.schema.yaml")
+# validate(config, schema="../schemas/config.schema.yaml")
 
 
 samples = pd.read_table(config.get("samples"), index_col="sample")
 units = pd.read_table(config.get("units"), index_col=["unit"], dtype=str)
 
 
 ## pipeline-related functions
-def get_unit_fastqs(wildcards, samples, label='units',read_pair='fq'):
-    for unit_set in samples.loc[wildcards.sample,[label]]:
+def get_unit_fastqs(wildcards, samples, label="units", read_pair="fq"):
+    for unit_set in samples.loc[wildcards.sample, [label]]:
         wildcards.sample
-        return [expand_filepath(units.loc[x,[read_pair]].dropna()[0]) for x in unit_set.split(',')]
+        return [
+            expand_filepath(units.loc[x, [read_pair]].dropna()[0])
+            for x in unit_set.split(",")
+        ]
 
 
 def get_odp(wildcards, samples, optical_dup="odp"):
def get_odp(wildcards, samples, optical_dup="odp"):
Expand Down Expand Up @@ -65,7 +68,7 @@ def conservative_cpu_count(reserve_cores=1, max_cores=5):
return max(cores - reserve_cores, 1)


def threads_calculator(read_type="pe",max_cores=99):
def threads_calculator(read_type="pe", max_cores=99):
if read_type == "se":
if conservative_cpu_count(reserve_cores=1, max_cores=max_cores) > 2:
return conservative_cpu_count(reserve_cores=1, max_cores=max_cores) / 2
@@ -75,4 +78,4 @@ def threads_calculator(read_type="pe", max_cores=99):
         if conservative_cpu_count(reserve_cores=1, max_cores=max_cores) > 4:
             return conservative_cpu_count(reserve_cores=1, max_cores=max_cores) / 4
         else:
-            return 1
\ No newline at end of file
+            return 1
56 changes: 38 additions & 18 deletions workflow/rules/kallisto.smk
@@ -1,20 +1,27 @@
 rule kallisto_index:
     input:
-        resolve_single_filepath(config.get("resources").get("reference_path"), config.get("resources").get("transcriptome_fasta"))
+        resolve_single_filepath(
+            config.get("resources").get("reference_path"),
+            config.get("resources").get("transcriptome_fasta"),
+        ),
     output:
         resolve_results_filepath(
-            config.get("paths").get("results_dir"),"kallisto/index/transcriptome.kidx.transcripts")
+            config.get("paths").get("results_dir"),
+            "kallisto/index/transcriptome.kidx.transcripts",
+        ),
     conda:
         resolve_single_filepath(
             config.get("paths").get("workdir"), "workflow/envs/kallisto.yaml"
         )
-    threads: conservative_cpu_count(reserve_cores=1,max_cores=99)
+    threads: conservative_cpu_count(reserve_cores=1, max_cores=99)
     resources:
-        tmpdir=config.get("paths").get("tmp_dir")
-    message: "Build Transcriptome Index for Kallisto with {threads} threads on the following fasta file: {input}."
+        tmpdir=config.get("paths").get("tmp_dir"),
+    message:
+        "Build Transcriptome Index for Kallisto with {threads} threads on the following fasta file: {input}."
     log:
         resolve_results_filepath(
-            config.get("paths").get("results_dir"),"logs/kallisto/index_kallisto.log")
+            config.get("paths").get("results_dir"), "logs/kallisto/index_kallisto.log"
+        ),
     shell:
         "kallisto index "
         "-i {output} "
@@ -25,36 +32,49 @@ rule kallisto_index:
 rule kallisto:
     input:
         r1=resolve_results_filepath(
-            config.get("paths").get("results_dir"),"reads/trimmed/{sample}-R1-trimmed.fq.gz"),
+            config.get("paths").get("results_dir"),
+            "reads/trimmed/{sample}-R1-trimmed.fq.gz",
+        ),
         r2=resolve_results_filepath(
-            config.get("paths").get("results_dir"),"reads/trimmed/{sample}-R2-trimmed.fq.gz"),
-        index=rules.kallisto_index.output
+            config.get("paths").get("results_dir"),
+            "reads/trimmed/{sample}-R2-trimmed.fq.gz",
+        ),
+        index=rules.kallisto_index.output,
     output:
         resolve_results_filepath(
-            config.get("paths").get("results_dir"),"kallisto/{sample}/abundance.h5"),
+            config.get("paths").get("results_dir"), "kallisto/{sample}/abundance.h5"
+        ),
         resolve_results_filepath(
-            config.get("paths").get("results_dir"),"kallisto/{sample}/abundance.tsv"),
+            config.get("paths").get("results_dir"), "kallisto/{sample}/abundance.tsv"
+        ),
         resolve_results_filepath(
-            config.get("paths").get("results_dir"),"kallisto/{sample}/run_info.json")
+            config.get("paths").get("results_dir"), "kallisto/{sample}/run_info.json"
+        ),
     params:
         outdir=lambda w, output: os.path.split(output[0])[0],
-        params=config.get("params").get("kallisto").get("arguments")
+        params=config.get("params").get("kallisto").get("arguments"),
     conda:
         resolve_single_filepath(
             config.get("paths").get("workdir"), "workflow/envs/kallisto.yaml"
         )
-    threads: conservative_cpu_count(reserve_cores=1, max_cores=int(config.get("resources").get("max_cores")))
+    threads:
+        conservative_cpu_count(
+            reserve_cores=1, max_cores=int(config.get("resources").get("max_cores"))
+        )
     resources:
-        tmpdir=config.get("paths").get("tmp_dir")
-    message: "Kallisto Transcript Quantification with {threads} threads on the following fastq files: {input.r1}/{input.r2}."
+        tmpdir=config.get("paths").get("tmp_dir"),
+    message:
+        "Kallisto Transcript Quantification with {threads} threads on the following fastq files: {input.r1}/{input.r2}."
     log:
         resolve_results_filepath(
-            config.get("paths").get("results_dir"),"logs/kallisto/{sample}.kallisto.log")
+            config.get("paths").get("results_dir"),
+            "logs/kallisto/{sample}.kallisto.log",
+        ),
     shell:
         "kallisto quant "
         "-i {input.index} "
         "-o {params.outdir} "
         "{params.params} "
         "-t {threads} "
         "{input.r1} {input.r2} "
-        ">& {log}"
\ No newline at end of file
+        ">& {log}"
134 changes: 91 additions & 43 deletions workflow/rules/mapping.smk
@@ -1,24 +1,36 @@
 rule star_index:
     input:
-        resolve_single_filepath(config.get("resources").get("reference_path"), config.get("resources").get("reference_fa"))
+        resolve_single_filepath(
+            config.get("resources").get("reference_path"),
+            config.get("resources").get("reference_fa"),
+        ),
     output:
-        length=ensure(resolve_results_filepath(
-            config.get("paths").get("results_dir"),"star/index/chrLength.txt"), non_empty=True)
+        length=ensure(
+            resolve_results_filepath(
+                config.get("paths").get("results_dir"), "star/index/chrLength.txt"
+            ),
+            non_empty=True,
+        ),
     params:
-        gtf=resolve_single_filepath(config.get("resources").get("reference_path"), config.get("resources").get("gtf")),
+        gtf=resolve_single_filepath(
+            config.get("resources").get("reference_path"),
+            config.get("resources").get("gtf"),
+        ),
         genomeDir=lambda w, output: os.path.split(output[0])[0],
-        overhang=config.get("params").get("star").get("overhang")
+        overhang=config.get("params").get("star").get("overhang"),
     log:
         resolve_results_filepath(
-            config.get("paths").get("results_dir"),"logs/star/star_index.log")
+            config.get("paths").get("results_dir"), "logs/star/star_index.log"
+        ),
     conda:
         resolve_single_filepath(
             config.get("paths").get("workdir"), "workflow/envs/star.yaml"
         )
     threads: conservative_cpu_count(reserve_cores=2, max_cores=99)
     resources:
-        tmpdir=config.get("paths").get("tmp_dir")
-    message: "Indexing Genome with {threads} threads on the following fasta file: {input}."
+        tmpdir=config.get("paths").get("tmp_dir"),
+    message:
+        "Indexing Genome with {threads} threads on the following fasta file: {input}."
     shell:
         "STAR "
         "--runMode genomeGenerate "
@@ -33,33 +45,50 @@ rule star_index:
 
 rule star_2pass_mapping:
     input:
-        read1 = resolve_results_filepath(
-            config.get("paths").get("results_dir"),"reads/trimmed/{sample}-R1-trimmed.fq.gz"),
-        read2 = resolve_results_filepath(
-            config.get("paths").get("results_dir"),"reads/trimmed/{sample}-R2-trimmed.fq.gz"),
-        index = rules.star_index.output.length
+        read1=resolve_results_filepath(
+            config.get("paths").get("results_dir"),
+            "reads/trimmed/{sample}-R1-trimmed.fq.gz",
+        ),
+        read2=resolve_results_filepath(
+            config.get("paths").get("results_dir"),
+            "reads/trimmed/{sample}-R2-trimmed.fq.gz",
+        ),
+        index=rules.star_index.output.length,
     output:
-        bam = temp(resolve_results_filepath(
-            config.get("paths").get("results_dir"),"star/{sample}/{sample}.Aligned.sortedByCoord.out.bam")),
-        log = resolve_results_filepath(
-            config.get("paths").get("results_dir"),"star/{sample}/{sample}.Log.final.out")
+        bam=temp(
+            resolve_results_filepath(
+                config.get("paths").get("results_dir"),
+                "star/{sample}/{sample}.Aligned.sortedByCoord.out.bam",
+            )
+        ),
+        log=resolve_results_filepath(
+            config.get("paths").get("results_dir"),
+            "star/{sample}/{sample}.Log.final.out",
+        ),
     params:
-        genomedir = lambda w, input: os.path.split(input.index)[0],
-        sample = "{sample}",
+        genomedir=lambda w, input: os.path.split(input.index)[0],
+        sample="{sample}",
         platform=config.get("params").get("star").get("platform"),
         center=config.get("params").get("star").get("sequencing_center"),
         out_basename=lambda w, output: output[0].split(".", 1),
     log:
         resolve_results_filepath(
-            config.get("paths").get("results_dir"),"logs/star/{sample}/{sample}_star_map.log")
+            config.get("paths").get("results_dir"),
+            "logs/star/{sample}/{sample}_star_map.log",
+        ),
     conda:
         resolve_single_filepath(
-            config.get("paths").get("workdir"),"workflow/envs/star.yaml")
-    threads: conservative_cpu_count(reserve_cores=1,max_cores=int(config.get("resources").get("max_cores")))
+            config.get("paths").get("workdir"), "workflow/envs/star.yaml"
+        )
+    threads:
+        conservative_cpu_count(
+            reserve_cores=1, max_cores=int(config.get("resources").get("max_cores"))
+        )
     resources:
         tmpdir=config.get("paths").get("tmp_dir"),
-        mem_mb=30000
-    message: "STAR RNA-Seq Aligner 2-pass mapping with {threads} threads and {resources.mem_mb} Mbytes for the following file: {input.read1} and {input.read2}."
+        mem_mb=30000,
+    message:
+        "STAR RNA-Seq Aligner 2-pass mapping with {threads} threads and {resources.mem_mb} Mbytes for the following file: {input.read1} and {input.read2}."
     shell:
         "STAR "
         "--runMode alignReads "
@@ -72,55 +101,74 @@ rule star_2pass_mapping:
         "--outStd Log "
         "--outSAMtype BAM SortedByCoordinate "
         "--twopassMode Basic "
-        "--limitBAMsortRAM 30000000000 " # 30 Gb the size required for human genome
-        "--outSAMmapqUnique 60 " # from GATK BP
+        "--limitBAMsortRAM 30000000000 "
+        "--outSAMmapqUnique 60 "
         "--outTmpDir {resources.tmpdir}/STARtmp "
         "--outFileNamePrefix {params.out_basename} "
         "&> {log} "
 
 
 rule move_bam_files:
     input:
-        bam = rules.star_2pass_mapping.output.bam
+        bam=rules.star_2pass_mapping.output.bam,
     output:
-        bam = protected(resolve_results_filepath(
-            config.get("paths").get("results_dir"),"reads/bam/{sample}.bam"))
+        bam=protected(
+            resolve_results_filepath(
+                config.get("paths").get("results_dir"), "reads/bam/{sample}.bam"
+            )
+        ),
     params:
-        sample = "{sample}"
+        sample="{sample}",
     log:
         resolve_results_filepath(
-            config.get("paths").get("results_dir"),"logs/star/{sample}/{sample}_star_map_move.log")
+            config.get("paths").get("results_dir"),
+            "logs/star/{sample}/{sample}_star_map_move.log",
+        ),
     conda:
         resolve_single_filepath(
-            config.get("paths").get("workdir"),"workflow/envs/star.yaml")
-    threads: conservative_cpu_count(reserve_cores=1,max_cores=int(config.get("resources").get("max_cores")))
+            config.get("paths").get("workdir"), "workflow/envs/star.yaml"
+        )
+    threads:
+        conservative_cpu_count(
+            reserve_cores=1, max_cores=int(config.get("resources").get("max_cores"))
+        )
     resources:
         tmpdir=config.get("paths").get("tmp_dir"),
-        mem_mb=30000
-    message: "Rename and move with {threads} threads and {resources.mem_mb} Mbytes for the following BAM file: {input.bam}."
+        mem_mb=30000,
+    message:
+        "Rename and move with {threads} threads and {resources.mem_mb} Mbytes for the following BAM file: {input.bam}."
     shell:
         "mv "
         "{input.bam} {output.bam} "
         "&> {log} "
 
 
 rule index_bam:
     input:
-        bam = rules.move_bam_files.output.bam
+        bam=rules.move_bam_files.output.bam,
     output:
-        bai = protected(resolve_results_filepath(
-            config.get("paths").get("results_dir"),"reads/bam/{sample}.bam.bai"))
+        bai=protected(
+            resolve_results_filepath(
+                config.get("paths").get("results_dir"), "reads/bam/{sample}.bam.bai"
+            )
+        ),
     conda:
         resolve_single_filepath(
            config.get("paths").get("workdir"), "workflow/envs/samtools.yaml"
         )
-    threads: conservative_cpu_count(reserve_cores=1,max_cores=int(config.get("resources").get("max_cores")))
+    threads:
+        conservative_cpu_count(
+            reserve_cores=1, max_cores=int(config.get("resources").get("max_cores"))
+        )
     resources:
-        tmpdir=config.get("paths").get("tmp_dir")
-    message: "Build Index for STAR mapped reads with {threads} threads on the following BAM file: {input.bam}."
+        tmpdir=config.get("paths").get("tmp_dir"),
+    message:
+        "Build Index for STAR mapped reads with {threads} threads on the following BAM file: {input.bam}."
     log:
         resolve_results_filepath(
-            config.get("paths").get("results_dir"),"logs/samtools/{sample}.index.log")
+            config.get("paths").get("results_dir"), "logs/samtools/{sample}.index.log"
+        ),
     shell:
         "samtools index "
         "{input.bam} "
-        ">& {log} "
\ No newline at end of file
+        ">& {log} "