
Commit e4b9ea6
snakefmt formatting for compliance with snakemake standards
massiddamt committed Sep 21, 2023
1 parent 511e73b commit e4b9ea6
Showing 8 changed files with 564 additions and 262 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -35,7 +35,7 @@ You can perform the pipeline deploy defining a directory `my_dest_dir` for analy
 ```bash
 snakedeploy deploy-workflow https://github.com/GeneBANGS/RNASeq.git
 my_desd_dir
---tag v1.0
+--tag v1.1
 ```
 To run the pipeline, go inside the deployed pipeline folder and use the command:
 ```bash
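The formatting in this commit comes from snakefmt, the black-based formatter for Snakemake files. As a rough sketch, the changes could be reproduced and verified locally like this (the target path is taken from this repo's layout; the install method is an assumption, nothing is pinned by this commit):

```bash
# Install the formatter (any recent release should behave similarly)
pip install snakefmt

# Preview the reformatting without modifying any file
snakefmt --diff workflow/

# Apply it in place, then verify compliance (exit code 0 means clean)
snakefmt workflow/
snakefmt --check workflow/
```

Pointing snakefmt at `workflow/` lets it pick up the `Snakefile` and the `*.smk` files under `workflow/rules/` in one pass.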
15 changes: 9 additions & 6 deletions workflow/Snakefile
@@ -16,9 +16,11 @@ include: "rules/mapping.smk"
 include: "rules/rseqc.smk"
 include: "rules/qc.smk"
 
+
 ##### local rules #####
 localrules:
-    all
+    all,
 
+
 ##### target rules #####
 rule all:
@@ -45,9 +47,10 @@ rule all:
             sample=samples.reset_index().itertuples(),
         ),
         qc=resolve_results_filepath(
-            config.get("paths").get("results_dir"), "qc/multiqc.html"),
-
-    threads: conservative_cpu_count(reserve_cores=2,max_cores=99)
+            config.get("paths").get("results_dir"), "qc/multiqc.html"
+        ),
+    threads: conservative_cpu_count(reserve_cores=2, max_cores=99)
     resources:
-        tmpdir=config.get("paths").get("tmp_dir")
-    message: "Concluding the workflow!"
+        tmpdir=config.get("paths").get("tmp_dir"),
+    message:
+        "Concluding the workflow!"
15 changes: 9 additions & 6 deletions workflow/rules/common.smk
@@ -9,18 +9,21 @@ from snakemake.utils import validate
 report: "../report/workflow.rst"
 
 
-#validate(config, schema="../schemas/config.schema.yaml")
+# validate(config, schema="../schemas/config.schema.yaml")
 
 
 samples = pd.read_table(config.get("samples"), index_col="sample")
 units = pd.read_table(config.get("units"), index_col=["unit"], dtype=str)
 
 
 ## pipeline-related functions
-def get_unit_fastqs(wildcards, samples, label='units',read_pair='fq'):
-    for unit_set in samples.loc[wildcards.sample,[label]]:
+def get_unit_fastqs(wildcards, samples, label="units", read_pair="fq"):
+    for unit_set in samples.loc[wildcards.sample, [label]]:
         wildcards.sample
-        return [expand_filepath(units.loc[x,[read_pair]].dropna()[0]) for x in unit_set.split(',')]
+        return [
+            expand_filepath(units.loc[x, [read_pair]].dropna()[0])
+            for x in unit_set.split(",")
+        ]
 
 
 def get_odp(wildcards, samples, optical_dup="odp"):
def get_odp(wildcards, samples, optical_dup="odp"):
Expand Down Expand Up @@ -65,7 +68,7 @@ def conservative_cpu_count(reserve_cores=1, max_cores=5):
return max(cores - reserve_cores, 1)


def threads_calculator(read_type="pe",max_cores=99):
def threads_calculator(read_type="pe", max_cores=99):
if read_type == "se":
if conservative_cpu_count(reserve_cores=1, max_cores=max_cores) > 2:
return conservative_cpu_count(reserve_cores=1, max_cores=max_cores) / 2
@@ -75,4 +78,4 @@ def threads_calculator(read_type="pe", max_cores=99):
         if conservative_cpu_count(reserve_cores=1, max_cores=max_cores) > 4:
             return conservative_cpu_count(reserve_cores=1, max_cores=max_cores) / 4
         else:
-            return 1
\ No newline at end of file
+            return 1
56 changes: 38 additions & 18 deletions workflow/rules/kallisto.smk
@@ -1,20 +1,27 @@
 rule kallisto_index:
     input:
-        resolve_single_filepath(config.get("resources").get("reference_path"), config.get("resources").get("transcriptome_fasta"))
+        resolve_single_filepath(
+            config.get("resources").get("reference_path"),
+            config.get("resources").get("transcriptome_fasta"),
+        ),
     output:
         resolve_results_filepath(
-            config.get("paths").get("results_dir"),"kallisto/index/transcriptome.kidx.transcripts")
+            config.get("paths").get("results_dir"),
+            "kallisto/index/transcriptome.kidx.transcripts",
+        ),
     conda:
         resolve_single_filepath(
             config.get("paths").get("workdir"), "workflow/envs/kallisto.yaml"
         )
-    threads: conservative_cpu_count(reserve_cores=1,max_cores=99)
+    threads: conservative_cpu_count(reserve_cores=1, max_cores=99)
     resources:
-        tmpdir=config.get("paths").get("tmp_dir")
-    message: "Build Transcriptome Index for Kallisto with {threads} threads on the following fasta file: {input}."
+        tmpdir=config.get("paths").get("tmp_dir"),
+    message:
+        "Build Transcriptome Index for Kallisto with {threads} threads on the following fasta file: {input}."
     log:
         resolve_results_filepath(
-            config.get("paths").get("results_dir"),"logs/kallisto/index_kallisto.log")
+            config.get("paths").get("results_dir"), "logs/kallisto/index_kallisto.log"
+        ),
     shell:
         "kallisto index "
         "-i {output} "
@@ -25,36 +32,49 @@ rule kallisto_index:
 rule kallisto:
     input:
         r1=resolve_results_filepath(
-            config.get("paths").get("results_dir"),"reads/trimmed/{sample}-R1-trimmed.fq.gz"),
+            config.get("paths").get("results_dir"),
+            "reads/trimmed/{sample}-R1-trimmed.fq.gz",
+        ),
         r2=resolve_results_filepath(
-            config.get("paths").get("results_dir"),"reads/trimmed/{sample}-R2-trimmed.fq.gz"),
-        index=rules.kallisto_index.output
+            config.get("paths").get("results_dir"),
+            "reads/trimmed/{sample}-R2-trimmed.fq.gz",
+        ),
+        index=rules.kallisto_index.output,
     output:
         resolve_results_filepath(
-            config.get("paths").get("results_dir"),"kallisto/{sample}/abundance.h5"),
+            config.get("paths").get("results_dir"), "kallisto/{sample}/abundance.h5"
+        ),
         resolve_results_filepath(
-            config.get("paths").get("results_dir"),"kallisto/{sample}/abundance.tsv"),
+            config.get("paths").get("results_dir"), "kallisto/{sample}/abundance.tsv"
+        ),
         resolve_results_filepath(
-            config.get("paths").get("results_dir"),"kallisto/{sample}/run_info.json")
+            config.get("paths").get("results_dir"), "kallisto/{sample}/run_info.json"
+        ),
     params:
         outdir=lambda w, output: os.path.split(output[0])[0],
-        params=config.get("params").get("kallisto").get("arguments")
+        params=config.get("params").get("kallisto").get("arguments"),
     conda:
         resolve_single_filepath(
             config.get("paths").get("workdir"), "workflow/envs/kallisto.yaml"
         )
-    threads: conservative_cpu_count(reserve_cores=1, max_cores=int(config.get("resources").get("max_cores")))
+    threads:
+        conservative_cpu_count(
+            reserve_cores=1, max_cores=int(config.get("resources").get("max_cores"))
+        )
     resources:
-        tmpdir=config.get("paths").get("tmp_dir")
-    message: "Kallisto Transcript Quantification with {threads} threads on the following fastq files: {input.r1}/{input.r2}."
+        tmpdir=config.get("paths").get("tmp_dir"),
+    message:
+        "Kallisto Transcript Quantification with {threads} threads on the following fastq files: {input.r1}/{input.r2}."
     log:
         resolve_results_filepath(
-            config.get("paths").get("results_dir"),"logs/kallisto/{sample}.kallisto.log")
+            config.get("paths").get("results_dir"),
+            "logs/kallisto/{sample}.kallisto.log",
+        ),
     shell:
         "kallisto quant "
         "-i {input.index} "
         "-o {params.outdir} "
         "{params.params} "
         "-t {threads} "
         "{input.r1} {input.r2} "
-        ">& {log}"
\ No newline at end of file
+        ">& {log}"
134 changes: 91 additions & 43 deletions workflow/rules/mapping.smk
@@ -1,24 +1,36 @@
 rule star_index:
     input:
-        resolve_single_filepath(config.get("resources").get("reference_path"), config.get("resources").get("reference_fa"))
+        resolve_single_filepath(
+            config.get("resources").get("reference_path"),
+            config.get("resources").get("reference_fa"),
+        ),
     output:
-        length=ensure(resolve_results_filepath(
-            config.get("paths").get("results_dir"),"star/index/chrLength.txt"), non_empty=True)
+        length=ensure(
+            resolve_results_filepath(
+                config.get("paths").get("results_dir"), "star/index/chrLength.txt"
+            ),
+            non_empty=True,
+        ),
     params:
-        gtf=resolve_single_filepath(config.get("resources").get("reference_path"), config.get("resources").get("gtf")),
+        gtf=resolve_single_filepath(
+            config.get("resources").get("reference_path"),
+            config.get("resources").get("gtf"),
+        ),
         genomeDir=lambda w, output: os.path.split(output[0])[0],
-        overhang=config.get("params").get("star").get("overhang")
+        overhang=config.get("params").get("star").get("overhang"),
     log:
         resolve_results_filepath(
-            config.get("paths").get("results_dir"),"logs/star/star_index.log")
+            config.get("paths").get("results_dir"), "logs/star/star_index.log"
+        ),
     conda:
         resolve_single_filepath(
             config.get("paths").get("workdir"), "workflow/envs/star.yaml"
         )
     threads: conservative_cpu_count(reserve_cores=2, max_cores=99)
     resources:
-        tmpdir=config.get("paths").get("tmp_dir")
-    message: "Indexing Genome with {threads} threads on the following fasta file: {input}."
+        tmpdir=config.get("paths").get("tmp_dir"),
+    message:
+        "Indexing Genome with {threads} threads on the following fasta file: {input}."
     shell:
         "STAR "
         "--runMode genomeGenerate "
@@ -33,33 +45,50 @@ rule star_index:
 
 rule star_2pass_mapping:
     input:
-        read1 = resolve_results_filepath(
-            config.get("paths").get("results_dir"),"reads/trimmed/{sample}-R1-trimmed.fq.gz"),
-        read2 = resolve_results_filepath(
-            config.get("paths").get("results_dir"),"reads/trimmed/{sample}-R2-trimmed.fq.gz"),
-        index = rules.star_index.output.length
+        read1=resolve_results_filepath(
+            config.get("paths").get("results_dir"),
+            "reads/trimmed/{sample}-R1-trimmed.fq.gz",
+        ),
+        read2=resolve_results_filepath(
+            config.get("paths").get("results_dir"),
+            "reads/trimmed/{sample}-R2-trimmed.fq.gz",
+        ),
+        index=rules.star_index.output.length,
     output:
-        bam = temp(resolve_results_filepath(
-            config.get("paths").get("results_dir"),"star/{sample}/{sample}.Aligned.sortedByCoord.out.bam")),
-        log = resolve_results_filepath(
-            config.get("paths").get("results_dir"),"star/{sample}/{sample}.Log.final.out")
+        bam=temp(
+            resolve_results_filepath(
+                config.get("paths").get("results_dir"),
+                "star/{sample}/{sample}.Aligned.sortedByCoord.out.bam",
+            )
+        ),
+        log=resolve_results_filepath(
+            config.get("paths").get("results_dir"),
+            "star/{sample}/{sample}.Log.final.out",
+        ),
     params:
-        genomedir = lambda w, input: os.path.split(input.index)[0],
-        sample = "{sample}",
+        genomedir=lambda w, input: os.path.split(input.index)[0],
+        sample="{sample}",
         platform=config.get("params").get("star").get("platform"),
         center=config.get("params").get("star").get("sequencing_center"),
         out_basename=lambda w, output: output[0].split(".", 1),
     log:
         resolve_results_filepath(
-            config.get("paths").get("results_dir"),"logs/star/{sample}/{sample}_star_map.log")
+            config.get("paths").get("results_dir"),
+            "logs/star/{sample}/{sample}_star_map.log",
+        ),
     conda:
         resolve_single_filepath(
-            config.get("paths").get("workdir"),"workflow/envs/star.yaml")
-    threads: conservative_cpu_count(reserve_cores=1,max_cores=int(config.get("resources").get("max_cores")))
+            config.get("paths").get("workdir"), "workflow/envs/star.yaml"
+        )
+    threads:
+        conservative_cpu_count(
+            reserve_cores=1, max_cores=int(config.get("resources").get("max_cores"))
+        )
     resources:
         tmpdir=config.get("paths").get("tmp_dir"),
-        mem_mb=30000
-    message: "STAR RNA-Seq Aligner 2-pass mapping with {threads} threads and {resources.mem_mb} Mbytes for the following file: {input.read1} and {input.read2}."
+        mem_mb=30000,
+    message:
+        "STAR RNA-Seq Aligner 2-pass mapping with {threads} threads and {resources.mem_mb} Mbytes for the following file: {input.read1} and {input.read2}."
     shell:
         "STAR "
         "--runMode alignReads "
@@ -72,55 +101,74 @@ rule star_2pass_mapping:
         "--outStd Log "
         "--outSAMtype BAM SortedByCoordinate "
         "--twopassMode Basic "
-        "--limitBAMsortRAM 30000000000 " # 30 Gb the size required for human genome
-        "--outSAMmapqUnique 60 " # from GATK BP
+        "--limitBAMsortRAM 30000000000 "
+        "--outSAMmapqUnique 60 "
         "--outTmpDir {resources.tmpdir}/STARtmp "
         "--outFileNamePrefix {params.out_basename} "
         "&> {log} "
 
 
 rule move_bam_files:
     input:
-        bam = rules.star_2pass_mapping.output.bam
+        bam=rules.star_2pass_mapping.output.bam,
     output:
-        bam = protected(resolve_results_filepath(
-            config.get("paths").get("results_dir"),"reads/bam/{sample}.bam"))
+        bam=protected(
+            resolve_results_filepath(
+                config.get("paths").get("results_dir"), "reads/bam/{sample}.bam"
+            )
+        ),
     params:
-        sample = "{sample}"
+        sample="{sample}",
     log:
         resolve_results_filepath(
-            config.get("paths").get("results_dir"),"logs/star/{sample}/{sample}_star_map_move.log")
+            config.get("paths").get("results_dir"),
+            "logs/star/{sample}/{sample}_star_map_move.log",
+        ),
     conda:
         resolve_single_filepath(
-            config.get("paths").get("workdir"),"workflow/envs/star.yaml")
-    threads: conservative_cpu_count(reserve_cores=1,max_cores=int(config.get("resources").get("max_cores")))
+            config.get("paths").get("workdir"), "workflow/envs/star.yaml"
+        )
+    threads:
+        conservative_cpu_count(
+            reserve_cores=1, max_cores=int(config.get("resources").get("max_cores"))
+        )
     resources:
         tmpdir=config.get("paths").get("tmp_dir"),
-        mem_mb=30000
-    message: "Rename and move with {threads} threads and {resources.mem_mb} Mbytes for the following BAM file: {input.bam}."
+        mem_mb=30000,
+    message:
+        "Rename and move with {threads} threads and {resources.mem_mb} Mbytes for the following BAM file: {input.bam}."
     shell:
         "mv "
         "{input.bam} {output.bam} "
         "&> {log} "
 
 
 rule index_bam:
     input:
-        bam = rules.move_bam_files.output.bam
+        bam=rules.move_bam_files.output.bam,
     output:
-        bai = protected(resolve_results_filepath(
-            config.get("paths").get("results_dir"),"reads/bam/{sample}.bam.bai"))
+        bai=protected(
+            resolve_results_filepath(
+                config.get("paths").get("results_dir"), "reads/bam/{sample}.bam.bai"
+            )
+        ),
     conda:
         resolve_single_filepath(
            config.get("paths").get("workdir"), "workflow/envs/samtools.yaml"
         )
-    threads: conservative_cpu_count(reserve_cores=1,max_cores=int(config.get("resources").get("max_cores")))
+    threads:
+        conservative_cpu_count(
+            reserve_cores=1, max_cores=int(config.get("resources").get("max_cores"))
+        )
     resources:
-        tmpdir=config.get("paths").get("tmp_dir")
-    message: "Build Index for STAR mapped reads with {threads} threads on the following BAM file: {input.bam}."
+        tmpdir=config.get("paths").get("tmp_dir"),
+    message:
+        "Build Index for STAR mapped reads with {threads} threads on the following BAM file: {input.bam}."
     log:
         resolve_results_filepath(
-            config.get("paths").get("results_dir"),"logs/samtools/{sample}.index.log")
+            config.get("paths").get("results_dir"), "logs/samtools/{sample}.index.log"
+        ),
     shell:
         "samtools index "
         "{input.bam} "
-        ">& {log} "
\ No newline at end of file
+        ">& {log} "