Skip to content

Commit

Permalink
feat: refactor vg giraffe (#3644)
Browse files Browse the repository at this point in the history
<!-- Ensure that the PR title follows conventional commit style (<type>:
<description>)-->
<!-- Possible types are here:
https://github.com/commitizen/conventional-commit-types/blob/master/index.json
-->

Refactored the wrapper so that it works with haplotype-sampled
pangenome graphs.

### QC
<!-- Make sure that you can tick the boxes below. -->

* [x] I confirm that I have followed the [documentation for contributing
to
`snakemake-wrappers`](https://snakemake-wrappers.readthedocs.io/en/stable/contributing.html).

While the contribution guidelines are more extensive, please
particularly ensure that:
* [x] `test.py` was updated to call any added or updated example rules
in a `Snakefile`
* [x] `input:` and `output:` file paths in the rules can be chosen
arbitrarily
* [x] wherever possible, command line arguments are inferred and set
automatically (e.g. based on file extensions in `input:` or `output:`)
* [x] temporary files are either written to a unique hidden folder in
the working directory, or (better) stored where the Python function
`tempfile.gettempdir()` points to
* [x] the `meta.yaml` contains a link to the documentation of the
respective tool or command under `url:`
* [x] conda environments use a minimal amount of channels and packages,
in recommended ordering


<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- **New Features**
- Updated output file naming and bundling for genome resources, offering
clearer organization of results.
- Enhanced command construction for processing with new input parameters
for haplotypes and kmer counts.

- **Refactor**
- Streamlined processing by adjusting how optional mapping parameters
are handled and enhancing command construction.

- **Chores**
	- Removed an obsolete genomic sequence entry from the reference data.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
  • Loading branch information
FelixMoelder authored Feb 7, 2025
1 parent e36c29e commit 0df1d3b
Show file tree
Hide file tree
Showing 4 changed files with 19 additions and 31 deletions.
26 changes: 10 additions & 16 deletions bio/vg/giraffe/test/Snakefile
Original file line number Diff line number Diff line change
@@ -1,23 +1,17 @@
rule vg_autoindex: # [hide]
input: # [hide]
ref="{genome}.fasta", # [hide]
output: # [hide]
multiext("resources/{genome}", ".dist", ".min", ".giraffe.gbz"), # [hide]
log: # [hide]
"logs/vg_autoindex/{genome}.log", # [hide]
params: # [hide]
extra="", # [hide]
threads: 8 # [hide]
wrapper: # [hide]
"master/bio/vg/autoindex" # [hide]


rule vg_giraffe_map:
input:
reads=["reads/{sample}.1.fastq", "reads/{sample}.2.fastq"],
index=multiext("resources/genome", ".dist", ".min", ".giraffe.gbz")
graph="resources/{sample}.gbz",
#kmers="resources/{sample}.kff", # optional: kmer counts
#hapl="resources/{genome}.hapl", # optional: haplotype index
output:
"mapped/{sample}.bam",
bam="mapped/{sample}.bam",
indexes=multiext(
"resources/{sample}",
".dist",
".shortread.withzip.min",
".shortread.zipcodes",
),
log:
"logs/vg_giraffe/{sample}.log",
params:
Expand Down
2 changes: 0 additions & 2 deletions bio/vg/giraffe/test/genome.fasta

This file was deleted.

Binary file added bio/vg/giraffe/test/resources/a.gbz
Binary file not shown.
22 changes: 9 additions & 13 deletions bio/vg/giraffe/wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,17 +20,6 @@
samtools_opts = get_samtools_opts(snakemake, param_name="sort_extra")
java_opts = get_java_opts(snakemake)


expected_files = {".dist": "-d", ".min": "-m", ".giraffe.gbz": "-Z"}

input_cmd = ""
for ext, param in expected_files.items():
matching_files = [file for file in snakemake.input.index if file.endswith(ext)]
if not matching_files:
raise ValueError(f"Missing required file with extension: {ext}")
input_cmd += f" {param} {matching_files[0]}"


# Check inputs/arguments.
if not isinstance(snakemake.input.reads, str) and len(snakemake.input.reads) not in {
1,
Expand All @@ -44,6 +33,12 @@
else " -f ".join(snakemake.input.reads)
)

input_cmd = ""
if snakemake.input.get("hapl", ""):
input_cmd += f" --haplotype-name {snakemake.input.hapl}"
if snakemake.input.get("kmers", ""):
input_cmd += f" --kff-name {snakemake.input.kmers}"

if sort_order not in {"coordinate", "queryname"}:
raise ValueError("Unexpected value for sort_order ({})".format(sort_order))

Expand All @@ -61,7 +56,7 @@
elif sort == "fgbio":
if sort_order == "queryname":
sort_extra += " -s Queryname"
pipe_cmd = "fgbio SortBam -i /dev/stdin -o {snakemake.output[0]} {sort_extra}"
pipe_cmd = "fgbio SortBam -i /dev/stdin -o {snakemake.output.bam} {sort_extra}"
elif sort == "picard":
# Sort alignments using picard SortSam.
pipe_cmd = "picard SortSam {java_opts} {sort_extra} --INPUT /dev/stdin --TMP_DIR {tmpdir} --SORT_ORDER {sort_order} --OUTPUT {snakemake.output[0]}"
Expand All @@ -71,8 +66,9 @@

with tempfile.TemporaryDirectory() as tmpdir:
shell(
"(vg giraffe"
"(vg giraffe -p "
" -t {snakemake.threads}"
" -Z {snakemake.input.graph}"
" {input_cmd}"
" -f {reads}"
" --output-format BAM"
Expand Down

0 comments on commit 0df1d3b

Please sign in to comment.