Skip to content

Commit

Permalink
feat(call): output 95% CIs and supporting read depth in VCFs
Browse files Browse the repository at this point in the history
  • Loading branch information
davidlougheed committed Jun 17, 2024
1 parent fd71d08 commit 25dea95
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 9 deletions.
14 changes: 8 additions & 6 deletions docs/output_formats.md
Original file line number Diff line number Diff line change
Expand Up @@ -151,12 +151,14 @@ Example report format:

VCF format fields (i.e., for each variant sample entry):

* `AD`: Read depth for each allele (STR + SNV)
* `DP`: Total read depth (STR + SNV)
* `GT`: Genotype (STR + SNV)
* `MC`: Motif copy number for each allele (STR only)
* `PS`: Phase set (STR + SNV)
* `PM`: Peak-calling method (`dist`/`single`/`snv+dist`/`snv`/`hp`; STR only)
* `AD`: Read depth for each allele
* `DP`: Total read depth
* `DPS`: Total read depth; only supporting reads (for calls with incorporated SNVs mainly; STR calls only)
* `GT`: Genotype
* `MC`: Motif copy number for each allele (STR calls only)
* `MCCI`: Motif copy number 95% confidence intervals for each allele (STR calls only)
* `PS`: Phase set
* `PM`: Peak-calling method (`dist`/`single`/`snv+dist`/`snv`/`hp`; STR calls only)

VCF info. fields (i.e., for each STR variant record; not present for SNV records):

Expand Down
11 changes: 8 additions & 3 deletions strkit/call/output/vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,10 @@ def build_vcf_header(sample_id: str, reference_file: str) -> pysam.VariantHeader
# Set up basic VCF formats
vh.formats.add("AD", ".", "Integer", "Read depth for each allele")
vh.formats.add("DP", 1, "Integer", "Read depth")
vh.formats.add("DPS", 1, "Integer", "Read depth (supporting reads only)")
vh.formats.add("GT", 1, "String", "Genotype")
vh.formats.add("MC", ".", "Integer", "Motif copy number for each allele")
vh.formats.add("MCCI", ".", "String", "Motif copy number 95% confidence interval for each allele")
vh.formats.add("PS", 1, "Integer", "Phase set")
vh.formats.add("PM", 1, "String", "Peak-calling method (dist/snv+dist/snv/hp)")

Expand Down Expand Up @@ -108,6 +110,7 @@ def output_contig_vcf_lines(

n_alleles: int = get_n_alleles(2, params.sex_chroms, contig) or 2

res_reads = result["reads"]
res_peaks = result["peaks"] or {}
peak_seqs: list[str] = list(map(idx_0_getter, res_peaks.get("seqs", [])))
if any(map(is_none, peak_seqs)): # Occurs when no consensus for one of the peaks
Expand All @@ -122,6 +125,7 @@ def output_contig_vcf_lines(
common_suffix_idx = -1 * len(commonprefix(tuple(map(_reversed_str, (ref_seq, *seqs)))))

call = result["call"]
call_95_cis = result["call_95_cis"]

seq_alleles_raw: tuple[Optional[str], ...] = (ref_seq, *(seq_alts or (None,))) if call is not None else (".",)
seq_alleles: list[str] = [ref_start_anchor + (ref_seq[:common_suffix_idx] if common_suffix_idx else ref_seq)]
Expand All @@ -148,10 +152,13 @@ def output_contig_vcf_lines(
if am := result.get("assign_method"):
vr.samples[sample_id]["PM"] = am

vr.samples[sample_id]["DP"] = len(res_reads)

if call is not None and res_peaks:
vr.samples[sample_id]["DP"] = sum(res_peaks["n_reads"])
vr.samples[sample_id]["DPS"] = sum(res_peaks["n_reads"])
vr.samples[sample_id]["AD"] = tuple(res_peaks["n_reads"])
vr.samples[sample_id]["MC"] = tuple(map(int, call))
vr.samples[sample_id]["MCCI"] = tuple(f"{x[0]}-{x[1]}" for x in call_95_cis)

ps = result["ps"]

Expand Down Expand Up @@ -187,8 +194,6 @@ def output_contig_vcf_lines(
alleles=snv_alleles,
)

# TODO: write "rcs" for sample SNV genotypes - list of #reads per allele

snv_vr.samples[sample_id]["GT"] = tuple(map(snv_alleles.index, snv["call"]))
snv_vr.samples[sample_id]["DP"] = sum(snv["rcs"])
snv_vr.samples[sample_id]["AD"] = snv["rcs"]
Expand Down

0 comments on commit 25dea95

Please sign in to comment.