From 25dea95db20df4017d8651cd9c2f2b77651d9363 Mon Sep 17 00:00:00 2001
From: David Lougheed <david.lougheed@gmail.com>
Date: Mon, 17 Jun 2024 12:12:09 -0400
Subject: [PATCH] feat(call): output 95% CIs and supporting read depth in VCFs

---
 docs/output_formats.md    | 14 ++++++++------
 strkit/call/output/vcf.py | 11 ++++++++---
 2 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/docs/output_formats.md b/docs/output_formats.md
index eb2cbf4..c5a454f 100644
--- a/docs/output_formats.md
+++ b/docs/output_formats.md
@@ -151,12 +151,14 @@ Example report format:
 
 VCF format fields (i.e., for each variant sample entry):
 
-* `AD`: Read depth for each allele (STR + SNV)
-* `DP`: Total read depth (STR + SNV)
-* `GT`: Genotype (STR + SNV)
-* `MC`: Motif copy number for each allele (STR only)
-* `PS`: Phase set (STR + SNV)
-* `PM`: Peak-calling method (`dist`/`single`/`snv+dist`/`snv`/`hp`; STR only)
+* `AD`: Read depth for each allele
+* `DP`: Total read depth
+* `DPS`: Total read depth; only supporting reads (for calls with incorporated SNVs mainly; STR calls only)
+* `GT`: Genotype
+* `MC`: Motif copy number for each allele (STR calls only)
+* `MCCI`: Motif copy number 95% confidence intervals for each allele (STR calls only)
+* `PS`: Phase set
+* `PM`: Peak-calling method (`dist`/`single`/`snv+dist`/`snv`/`hp`; STR calls only)
 
 VCF info. fields (i.e., for each STR variant record; not present for SNV records):
 
diff --git a/strkit/call/output/vcf.py b/strkit/call/output/vcf.py
index 5a14c22..c57a457 100644
--- a/strkit/call/output/vcf.py
+++ b/strkit/call/output/vcf.py
@@ -51,8 +51,10 @@ def build_vcf_header(sample_id: str, reference_file: str) -> pysam.VariantHeader
     # Set up basic VCF formats
     vh.formats.add("AD", ".", "Integer", "Read depth for each allele")
     vh.formats.add("DP", 1, "Integer", "Read depth")
+    vh.formats.add("DPS", 1, "Integer", "Read depth (supporting reads only)")
     vh.formats.add("GT", 1, "String", "Genotype")
     vh.formats.add("MC", ".", "Integer", "Motif copy number for each allele")
+    vh.formats.add("MCCI", ".", "String", "Motif copy number 95% confidence interval for each allele")
     vh.formats.add("PS", 1, "Integer", "Phase set")
     vh.formats.add("PM", 1, "String", "Peak-calling method (dist/snv+dist/snv/hp)")
 
@@ -108,6 +110,7 @@ def output_contig_vcf_lines(
 
         n_alleles: int = get_n_alleles(2, params.sex_chroms, contig) or 2
 
+        res_reads = result["reads"]
         res_peaks = result["peaks"] or {}
         peak_seqs: list[str] = list(map(idx_0_getter, res_peaks.get("seqs", [])))
         if any(map(is_none, peak_seqs)):  # Occurs when no consensus for one of the peaks
@@ -122,6 +125,7 @@ def output_contig_vcf_lines(
         common_suffix_idx = -1 * len(commonprefix(tuple(map(_reversed_str, (ref_seq, *seqs)))))
 
         call = result["call"]
+        call_95_cis = result["call_95_cis"]
 
         seq_alleles_raw: tuple[Optional[str], ...] = (ref_seq, *(seq_alts or (None,))) if call is not None else (".",)
         seq_alleles: list[str] = [ref_start_anchor + (ref_seq[:common_suffix_idx] if common_suffix_idx else ref_seq)]
@@ -148,10 +152,13 @@ def output_contig_vcf_lines(
         if am := result.get("assign_method"):
             vr.samples[sample_id]["PM"] = am
 
+        vr.samples[sample_id]["DP"] = len(res_reads)
+
         if call is not None and res_peaks:
-            vr.samples[sample_id]["DP"] = sum(res_peaks["n_reads"])
+            vr.samples[sample_id]["DPS"] = sum(res_peaks["n_reads"])
             vr.samples[sample_id]["AD"] = tuple(res_peaks["n_reads"])
             vr.samples[sample_id]["MC"] = tuple(map(int, call))
+            vr.samples[sample_id]["MCCI"] = tuple(f"{x[0]}-{x[1]}" for x in call_95_cis)
 
             ps = result["ps"]
 
@@ -187,8 +194,6 @@ def output_contig_vcf_lines(
                     alleles=snv_alleles,
                 )
 
-                # TODO: write "rcs" for sample SNV genotypes - list of #reads per allele
-
                 snv_vr.samples[sample_id]["GT"] = tuple(map(snv_alleles.index, snv["call"]))
                 snv_vr.samples[sample_id]["DP"] = sum(snv["rcs"])
                 snv_vr.samples[sample_id]["AD"] = snv["rcs"]