Merge branch 'release/v0.3.1'

ACEnglish · Jun 25, 2024 · 68aab27 · 68aab27
2 parents 705f05d + 6d6be53
commit 68aab27
Show file tree

Hide file tree

Showing 13 changed files with 219 additions and 192 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "kanpig"
-version = "0.3.0"
+version = "0.3.1"
 edition = "2021"
 
 [dependencies]

diff --git a/README.md b/README.md
@@ -36,7 +36,7 @@ are still being determined and will likely be dependent on things such as number
 of the variants, and sequencing technology.
 
 ### `--bed`
-Sorted bed file restricts kanpig to only analyzing variants with starts and ends within a single bed entry.
+A sorted bed file (`bedtools sort`) that restricts kanpig to only analyzing variants with starts and ends within a single bed entry.
 
 ### `--ploidy-bed`
 This bed file informs kanpig of special regions within chromosomes that should have non-diploid genotypes. For example, a female
@@ -49,18 +49,31 @@ Kanpig will build local variant graphs from windows of the genome. These windows
 of an upstream window's variants at least `chunksize` base-pairs away from the next window's variants' minimum start position.
 
 This chunksize also determines the region over which read pileups are generated. Only reads with at least `mapq` mapping quality, 
-passing the `mapflag` filter, and which fully span the minimum variant start and maximum variant end are considered. 
+passing the `mapflag` filter, and which fully span the window are considered.
 
 This is an important parameter because too small of a `chunksize` may not recruit distant read pileups which support variants. Similarly, 
-too large of a value may create windows with many SVs which are also too large for reads to fully-span. 
+too large of a value may create windows with many SVs which are also too large for reads to fully-span.
 
 ### `--sizemin` and `--sizemax`
 Variant sizes are determined by `abs(length(ALT) - length(REF))`. Genotypes of variants not within the size boundaries are set to missing (`./.`).
 
 ### `--sizesim` and `--seqsim`
 When applying a haplotype to a variant graph, only paths above these two thresholds are allowed. If there are multiple
-paths above the threshold, the one with the highest `(sizesim + seqsim) / 2` is kept. Generally, `0.90` is well balanced
-whereas lower thresholds will boost recall at the cost of precision and vice versa for higher thresholds.
+paths above the threshold, the one with the highest score is kept. Generally, `0.90` is well balanced
+whereas lower thresholds will boost recall at the cost of precision and vice versa for higher thresholds. 
+
+### `--gpenalty` and `--fpenalty`
+The similarity of a path to the graph is used to to compute a score and the highest score kept. The scoring formula is
+
+```
+Score(P) = ((SS + SZ) / 2) − (λg ⋅ ∣L(P)−E∣) - (λf ⋅ N)
+``` 
+
+where `SS` and `SZ` are sequence and size similarity,  `L(P)` is the number of nodes in the path, and `E` is the number of 
+pileups in the haplotype, and `N` is the number of putative false-negatives in the variant graph. 
+
+The penalty factor `λg` helps reduce paths with split variant representations. The penalty factor `λf` helps penalizes
+false-negatives in the variant graph. Details on how the impact of the scoring penalties are in [the wiki](https://github.com/ACEnglish/kanpig/wiki/Scoring-Function).
 
 ### `--maxpaths`
 When performing path-finding, this threshold limits the number of paths which are checked. A lower `maxpaths` will
@@ -87,8 +100,7 @@ The `SAMPLE` column fields populated by kanpig are:
 | **PS**  | Each chunk of variants is assigned a phase set |
 | **DP**  | Read coverage over the region |
 | **AD**  | Read coverage supporting the reference and alternate alleles. |
-| **SZ**  | Size similarity of the two haplotypes to this variant |
-| **SS**  | Sequence similarity of the two haplotypes to this variant |
+| **KS**  | [Kanpig score](https://github.com/ACEnglish/kanpig/wiki/Scoring-Function) |
 
 Details of `FT`
 | Flag   | Description |

diff --git a/experiments/bedtest.sh b/experiments/bedtest.sh
@@ -5,15 +5,34 @@ create() {
     #time ../target/release/kanpig \
     #time kanpig-v0.2.0-x86_64-apple-darwin/kanpig \
     time cargo run --release -- \
-        --input test_rs/test2.vcf.gz \
+        --input test_rs/chr20.vcf.gz \
+        --bam /Users/english/code/kanpig/experiments/test_rs/NA24385.chr20.bam \
+        --reference /Users/english/code/references/grch38/GRCh38_1kg_mainchrs.fa \
+        --sizemin 50 \
+        --sizesim 0.95 --seqsim 0.90 --threads 4 \
+        --maxpaths 1000 --mapq 5 --hapsim 0.98 \
+        --chunksize 100 --maxhom 0 \
+        --sample doesthiswork \
+        --bed $bed -o test_rs/hc.vcf 
+            #| bcftools sort -O z -o test_rs/hc.vcf.gz
+    # --bed /Users/english/code/kanpig/test/GRCh38_HG002-T2TQ100-V1.0_stvar.benchmark.bed \
+    # --bam /Users/english/code/kanpig/experiments/test_rs/GIABHG002.bam \
+}
+
+create_zero() {
+    #time ../target/release/kanpig \
+    #time kanpig-v0.2.0-x86_64-apple-darwin/kanpig \
+    time cargo run --release -- \
+        --input test_rs/chr20.vcf.gz \
         --bam /Users/english/code/kanpig/experiments/test_rs/NA24385.chr20.bam \
         --reference /Users/english/code/references/grch38/GRCh38_1kg_mainchrs.fa \
         --sizemin 50 \
         --sizesim 0.95 --seqsim 0.90 --threads 5 \
         --maxpaths 1000 --mapq 20 --hapsim 0.98 \
         --chunksize 100 --maxhom 0 \
         --sample doesthiswork \
-        --bed $bed -o test_rs/hc.vcf
+        --factor 0.03 \
+        --bed $bed -o test_rs/hc_zero.vcf
             #| bcftools sort -O z -o test_rs/hc.vcf.gz
     # --bed /Users/english/code/kanpig/test/GRCh38_HG002-T2TQ100-V1.0_stvar.benchmark.bed \
     # --bam /Users/english/code/kanpig/experiments/test_rs/GIABHG002.bam \
@@ -28,21 +47,28 @@ bench_lite() {
 }
 
 bench_medium() {
-    rm -rf test_rs/hcbench_noref
+    oname=${1}_bench/
+    rm -rf $oname
     truvari bench --includebed $bed \
         -b test_rs/GRCh38_HG002-T2TQ100-V1.0_stvar.vcf.gz \
-        -c test_rs/hc.vcf.gz --no-ref a -o test_rs/hcbench_noref/ \
+        -c $1 --no-ref a -o $oname \
         --pctsize 0.90 --pctseq 0.90 --pick ac
 }
 
 bench_full() {
+    oname=${1}_bench/
     truvari refine -f ~/code/references/grch38/GRCh38_1kg_mainchrs.fa -U -u -R \
-            --regions test_rs/hcbench_noref/candidate.refine.bed test_rs/hcbench_noref
+            --regions $oname/candidate.refine.bed $oname
 }
 
 create
 bcftools sort -O z -o test_rs/hc.vcf.gz test_rs/hc.vcf
 tabix test_rs/hc.vcf.gz
-#bench_lite
-bench_medium
-#bench_full
+bench_medium test_rs/hc.vcf.gz
+#bench_full test_rs/hc.vcf.gz
+
+#create_zero
+#bcftools sort -O z -o test_rs/hc_zero.vcf.gz test_rs/hc_zero.vcf
+#tabix test_rs/hc_zero.vcf.gz
+#bench_medium  test_rs/hc_zero.vcf.gz
+#bench_full test_rs/hc_zero.vcf.gz
diff --git a/src/kplib/annotator.rs b/src/kplib/annotator.rs
@@ -30,8 +30,7 @@ pub struct GenotypeAnno {
     pub gq: i32,
     pub dp: i32,
     pub ad: IntG,
-    pub zs: IntG,
-    pub ss: IntG,
+    pub ks: IntG,
     pub gt_state: metrics::GTstate,
 }
 
@@ -80,8 +79,7 @@ impl GenotypeAnno {
             Some(Value::Integer(phase_group)),
             Some(Value::Integer(self.dp)),
             Some(Value::Array(Array::Integer(self.ad.clone()))),
-            Some(Value::Array(Array::Integer(self.zs.clone()))),
-            Some(Value::Array(Array::Integer(self.ss.clone()))),
+            Some(Value::Array(Array::Integer(self.ks.clone()))),
         ]
     }
 }
@@ -93,68 +91,74 @@ fn diploid(
     paths: &[PathScore],
     coverage: u64,
 ) -> GenotypeAnno {
-    let path1 = &paths[0];
-    let path2 = &paths[1];
+    let (gt_str, gt_path, alt_cov, full_target) = match paths.len() {
+        // hom ref
+        0 => {
+            if coverage != 0 {
+                ("0|0", metrics::GTstate::Ref, 0.0, true)
+            } else {
+                ("./.", metrics::GTstate::Non, 0.0, true)
+            }
+        }
+        // het or hom
+        1 => {
+            let m_path = &paths[0];
+            if !m_path.path.contains(var_idx) {
+                ("0|0", metrics::GTstate::Ref, 0.0, true)
+            } else {
+                let alt_cov = m_path.coverage.unwrap() as f64;
+                let ref_cov = (coverage as f64) - alt_cov;
+                let (genotype, state) = match metrics::genotyper(ref_cov, alt_cov) {
+                    metrics::GTstate::Ref | metrics::GTstate::Het => ("0|1", metrics::GTstate::Het),
+                    metrics::GTstate::Hom => ("1|1", metrics::GTstate::Hom),
+                    _ => panic!("Cannot happen here"),
+                };
+                (genotype, state, alt_cov, m_path.full_target)
+            }
+        }
+        // compound het or the other one
+        2 => {
+            let path1 = &paths[0];
+            let path2 = &paths[1];
 
-    let (mut gt_str, gt_path, alt_cov, full_target) =
-        match (path1.path.contains(var_idx), path2.path.contains(var_idx)) {
-            (true, true) if path1 != path2 => (
-                "1|1",
-                metrics::GTstate::Hom,
-                (path1.coverage.unwrap() + path2.coverage.unwrap()) as f64,
-                path1.full_target && path2.full_target,
-            ),
-            // sometimes I used the same path twice
-            (true, true) => (
-                "1|1",
-                metrics::GTstate::Hom,
-                path1.coverage.unwrap() as f64,
-                path1.full_target,
-            ),
-            (true, false) => (
-                "1|0",
-                metrics::GTstate::Het,
-                path1.coverage.unwrap() as f64,
-                path1.full_target,
-            ),
-            (false, true) => (
-                "0|1",
-                metrics::GTstate::Het,
-                path2.coverage.unwrap() as f64,
-                path2.full_target,
-            ),
-            (false, false) if coverage != 0 => ("0|0", metrics::GTstate::Ref, 0.0, true),
-            (false, false) => ("./.", metrics::GTstate::Non, 0.0, true),
-        };
-    let ref_cov = (coverage as f64) - alt_cov;
-    let gt_obs = metrics::genotyper(ref_cov, alt_cov);
+            match (path1.path.contains(var_idx), path2.path.contains(var_idx)) {
+                (true, true) => (
+                    "1|1",
+                    metrics::GTstate::Hom,
+                    (path1.coverage.unwrap() + path2.coverage.unwrap()) as f64,
+                    path1.full_target || path2.full_target,
+                ),
+                (true, false) => (
+                    "1|0",
+                    metrics::GTstate::Het,
+                    path1.coverage.unwrap() as f64,
+                    path1.full_target,
+                ),
+                (false, true) => (
+                    "0|1",
+                    metrics::GTstate::Het,
+                    path2.coverage.unwrap() as f64,
+                    path2.full_target,
+                ),
+                (false, false) if coverage != 0 => ("0|0", metrics::GTstate::Ref, 0.0, true),
+                (false, false) => ("./.", metrics::GTstate::Non, 0.0, true),
+            }
+        }
+        _ => panic!("Cannot happen in diploid"),
+    };
 
-    // Alt haplotypes without a path can be filtered if we think it is
-    // more likely to be an error
-    let bcov = path1.coverage.unwrap() as f64;
-    if !path1.is_ref && *path1 == PathScore::default() && bcov < ref_cov && gt_path != gt_obs {
-        gt_str = match gt_obs {
-            metrics::GTstate::Ref => "0|0",
-            metrics::GTstate::Het => "0|1",
-            metrics::GTstate::Hom => "1|1",
-            _ => "./.",
-        };
-    }
+    let ref_cov = coverage as f64 - alt_cov;
+    let gt_obs = metrics::genotyper(ref_cov, alt_cov);
 
     // we're now assuming that ref/alt are the coverages used for these genotypes. no bueno
     let (gq, sq) = metrics::genotype_quals(ref_cov, alt_cov);
 
     let ad = vec![Some(ref_cov as i32), Some(alt_cov as i32)];
 
-    let zs = vec![
-        Some((path1.sizesim * 100.0) as i32),
-        Some((path2.sizesim * 100.0) as i32),
-    ];
-
-    let ss = vec![
-        Some((path1.seqsim * 100.0) as i32),
-        Some((path2.seqsim * 100.0) as i32),
-    ];
+    let ks: Vec<Option<i32>> = paths
+        .iter()
+        .map(|p| Some((p.score * 100.0) as i32))
+        .collect();
 
     let mut filt = FiltFlags::PASS;
     // The genotype from AD doesn't match path genotype
@@ -187,8 +191,7 @@ fn diploid(
         gq: gq.round() as i32,
         dp: coverage as i32,
         ad,
-        zs,
-        ss,
+        ks,
         gt_state: gt_path,
     }
 }
@@ -203,8 +206,7 @@ fn zero(entry: RecordBuf, coverage: u64) -> GenotypeAnno {
         gq: 0,
         dp: coverage as i32,
         ad: vec![None],
-        zs: vec![None],
-        ss: vec![None],
+        ks: vec![None],
         gt_state: metrics::GTstate::Non,
     }
 }
@@ -216,6 +218,30 @@ fn haploid(
     paths: &[PathScore],
     coverage: u64,
 ) -> GenotypeAnno {
+    if paths.is_empty() {
+        let (gt_str, gq, gt_state) = match coverage == 0 {
+            true => (".", 0, metrics::GTstate::Non),
+            false => ("0", 100, metrics::GTstate::Ref),
+        };
+
+        let mut filt = FiltFlags::PASS;
+        if coverage < 5 {
+            filt |= FiltFlags::LOWCOV;
+        }
+
+        return GenotypeAnno {
+            entry,
+            gt: gt_str.to_string(),
+            filt,
+            sq: 0,
+            gq,
+            dp: coverage as i32,
+            ad: vec![Some(coverage as i32), Some(0)],
+            ks: vec![],
+            gt_state,
+        };
+    }
+
     let path1 = &paths[0];
     let (gt_str, gt_path, alt_cov) = match path1.path.contains(var_idx) {
         true => ("1", metrics::GTstate::Hom, path1.coverage.unwrap() as f64),
@@ -230,9 +256,7 @@ fn haploid(
 
     let ad = vec![Some(ref_cov as i32), Some(alt_cov as i32)];
 
-    let zs = vec![Some((path1.sizesim * 100.0) as i32)];
-
-    let ss = vec![Some((path1.seqsim * 100.0) as i32)];
+    let ks = vec![Some((path1.score * 100.0) as i32)];
 
     let mut filt = FiltFlags::PASS;
     if gq < 5.0 {
@@ -261,8 +285,7 @@ fn haploid(
         gq: gq.round() as i32,
         dp: coverage as i32,
         ad,
-        zs,
-        ss,
+        ks,
         gt_state: gt_path,
     }
 }
diff --git a/src/kplib/bamparser.rs b/src/kplib/bamparser.rs
@@ -73,8 +73,8 @@ impl BamParser {
                     || alignment.record().mapq() < self.params.mapq
                     || (alignment.record().flags() & self.params.mapflag) != 0
                     || (self.params.spanoff
-                        && !((alignment.record().reference_start() as u64) < start
-                            && (alignment.record().reference_end() as u64) > end))
+                        && !((alignment.record().reference_start() as u64) < window_start
+                            && (alignment.record().reference_end() as u64) > window_end))
                 {
                     continue;
                 }

diff --git a/src/kplib/cli.rs b/src/kplib/cli.rs
@@ -74,7 +74,7 @@ pub struct KDParams {
     pub sizemax: u64,
 
     /// Maximum number of paths in a graph to traverse
-    #[arg(long, default_value_t = 10000)]
+    #[arg(long, default_value_t = 5000)]
     pub maxpaths: u64,
 
     /// Minimum sequence similarity for paths
@@ -93,6 +93,14 @@ pub struct KDParams {
     #[arg(long, default_value_t = 1.0)]
     pub hapsim: f32,
 
+    /// Scoring penalty for 'gaps'
+    #[arg(long, default_value_t = 0.01)]
+    pub gpenalty: f32,
+
+    /// Scoring penalty for 'fns'
+    #[arg(long, default_value_t = 0.10)]
+    pub fpenalty: f32,
+
     /// Search for a 1-to-1 match before graph traversal
     #[arg(long, default_value_t = false)]
     pub try_exact: bool,