From 405ea9ad6da4089068c733dc4c9f793c4dbb0bbb Mon Sep 17 00:00:00 2001 From: Adam English Date: Mon, 13 May 2024 14:09:08 -0400 Subject: [PATCH 01/47] trying to move noodles but the api is entirely different and some functions seem to be lost --- Cargo.lock | 33 +++++++++++++++++++++++---------- Cargo.toml | 2 +- experiments/bedtest.sh | 7 ++++--- src/kplib/annotator.rs | 2 +- src/kplib/vcf_traits.rs | 16 +++++++++------- src/kplib/vcfreader.rs | 4 ++-- src/kplib/vcfwriter.rs | 4 ++-- 7 files changed, 42 insertions(+), 26 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4e7e3f6..50d187c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -107,6 +107,16 @@ version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" +[[package]] +name = "bstr" +version = "1.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05efc5cfd9110c8416e471df0e96702d58690178e206e61b7173706673c93706" +dependencies = [ + "memchr", + "serde", +] + [[package]] name = "byteorder" version = "1.5.0" @@ -540,9 +550,9 @@ dependencies = [ [[package]] name = "noodles-bgzf" -version = "0.26.0" +version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8970db2e84adb1007377dd3988258d7a64e3fc4c05602ebf94e1f8cba207c030" +checksum = "7dba1c82e9f92c00b23538359e5d191dff7ccb300cf659ee3a835af65c3cd143" dependencies = [ "byteorder", "bytes", @@ -552,15 +562,18 @@ dependencies = [ [[package]] name = "noodles-core" -version = "0.14.0" +version = "0.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7336c3be652de4e05444c9b12a32331beb5ba3316e8872d92bfdd8ef3b06c282" +checksum = "c5a8c6b020d1205abef2b0fab4463a6c5ecc3c8f4d561ca8b0d1a42323376200" +dependencies = [ + "bstr", +] [[package]] name = "noodles-csi" -version = "0.30.0" +version = "0.34.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum 
= "a60dfe0919f7ecbd081a82eb1d32e8f89f9041932d035fe8309073c8c01277bf" +checksum = "ad09737d94ec2674361219fb3d46a81561a15773585805de807cab323a15648a" dependencies = [ "bit-vec", "byteorder", @@ -571,9 +584,9 @@ dependencies = [ [[package]] name = "noodles-tabix" -version = "0.36.0" +version = "0.40.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc1ab29335a68d0c2bdf41460a67714ca69e23a1cbeb950ac5c38a9afa446a62" +checksum = "6f8da8182c56b64d28d0330ce209857ba2a5a4981c6925838e4d4ffeea82db09" dependencies = [ "bit-vec", "byteorder", @@ -585,9 +598,9 @@ dependencies = [ [[package]] name = "noodles-vcf" -version = "0.49.0" +version = "0.56.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e1f2fa749afaccadc596ec55ccb7bdcd8101fa79f8382384223c0dbae3e245b" +checksum = "2576e5b1e12d729c93d30ba25676b4a3efb19134f14f30fa30ae70a064d12eb2" dependencies = [ "indexmap", "memchr", diff --git a/Cargo.toml b/Cargo.toml index b9e61a0..3e3d023 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,7 +12,7 @@ indicatif = "0.17.8" itertools = { version = "0.12.1" } lazy_static = "1.4.0" log = { version = "0.4", features = ["std", "serde"] } -noodles-vcf = { version = "0.49.0" } +noodles-vcf = { version = "0.56.0" } ordered-float = { version = "4.0", default-features = false } page_size = "0.6.0" petgraph = { version = "0.6.2" } diff --git a/experiments/bedtest.sh b/experiments/bedtest.sh index 1d12d63..0ffd9ee 100644 --- a/experiments/bedtest.sh +++ b/experiments/bedtest.sh @@ -3,14 +3,14 @@ bed=test_rs/test.chr20.bed create() { #../target/release/kanpig \ - time cargo run --release -- \ + time cargo run -- \ --input test_rs/test2.vcf.gz \ --bam /Users/english/code/kanpig/experiments/test_rs/NA24385.chr20.bam \ --reference /Users/english/code/references/grch38/GRCh38_1kg_mainchrs.fa \ --sizemin 50 \ --sizesim 0.95 --seqsim 0.90 --threads 4 \ --maxpaths 20000 --mapq 5 --hapsim 0.98 \ - --chunksize 100 --maxhom 5 \ + --chunksize 100 
--maxhom 5 --prune --try-exact \ -o test_rs/hc.vcf --bed $bed # --bed /Users/english/code/kanpig/test/GRCh38_HG002-T2TQ100-V1.0_stvar.benchmark.bed \ # --bam /Users/english/code/kanpig/experiments/test_rs/GIABHG002.bam \ @@ -29,6 +29,7 @@ bench_medium() { truvari bench --includebed $bed \ -b test_rs/GRCh38_HG002-T2TQ100-V1.0_stvar.vcf.gz \ -c test_rs/hc.vcf.gz --no-ref a -o test_rs/hcbench_noref/ \ + -s 5 \ --pctsize 0.90 --pctseq 0.90 } @@ -42,4 +43,4 @@ bcftools sort -O z -o test_rs/hc.vcf.gz test_rs/hc.vcf tabix test_rs/hc.vcf.gz #bench_lite bench_medium -#bench_full +bench_full diff --git a/src/kplib/annotator.rs b/src/kplib/annotator.rs index 9693b0c..1f85b65 100644 --- a/src/kplib/annotator.rs +++ b/src/kplib/annotator.rs @@ -3,7 +3,7 @@ use bitflags::bitflags; use petgraph::graph::NodeIndex; -use noodles_vcf::{self as vcf, record::genotypes::sample::Value}; +use noodles_vcf::{self as vcf, record::sample::Value}; bitflags! { pub struct FiltFlags: u32 { diff --git a/src/kplib/vcf_traits.rs b/src/kplib/vcf_traits.rs index dbe2116..2793e2d 100644 --- a/src/kplib/vcf_traits.rs +++ b/src/kplib/vcf_traits.rs @@ -1,6 +1,8 @@ use crate::kplib::seq_to_kmer; use noodles_vcf::{ - self as vcf, record::alternate_bases::allele, record::info::field, record::Filters, + self as vcf, + variant::record::info::field::value::Value, + variant::record::info::field::key::Key, }; use std::cmp::Ordering; use std::str::FromStr; @@ -74,11 +76,11 @@ impl KdpVcf for vcf::Record { fn size(&self) -> u64 { let svlen = self .info() - .get(&field::Key::from_str("SVLEN").unwrap_or_else(|_| panic!("No SVLEN INFO"))); + .get(&Key::from_str("SVLEN").unwrap_or_else(|_| panic!("No SVLEN INFO"))); - if let Some(Some(field::Value::Integer(svlen))) = svlen { + if let Some(Some(Value::Integer(svlen))) = svlen { return svlen.unsigned_abs() as u64; - } else if let Some(Some(field::Value::Array(field::value::Array::Integer(svlen)))) = svlen { + } else if let 
Some(Some(Value::Array(field::value::Array::Integer(svlen)))) = svlen { return svlen .first() .unwrap_or_else(|| panic!("Bad SVLEN")) @@ -119,11 +121,11 @@ impl KdpVcf for vcf::Record { fn variant_type(&self) -> Svtype { match self .info() - .get(&field::Key::from_str("SVTYPE").expect("Unable to make key")) + .get(&Key::from_str("SVTYPE").expect("Unable to make key")) { // INFO/SVTYPE - Some(Some(field::Value::String(svtype))) => svtype.parse().expect("Bad SVTYPE"), - Some(Some(field::Value::Array(field::value::Array::String(svtype)))) => svtype + Some(Some(Value::String(svtype))) => svtype.parse().expect("Bad SVTYPE"), + Some(Some(Value::Array(field::value::Array::String(svtype)))) => svtype .first() .cloned() .unwrap_or_else(|| panic!("Bad SVTYPE")) diff --git a/src/kplib/vcfreader.rs b/src/kplib/vcfreader.rs index 9849580..37c9346 100644 --- a/src/kplib/vcfreader.rs +++ b/src/kplib/vcfreader.rs @@ -8,7 +8,7 @@ use crate::kplib::{KDParams, KdpVcf, Regions, VcfWriter}; /// Takes a vcf and filtering parameters to create in iterable which will /// return chunks of variants in the same neighborhood pub struct VcfChunker<'a, R: BufRead> { - pub m_vcf: vcf::reader::Reader, + pub m_vcf: vcf::io::Reader, pub m_header: vcf::Header, regions: Regions, params: KDParams, @@ -27,7 +27,7 @@ pub struct VcfChunker<'a, R: BufRead> { impl<'a, R: BufRead> VcfChunker<'a, R> { pub fn new( - m_vcf: vcf::reader::Reader, + m_vcf: vcf::io::Reader, m_header: vcf::Header, regions: Regions, params: KDParams, diff --git a/src/kplib/vcfwriter.rs b/src/kplib/vcfwriter.rs index 8b858a3..33302a7 100644 --- a/src/kplib/vcfwriter.rs +++ b/src/kplib/vcfwriter.rs @@ -13,7 +13,7 @@ use noodles_vcf::{ }; pub struct VcfWriter { - writer: vcf::Writer>, + writer: vcf::io::Writer>, header: vcf::Header, keys: Keys, pub gtcounts: HashMap, @@ -129,7 +129,7 @@ impl VcfWriter { page_size::get() * 1000, File::create(out_path).expect("Error Creating Output File"), ); - let mut writer = vcf::Writer::new(out_buf); 
+ let mut writer = vcf::io::Writer::new(out_buf); let _ = writer.write_header(&header); Self { From 667dee68308902e09edc7422b958a7da2540a704 Mon Sep 17 00:00:00 2001 From: Adam English Date: Mon, 13 May 2024 15:45:52 -0400 Subject: [PATCH 02/47] progress a little bit --- src/kplib/vcf_traits.rs | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/src/kplib/vcf_traits.rs b/src/kplib/vcf_traits.rs index 2793e2d..2d2d728 100644 --- a/src/kplib/vcf_traits.rs +++ b/src/kplib/vcf_traits.rs @@ -39,6 +39,8 @@ pub trait KdpVcf { fn size(&self) -> u64; fn is_filtered(&self) -> bool; fn variant_type(&self) -> Svtype; + fn is_symbolic(&self) -> bool; + fn is_bnd(&self) -> bool; } impl KdpVcf for vcf::Record { @@ -89,13 +91,14 @@ impl KdpVcf for vcf::Record { } let r_len: u64 = self.reference_bases().len() as u64; - let a_len: u64 = match self.alternate_bases().first() { - Some(allele::Allele::Bases(alt)) => alt.len() as u64, - Some(allele::Allele::Symbol(_alt)) => { + let a_len: u64 = if self.is_symbolic() { let (start, end) = self.boundaries(); start.abs_diff(end) + 1 - } - _ => 0, + } else { + match self.alternate_bases().first() { + Some(alt) => alt.len(), + None => 0 + } }; if r_len == a_len { @@ -148,4 +151,20 @@ impl KdpVcf for vcf::Record { }, } } + + /// Checks if its a symbolic allele e.g. 
+ /// Returns false if its a monozygotic reference + fn is_symbolic(&self) -> bool { + match self.alternate_bases().first() { + Some(alt) => alt.contains('<'), + None => false + } + } + + fn is_bnd(&self) -> bool { + match self.alternate_bases().first() { + Some(alt) => (alt.contains('[') || alt.contains(']')) && alt.contains(':'), + None => false + } + } } From d3cbb4ceac799577dbd2417d784b0c226c431ad5 Mon Sep 17 00:00:00 2001 From: Adam English Date: Tue, 21 May 2024 17:33:18 -0400 Subject: [PATCH 03/47] move changelog to wiki --- changelog.md | 31 ------------------------------- 1 file changed, 31 deletions(-) delete mode 100644 changelog.md diff --git a/changelog.md b/changelog.md deleted file mode 100644 index d70df77..0000000 --- a/changelog.md +++ /dev/null @@ -1,31 +0,0 @@ -v0.2.0 -*in progress* - -* Up to 40% reduction in runtime -* Hemizygous and sex chromosome aware genotyping with new `--ploidy-bed` -* Variants with alternate alleles of stars, monozygotic reference, and BNDs are filtered out - -v0.1.2 -*May 5, 2024* - -* New optional hompolymer filter doesn't kmerize long homopolymers -* Improved logging info -* Correcting GQ field -* Correcting kmer counting -* Small speed/memory/io improvements - * Off-loaded annotation work from the single writer thread to the worker threads and using a large - multiple of page size for the BufWriter capacity - * Fewer bam file opens - -v0.1.1 -*Apr 11, 2024* - -* The `--no-prune` flag has been changed to `--prune` since not pruning is a better default. -* Partial haplotypes now only allow up to 3 false negatives for regions with fewer than 500 pileups. More than 500 do - not attempt partials. -* Partial haplotypes now respect the `--kmer` option. - -v0.1.0 -*Apr 9, 2024* - -Initial version. Works well enough to freeze. 
From 9120a8b020f09583508032543b411303a127abd3 Mon Sep 17 00:00:00 2001 From: Adam English Date: Wed, 22 May 2024 02:11:44 -0400 Subject: [PATCH 04/47] Update README.md doc cleaning --- README.md | 80 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 42 insertions(+), 38 deletions(-) diff --git a/README.md b/README.md index 0979b32..bace2c7 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ A fast tool for genotyping structural variants with long-reads. *Kanpig is currently under active research and development. We make no guarantees about its accuracy or the stability of features before version 1.0.* -# Install +# 📥 Install ``` git clone https://github.com/ACEnglish/kanpig cd kanpig @@ -15,20 +15,20 @@ cargo build --release ``` Alternatively, binaries are available in [releases](https://github.com/ACEnglish/kanpig/releases). -# Quick Start +# 🚀 Quick Start ``` kanpig --input variant.vcf.gz --bam alignments.bam --reference ref.fa --out output.vcf ``` See `kanpig -h` for all available parameters, most of which are detailed below. -# Current Limitations +# ⚠️ Current Limitations * Kanpig expects sequence resolved SVs. Variants with symbolic alts (e.g. ``) and BNDs are not parsed. * Kanpig only looks at read pileups and does not consider split or soft-clipped alignment information. This means variants above ~10kbp should be skipped with the `--sizemax` parameter. -* Please do not publish manuscripts with kanpig results until we've completed our manuscript. We're aiming to have a preprint +* Please do not publish manuscripts benchmarking kanpig results until we've completed our manuscript. We're aiming to have a preprint available early Q3 2024. -# Core Parameter Details +# 🔧 Core Parameter Details The default parameters are tuned to work generally well for genotyping a single sample's VCF, meaning the variants are all expected to be present in the sample. For a multi-sample VCF (a.k.a. 
a project-level VCF), the optimal parameters @@ -41,32 +41,28 @@ Sorted bed file restricts kanpig to only analyzing variants with starts and ends ### `--ploidy-bed` This bed file informs kanpig of special regions within chromosomes that should have non-diploid genotypes. For example, a female human sample shouldn't have any genotypes on chrY. A male human sample should have hemizygous genotypes on chrY and the -non-pseudoautosomal regions of chrX. The `ploidy_beds/` directory has example bed files for GRCh38. All regions not -within the `ploidy-bed` (or if no bed is provided) are assumed to be diploid. - -### `--hapsim` -After performing kmeans clustering on reads to determine the two haplotypes, if the two haplotypes have a size similarity above `hapsim`, they -are consolidated into a homozygous allele. +non-pseudoautosomal regions of chrX. The [ploidy_beds/](https://github.com/ACEnglish/kanpig/tree/develop/ploidy_beds) directory +has example bed files for GRCh38. All regions not within the `--ploidy-bed` (or if no bed is provided) are assumed to be diploid. ### `--chunksize` -Kanpig will build local variant graphs from windows of the genome. These windows are determined by `chunksize` where -the maximum end position of an upstream window's variants is at least `chunksize` base-pairs away from the next window's +Kanpig will build local variant graphs from windows of the genome. These windows are determined by `--chunksize` where +the maximum end position of an upstream window's variants is at least `--chunksize` base-pairs away from the next window's variants' minimum start position. This chunksize also determins the region over which read pileups are generated. Only reads which pass the `--mapq` and `--mapflag` filter are considered. Also, reads must fully span the minimum variant start and maximum variant end. 
-This is an important parameter because too small of a `chunksize` may not recruit read pileups which support variants +This is an important parameter because too small of a `--chunksize` may not recruit read pileups which support variants but are far away. Similarly, too large of a value may create windows with many SVs which are also too large for reads to have a fully-spanning alignment. ### `--sizemin` and `--sizemax` Variant sizes are determined by `INFO/SVLEN`. If `INFO/SVLEN` tag is not in the VCF entry, the variant's size is set as -`abs(length(ALT) - length(REF))`. +`abs(length(ALT) - length(REF))`. Genotypes of variants not within the size boundaries are set to missing (`./.`). ### `--sizesim` and `--seqsim` When applying a haplotype to a variant graph, only paths above these two thresholds are allowed. If there are multiple -paths above the threshold, the one with the higher `(sizesim + seqsim) / 2` is kept. Generally, `0.90` is well balanced +paths above the threshold, the one with the highest `(sizesim + seqsim) / 2` is kept. Generally, `0.90` is well balanced whereas lower thresholds will boost recall at the cost of precision and vice versa for higher thresholds. ### `--maxpaths` @@ -74,39 +70,47 @@ When performing path-finding, this threshold limits the number of paths which ar speed up runtime but may come at a cost of recall. A higher `--maxpaths` is slower and may come at a cost to specificity. +### `--hapsim` +After performing kmeans clustering on reads to determine the two haplotypes, if the two haplotypes have a size similarity +above `--hapsim`, they are consolidated into a homozygous allele. + ### `--threads` Number of analysis threads to use. Note that in addition to the analysis threads, kanpig keeps one dedicated IO thread for VCF reading and writing. -# Annotations +# 📝 Annotations The `SAMPLE` column fields populated by kanpig are: -* FT - Bit flag for properties of the variant's genotyping. Flags == 0 are considered PASS. 
The bits definitions are: - * 0x1 - The genotype observed from variants matching paths is not equal to the genotype observed from measuring the - proportions of reads supporting the two alleles. - * 0x2 - The genotype quality is less than 5 - * 0x4 - The depth (DP) is less than 5 - * 0x8 - The sample quality (SQ) is less than 5 (only present on non-ref variants) - * 0x16 - The number of reads supporting the alternate allele less than 5 (only present on non-ref variants) - * 0x32 - The best scoring path through the variant graph only used part of the haplotype. This may be indicative of a - false-negative in the variant graph. -* SQ - Phred scaled likelihood variant alternate is present in the sample -* GQ - Phred scale difference between most and second-most likely genotypes -* PG - Each chunk of variants is assigned a phase group -* DP - Read coverage over the region -* AD - Read coverage supporting the reference and alternate alleles. -* SZ - Size similarity of the two haplotypes to this variant -* SS - Sequence similarity of the two haplotypes to this variant - -# Experimental Parameter Details +| Field | Description | +|---------|-------------| +| **FT** | Bit flag for properties of the variant's genotyping. Flags == 0 are considered PASS. | +| **SQ** | Phred scaled likelihood variant alternate is present in the sample | +| **GQ** | Phred scale difference between most and second-most likely genotypes | +| **PG** | Each chunk of variants is assigned a phase group | +| **DP** | Read coverage over the region | +| **AD** | Read coverage supporting the reference and alternate alleles. | +| **SZ** | Size similarity of the two haplotypes to this variant | +| **SS** | Sequence similarity of the two haplotypes to this variant | + +Details of `FT` +| Flag | Description | +|--------|-------------| +| 0x1 | The genotype observed from variants matching paths is not equal to the genotype observed from measuring the proportions of reads supporting the two alleles. 
| +| 0x2 | The genotype quality is less than 5 | +| 0x4 | The depth (DP) is less than 5 | +| 0x8 | The sample quality (SQ) is less than 5 (only present on non-ref variants) | +| 0x16 | The number of reads supporting the alternate allele less than 5 (only present on non-ref variants) | +| 0x32 | The best scoring path through the variant graph only used part of the haplotype. This may be indicative of a false-negative in the variant graph. | + +# 🔬 Experimental Parameter Details These parameters have a varying effect on the results and are not guaranteed to be stable across releases. ### `--try-exact` -Before performing the path-finding algorithm that applies haplotypes to the variant graph, perform a 1-to-1 comparison -of the haplotypes to each node in the variant graph. If a single node matches above `--sizesim` and `--seqsim`, the -path-finding is skipped and haplotype applied to support the node. +Before performing the path-finding algorithm that applies a haplotype to the variant graph, perform a 1-to-1 comparison +of the haplotype to each node in the variant graph. If a single node matches above `--sizesim` and `--seqsim`, the +path-finding is skipped and haplotype applied to the node. This parameter will boost the specificity and speed of kanpig at the cost of recall. From cada4bc2d32dde5a85d8bf71da7b116d340c777b Mon Sep 17 00:00:00 2001 From: Adam English Date: Wed, 22 May 2024 02:21:07 -0400 Subject: [PATCH 05/47] Update README.md --- README.md | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index bace2c7..b63554d 100644 --- a/README.md +++ b/README.md @@ -45,16 +45,14 @@ non-pseudoautosomal regions of chrX. The [ploidy_beds/](https://github.com/ACEng has example bed files for GRCh38. All regions not within the `--ploidy-bed` (or if no bed is provided) are assumed to be diploid. ### `--chunksize` -Kanpig will build local variant graphs from windows of the genome. 
These windows are determined by `--chunksize` where -the maximum end position of an upstream window's variants is at least `--chunksize` base-pairs away from the next window's -variants' minimum start position. +Kanpig will build local variant graphs from windows of the genome. These windows are determined by making the maximum end position +of an upstream window's variants at least `chunksize` base-pairs away from the next window's variants' minimum start position. -This chunksize also determins the region over which read pileups are generated. Only reads which pass the `--mapq` and -`--mapflag` filter are considered. Also, reads must fully span the minimum variant start and maximum variant end. +This chunksize also determines the region over which read pileups are generated. Only reads with at least `mapq` mapping quality, +passing the `mapflag` filter, and which fully span the minimum variant start and maximum variant end are considered. -This is an important parameter because too small of a `--chunksize` may not recruit read pileups which support variants -but are far away. Similarly, too large of a value may create windows with many SVs which are also too large for reads to have a -fully-spanning alignment. +This is an important parameter because too small of a `chunksize` may not recruit read pileups that support variants but are +further away. Similarly, too large of a value may create windows with many SVs which are also too large for reads to fully-span. ### `--sizemin` and `--sizemax` Variant sizes are determined by `INFO/SVLEN`. If `INFO/SVLEN` tag is not in the VCF entry, the variant's size is set as @@ -66,13 +64,13 @@ paths above the threshold, the one with the highest `(sizesim + seqsim) / 2` is whereas lower thresholds will boost recall at the cost of precision and vice versa for higher thresholds. ### `--maxpaths` -When performing path-finding, this threshold limits the number of paths which are checked. 
A lower `--maxpaths` will -speed up runtime but may come at a cost of recall. A higher `--maxpaths` is slower and may come at a cost to +When performing path-finding, this threshold limits the number of paths which are checked. A lower `maxpaths` will +speed up runtime but may come at a cost of recall. A higher `maxpaths` is slower and may come at a cost to specificity. ### `--hapsim` After performing kmeans clustering on reads to determine the two haplotypes, if the two haplotypes have a size similarity -above `--hapsim`, they are consolidated into a homozygous allele. +above `hapsim`, they are consolidated into a homozygous allele. ### `--threads` Number of analysis threads to use. Note that in addition to the analysis threads, kanpig keeps one dedicated IO thread @@ -109,21 +107,21 @@ These parameters have a varying effect on the results and are not guaranteed to ### `--try-exact` Before performing the path-finding algorithm that applies a haplotype to the variant graph, perform a 1-to-1 comparison -of the haplotype to each node in the variant graph. If a single node matches above `--sizesim` and `--seqsim`, the +of the haplotype to each node in the variant graph. If a single node matches above `sizesim` and `seqsim`, the path-finding is skipped and haplotype applied to the node. This parameter will boost the specificity and speed of kanpig at the cost of recall. ### `--prune` -Similar to `--try-exact`, a 1-to-1 comparison is performed before path-finding. If any matches are found, all paths +Similar to `try-exact`, a 1-to-1 comparison is performed before path-finding. If any matches are found, all paths which do not traverse the matching nodes are pruned from the variant graph. This parameter will boost the specificity and speed of kanpig at the cost of recall. ### `--maxhom` -When performing kmer-featurization of sequences (from reads or variants), homopolymer runs above `--maxhom` are trimmed -to `--maxhom`. 
For example, `--maxhom 5` will only count two four-mers in all homopolymer runs above 5bp. +When performing kmer-featurization of sequences (from reads or variants), homopolymer runs above `maxhom` are trimmed +to `maxhom`. For example, `--maxhom 5` will only count two four-mers in homopolymer runs above 5bp. ### `--spanoff` From 337917f420743a82ccaf72992bd2e9c2c42041f0 Mon Sep 17 00:00:00 2001 From: Adam English Date: Wed, 22 May 2024 12:01:41 -0400 Subject: [PATCH 06/47] New IO Thread pattern The writer has its own dedicated thread which can immediately start pumping out results. This should prevent the initial spike in memory usage while input is being read. --- Cargo.lock | 2 +- README.md | 8 +- experiments/bedtest.sh | 2 +- src/kplib/bedparser.rs | 2 +- src/kplib/vcfreader.rs | 33 ++++--- src/kplib/vcfwriter.rs | 7 +- src/main.rs | 193 +++++++++++++++++++++++++---------------- 7 files changed, 146 insertions(+), 101 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5cae7be..ccb314c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -445,7 +445,7 @@ dependencies = [ [[package]] name = "kanpig" -version = "0.2.0-dev" +version = "0.2.0" dependencies = [ "bitflags", "clap", diff --git a/README.md b/README.md index 0979b32..b0f7bb8 100644 --- a/README.md +++ b/README.md @@ -44,10 +44,6 @@ human sample shouldn't have any genotypes on chrY. A male human sample should ha non-pseudoautosomal regions of chrX. The `ploidy_beds/` directory has example bed files for GRCh38. All regions not within the `ploidy-bed` (or if no bed is provided) are assumed to be diploid. -### `--hapsim` -After performing kmeans clustering on reads to determine the two haplotypes, if the two haplotypes have a size similarity above `hapsim`, they -are consolidated into a homozygous allele. - ### `--chunksize` Kanpig will build local variant graphs from windows of the genome. 
These windows are determined by `chunksize` where the maximum end position of an upstream window's variants is at least `chunksize` base-pairs away from the next window's @@ -74,6 +70,10 @@ When performing path-finding, this threshold limits the number of paths which ar speed up runtime but may come at a cost of recall. A higher `--maxpaths` is slower and may come at a cost to specificity. +### `--hapsim` +After performing kmeans clustering on reads to determine the two haplotypes, if the two haplotypes have a size similarity above +`hapsim`, they are consolidated into a homozygous allele. + ### `--threads` Number of analysis threads to use. Note that in addition to the analysis threads, kanpig keeps one dedicated IO thread for VCF reading and writing. diff --git a/experiments/bedtest.sh b/experiments/bedtest.sh index d44e325..7a0c87d 100644 --- a/experiments/bedtest.sh +++ b/experiments/bedtest.sh @@ -11,7 +11,7 @@ create() { --sizemin 50 \ --sizesim 0.95 --seqsim 0.90 --threads 4 \ --maxpaths 20000 --mapq 5 --hapsim 0.98 \ - --chunksize 100 --maxhom 5 --try-exact --prune \ + --chunksize 100 --maxhom 5 \ --sample doesthiswork \ -o test_rs/hc.vcf --bed $bed # --bed /Users/english/code/kanpig/test/GRCh38_HG002-T2TQ100-V1.0_stvar.benchmark.bed \ diff --git a/src/kplib/bedparser.rs b/src/kplib/bedparser.rs index 5e8eb22..562c9f8 100644 --- a/src/kplib/bedparser.rs +++ b/src/kplib/bedparser.rs @@ -58,7 +58,7 @@ impl BedParser { }; if chrom != self.prev_chrom { - self.prev_chrom = chrom.clone(); + self.prev_chrom.clone_from(&chrom); self.prev_start = 0; } diff --git a/src/kplib/vcfreader.rs b/src/kplib/vcfreader.rs index 2862c52..c387874 100644 --- a/src/kplib/vcfreader.rs +++ b/src/kplib/vcfreader.rs @@ -1,13 +1,13 @@ +use crate::kplib::{GenotypeAnno, KDParams, KdpVcf, Ploidy, Regions}; +use crossbeam_channel::Sender; +use noodles_vcf::{self as vcf}; +use petgraph::graph::NodeIndex; use std::collections::VecDeque; use std::io::BufRead; -use noodles_vcf::{self as vcf}; 
- -use crate::kplib::{KDParams, KdpVcf, Regions, VcfWriter}; - /// Takes a vcf and filtering parameters to create in iterable which will /// return chunks of variants in the same neighborhood -pub struct VcfChunker<'a, R: BufRead> { +pub struct VcfChunker { pub m_vcf: vcf::reader::Reader, pub m_header: vcf::Header, regions: Regions, @@ -20,18 +20,18 @@ pub struct VcfChunker<'a, R: BufRead> { // next chunk hold_entry: Option, chunk_count: u64, - call_count: u64, - skip_count: u64, - writer: &'a mut VcfWriter, + pub call_count: u64, + pub skip_count: u64, + result_sender: Sender>>, } -impl<'a, R: BufRead> VcfChunker<'a, R> { +impl VcfChunker { pub fn new( m_vcf: vcf::reader::Reader, m_header: vcf::Header, regions: Regions, params: KDParams, - writer: &'a mut VcfWriter, + result_sender: Sender>>, ) -> Self { Self { m_vcf, @@ -44,7 +44,7 @@ impl<'a, R: BufRead> VcfChunker<'a, R> { chunk_count: 0, call_count: 0, skip_count: 0, - writer, + result_sender, } } @@ -99,7 +99,6 @@ impl<'a, R: BufRead> VcfChunker<'a, R> { /// Return the next vcf entry which passes parameter conditions fn get_next_entry(&mut self) -> Option { - //let mut entry = vcf::Record::default(); let mut entry = vcf::Record::default(); loop { @@ -114,7 +113,13 @@ impl<'a, R: BufRead> VcfChunker<'a, R> { return Some(entry); } else { self.skip_count += 1; - self.writer.write_entry(entry.clone()); + let _ = self.result_sender.send(Some(vec![GenotypeAnno::new( + entry.clone(), + &NodeIndex::new(0), + &[], + 0, + &Ploidy::Zero, + )])); } } } @@ -144,7 +149,7 @@ impl<'a, R: BufRead> VcfChunker<'a, R> { } } -impl<'a, R: BufRead> Iterator for VcfChunker<'a, R> { +impl Iterator for VcfChunker { type Item = Vec; fn next(&mut self) -> Option { diff --git a/src/kplib/vcfwriter.rs b/src/kplib/vcfwriter.rs index 7a9adab..c58b93c 100644 --- a/src/kplib/vcfwriter.rs +++ b/src/kplib/vcfwriter.rs @@ -1,10 +1,9 @@ +use crate::kplib::{metrics::GTstate, GenotypeAnno}; use std::collections::HashMap; use std::fs::File; use 
std::io::BufWriter; use std::path::PathBuf; -use crate::kplib::{metrics::GTstate, GenotypeAnno}; - use noodles_vcf::{ self as vcf, header::record::value::map::format, @@ -36,7 +35,7 @@ impl VcfWriter { } }; - if header.sample_names().len() >= 1 { + if !header.sample_names().is_empty() { warn!( "clearing {} sample columns in output", header.sample_names().len() @@ -148,7 +147,7 @@ impl VcfWriter { let _result = self.writer.write_record(&self.header, &annot.entry); } - pub fn write_entry(&mut self, mut entry: vcf::Record) { + pub fn __write_entry(&mut self, mut entry: vcf::Record) { *entry.genotypes_mut() = Genotypes::new( "GT".parse().unwrap(), vec![vec![ diff --git a/src/main.rs b/src/main.rs index 9978699..75290ff 100644 --- a/src/main.rs +++ b/src/main.rs @@ -4,10 +4,12 @@ extern crate pretty_env_logger; extern crate log; use clap::Parser; -use crossbeam_channel::{select, unbounded, Receiver, Sender}; -use indicatif::{ProgressBar, ProgressStyle}; +use crossbeam_channel::{unbounded, Receiver, Sender}; +//use indicatif::{ProgressBar, ProgressStyle}; use noodles_vcf::{self as vcf}; +use std::sync::{Arc, Mutex}; use std::thread; +use std::thread::JoinHandle; mod kplib; use kplib::{ @@ -46,112 +48,151 @@ fn main() { let ploidy = PloidyRegions::new(&args.io.ploidy_bed); - let mut writer = VcfWriter::new(&args.io.out, input_header.clone(), &args.io.sample); + let (result_sender, result_receiver): ( + Sender>, + Receiver>, + ) = unbounded(); + /* + * Each Thread is given a result_sender + * They don't need to do anything different except + * But spawning them I do need to collect JoinHandlers + * The VcfWriter needs to start first + * Needs to be joinable + * Then VcfChunker is run, sending things to the Threads + * We then join the threads + * We then send a None to VcfWriter so it knows when to stop + * We then join the VcfWriter + * We are then finished + */ + // This needs a channel for results + // These results may be genotype annos or blanks. 
+ // But I have to make them a single type, so I'll send filtered as genotype anno from + // take_annotated(&[], 0, Ploidy::Zero) + let writer = Arc::new(Mutex::new(VcfWriter::new( + &args.io.out, + input_header.clone(), + &args.io.sample, + ))); + // This blanks work will need to be done by the chunker. So the chunker doesn't get + // the writer but the channel and it won't write_entry it will write_anno // We send the writer to the reader so that we can pipe filtered variants forward let mut m_input = VcfChunker::new( input_vcf, input_header.clone(), tree, args.kd.clone(), - &mut writer, + result_sender.clone(), ); // Create channels for communication between threads - let (sender, receiver): (Sender>, Receiver>) = unbounded(); - let (result_sender, result_receiver): (Sender, Receiver) = unbounded(); + let (task_sender, task_receiver): (Sender>, Receiver>) = + unbounded(); info!("spawning {} threads", args.io.threads); - for _ in 0..args.io.threads { - let m_args = args.clone(); - let receiver = receiver.clone(); - let result_sender = result_sender.clone(); - let m_ploidy = ploidy.clone(); - thread::spawn(move || { - let mut m_bam = BamParser::new(m_args.io.bam, m_args.io.reference, m_args.kd.clone()); - for chunk in receiver.into_iter().flatten() { - let mut m_graph = Variants::new(chunk, m_args.kd.kmer, m_args.kd.maxhom); - - let ploidy = m_ploidy.get_ploidy(&m_graph.chrom, m_graph.start); - // For zero, we don't have to waste time going into the bam - if ploidy == Ploidy::Zero { - result_sender - .send(m_graph.take_annotated(&[], 0, &ploidy)) - .unwrap(); - continue; + let task_handles: Vec> = (0..args.io.threads) + .map(|_| { + let m_args = args.clone(); + let m_receiver = task_receiver.clone(); + let result_sender = result_sender.clone(); + let m_ploidy = ploidy.clone(); + thread::spawn(move || { + let mut m_bam = + BamParser::new(m_args.io.bam, m_args.io.reference, m_args.kd.clone()); + loop { + match m_receiver.recv() { + Ok(None) | Err(_) => break, + 
Ok(Some(chunk)) => { + let mut m_graph = + Variants::new(chunk, m_args.kd.kmer, m_args.kd.maxhom); + + let ploidy = m_ploidy.get_ploidy(&m_graph.chrom, m_graph.start); + // For zero, we don't have to waste time going into the bam + if ploidy == Ploidy::Zero { + result_sender + .send(Some(m_graph.take_annotated(&[], 0, &ploidy))) + .unwrap(); + continue; + } + + let (haps, coverage) = + m_bam.find_haps(&m_graph.chrom, m_graph.start, m_graph.end); + + let haps = match ploidy { + Ploidy::Haploid => haploid_haplotypes(haps, coverage, &m_args.kd), + _ => diploid_haplotypes(haps, coverage, &m_args.kd), + // and then eventually this could allow a --ploidy flag to branch to + // polyploid_haplotypes + }; + + let paths: Vec = haps + .iter() + .map(|h| m_graph.apply_coverage(h, &m_args.kd)) + .collect(); + + result_sender + .send(Some(m_graph.take_annotated(&paths, coverage, &ploidy))) + .unwrap(); + } + } + } + info!("I finished"); + // This should give a result + }) + }) + .collect(); + + //Before we start the workers, we'll start the writer + let cloned_writer = writer.clone(); + let write_handler = std::thread::spawn(move || { + // When this goes out of scope (the thread finishes) + // So the Mutex might be my write_handler + let mut phase_group: i32 = 0; + loop { + match result_receiver.recv() { + Ok(None) | Err(_) => break, + Ok(Some(result)) => { + let mut m_writer = cloned_writer.lock().unwrap(); + for entry in result { + m_writer.anno_write(entry, phase_group); + } + phase_group += 1; } - - let (haps, coverage) = m_bam.find_haps(&m_graph.chrom, m_graph.start, m_graph.end); - - let haps = match ploidy { - Ploidy::Haploid => haploid_haplotypes(haps, coverage, &m_args.kd), - _ => diploid_haplotypes(haps, coverage, &m_args.kd), - // and then eventually this could allow a --ploidy flag to branch to - // polyploid_haplotypes - }; - - let paths: Vec = haps - .iter() - .map(|h| m_graph.apply_coverage(h, &m_args.kd)) - .collect(); - - result_sender - 
.send(m_graph.take_annotated(&paths, coverage, &ploidy)) - .unwrap(); } - }); - } + } + }); // Send items to worker threads let mut num_chunks: u64 = 0; info!("parsing input"); for i in &mut m_input { - sender.send(Some(i)).unwrap(); + task_sender.send(Some(i)).unwrap(); num_chunks += 1; } if num_chunks == 0 { error!("No variants to be analyzed"); + // This might need to still join. std::process::exit(1); } // Signal worker threads to exit for _ in 0..args.io.threads { - sender.send(None).unwrap(); + task_sender.send(None).unwrap(); } - info!("collecting output"); - let sty = - ProgressStyle::with_template(" [{elapsed_precise}] {bar:44.cyan/blue} > {pos} completed") - .unwrap() - .progress_chars("##-"); - let pbar = ProgressBar::new(num_chunks); - pbar.set_style(sty.clone()); - - let mut phase_group: i32 = 0; - loop { - select! { - recv(result_receiver) -> result => { - match result { - Ok(annotated_entries) => { - for entry in annotated_entries { - writer.anno_write(entry, phase_group); - } - phase_group += 1; - pbar.inc(1); - if phase_group as u64 == num_chunks { - break; - } - }, - Err(e) => { - debug!("Problem {:?}", e); - break; - } - } - }, - } + info!("running"); + for handle in task_handles { + handle.join().unwrap(); } - pbar.finish(); + + // There will be no more results to be made + result_sender.send(None).unwrap(); + + // Join on the tasks + info!("collecting output"); + write_handler.join().unwrap(); + let writer = writer.lock().unwrap(); info!("genotype counts: {:#?}", writer.gtcounts); info!("finished"); } From f248ba33dea8a9eb02562466984bcb4a793b3c2a Mon Sep 17 00:00:00 2001 From: Adam English Date: Wed, 22 May 2024 12:18:09 -0500 Subject: [PATCH 07/47] dev version bump and documenting memory usage Now have a better guide for how much memory it takes --- Cargo.lock | 2 +- Cargo.toml | 2 +- README.md | 10 ++++++++++ src/kplib/vcfwriter.rs | 2 +- src/main.rs | 4 ++-- 5 files changed, 15 insertions(+), 5 deletions(-) diff --git a/Cargo.lock 
b/Cargo.lock index ccb314c..c7aac0b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -445,7 +445,7 @@ dependencies = [ [[package]] name = "kanpig" -version = "0.2.0" +version = "0.2.1-dev" dependencies = [ "bitflags", "clap", diff --git a/Cargo.toml b/Cargo.toml index 0d64edb..932fe88 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "kanpig" -version = "0.2.0" +version = "0.2.1-dev" edition = "2021" [dependencies] diff --git a/README.md b/README.md index b63554d..688b9dc 100644 --- a/README.md +++ b/README.md @@ -101,6 +101,16 @@ Details of `FT` | 0x16 | The number of reads supporting the alternate allele less than 5 (only present on non-ref variants) | | 0x32 | The best scoring path through the variant graph only used part of the haplotype. This may be indicative of a false-negative in the variant graph. | +# 🔌 Compute Resources + +Kanpig is highly parallelized and will fully utilize all threads it is given. However, hyperthreading doesn't seem to +help and therefore the number of threads should probably be limited to the number of physical processors available. + +For memory, a general rule is kanpig will need about 20x the size of the compressed `.vcf.gz`. The minimum required +memory is also dependent on the number of threads running as each will need space for its processing. For example, +a 1.6Gb vcf (~5 million SVs) using 16 cores needs at least 32Gb of RAM. That same vcf with 8 or 4 cores needs at least + 24Gb and 20Gb of RAM, respectively. + # 🔬 Experimental Parameter Details These parameters have a varying effect on the results and are not guaranteed to be stable across releases. 
diff --git a/src/kplib/vcfwriter.rs b/src/kplib/vcfwriter.rs index c58b93c..3d0edb1 100644 --- a/src/kplib/vcfwriter.rs +++ b/src/kplib/vcfwriter.rs @@ -125,7 +125,7 @@ impl VcfWriter { // Ready to make files let out_buf = BufWriter::with_capacity( - page_size::get() * 1000, + page_size::get() * 500, File::create(out_path).expect("Error Creating Output File"), ); let mut writer = vcf::Writer::new(out_buf); diff --git a/src/main.rs b/src/main.rs index 75290ff..44ba388 100644 --- a/src/main.rs +++ b/src/main.rs @@ -164,7 +164,7 @@ fn main() { // Send items to worker threads let mut num_chunks: u64 = 0; - info!("parsing input"); + info!("building variant graphs"); for i in &mut m_input { task_sender.send(Some(i)).unwrap(); num_chunks += 1; @@ -181,7 +181,7 @@ fn main() { task_sender.send(None).unwrap(); } - info!("running"); + info!("genotyping"); for handle in task_handles { handle.join().unwrap(); } From 60f6864f29348a35814dc376cb729941e153e073 Mon Sep 17 00:00:00 2001 From: Adam English Date: Wed, 22 May 2024 14:41:42 -0400 Subject: [PATCH 08/47] reimplementing progressbars lost it during the dedicated IO thread refactor. Also, it now counts variants written instead of number of chunks of variants. 
--- src/kplib/regions.rs | 2 +- src/kplib/vcfreader.rs | 4 ++-- src/main.rs | 44 ++++++++++++++++++++++++++++++++++++------ 3 files changed, 41 insertions(+), 9 deletions(-) diff --git a/src/kplib/regions.rs b/src/kplib/regions.rs index 84f771e..13bedd7 100644 --- a/src/kplib/regions.rs +++ b/src/kplib/regions.rs @@ -42,7 +42,7 @@ pub fn build_region_tree( .filter(|entry| m_contigs.contains_key(&entry.chrom)) { if entry.chrom != prev_chrom { - prev_chrom = entry.chrom.clone(); + prev_chrom.clone_from(&entry.chrom); prev_start = 0; } diff --git a/src/kplib/vcfreader.rs b/src/kplib/vcfreader.rs index c387874..e236b0b 100644 --- a/src/kplib/vcfreader.rs +++ b/src/kplib/vcfreader.rs @@ -171,8 +171,8 @@ impl Iterator for VcfChunker { Some(ret) } else { info!( - "{} chunks of {} variants", - self.chunk_count, self.call_count + "{} variants in {} chunks", + self.call_count, self.chunk_count ); info!("{} variants skipped", self.skip_count); None diff --git a/src/main.rs b/src/main.rs index 44ba388..8efbc1f 100644 --- a/src/main.rs +++ b/src/main.rs @@ -5,7 +5,7 @@ extern crate log; use clap::Parser; use crossbeam_channel::{unbounded, Receiver, Sender}; -//use indicatif::{ProgressBar, ProgressStyle}; +use indicatif::{ProgressBar, ProgressStyle}; use noodles_vcf::{self as vcf}; use std::sync::{Arc, Mutex}; use std::thread; @@ -136,29 +136,56 @@ fn main() { } } } - info!("I finished"); // This should give a result }) }) .collect(); + let num_variants = Arc::new(Mutex::new(0)); //Before we start the workers, we'll start the writer let cloned_writer = writer.clone(); + let cloned_num_variants = num_variants.clone(); let write_handler = std::thread::spawn(move || { - // When this goes out of scope (the thread finishes) - // So the Mutex might be my write_handler + let sty = ProgressStyle::with_template( + " [{elapsed_precise}] {bar:44.cyan/blue} > {pos} completed", + ) + .unwrap() + .progress_chars("##-"); + let mut pbar: Option = None; let mut phase_group: i32 = 0; + let mut 
completed_variants: u64 = 0; loop { match result_receiver.recv() { - Ok(None) | Err(_) => break, + Ok(None) | Err(_) => { + pbar.expect("I actually shouldn't be expecting the bar") + .finish(); + break; + } Ok(Some(result)) => { + let mut rsize: u64 = 0; let mut m_writer = cloned_writer.lock().unwrap(); for entry in result { m_writer.anno_write(entry, phase_group); + rsize += 1; + } + if let Some(ref mut bar) = pbar { + bar.inc(rsize); + } else { + completed_variants += rsize; } phase_group += 1; } } + // check if the reader is finished so we can setup the pbar + { + let mut value_guard = cloned_num_variants.lock().unwrap(); + if *value_guard != 0 { + let t_bar = ProgressBar::new(*value_guard).with_style(sty.clone()); + t_bar.inc(completed_variants); + pbar = Some(t_bar); + *value_guard = 0; + } + } } }); @@ -182,6 +209,12 @@ fn main() { } info!("genotyping"); + + // We now know how many variants will be parsed and can turn on the bar + { + let mut value_guard = num_variants.lock().unwrap(); + *value_guard = m_input.call_count + m_input.skip_count; + } for handle in task_handles { handle.join().unwrap(); } @@ -190,7 +223,6 @@ fn main() { result_sender.send(None).unwrap(); // Join on the tasks - info!("collecting output"); write_handler.join().unwrap(); let writer = writer.lock().unwrap(); info!("genotype counts: {:#?}", writer.gtcounts); From b18d3a46f60188748650d538836e253045d2cb67 Mon Sep 17 00:00:00 2001 From: Adam English Date: Wed, 22 May 2024 15:00:14 -0400 Subject: [PATCH 09/47] reducing mutex checks After the bar is setup, we no longer need to worry with it --- experiments/bedtest.sh | 6 +++--- src/main.rs | 17 +++++++---------- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/experiments/bedtest.sh b/experiments/bedtest.sh index 7a0c87d..b51862d 100644 --- a/experiments/bedtest.sh +++ b/experiments/bedtest.sh @@ -40,8 +40,8 @@ bench_full() { } create -bcftools sort -O z -o test_rs/hc.vcf.gz test_rs/hc.vcf -tabix test_rs/hc.vcf.gz +#bcftools 
sort -O z -o test_rs/hc.vcf.gz test_rs/hc.vcf +#tabix test_rs/hc.vcf.gz #bench_lite -bench_medium +#bench_medium #bench_full diff --git a/src/main.rs b/src/main.rs index 8efbc1f..d064a20 100644 --- a/src/main.rs +++ b/src/main.rs @@ -172,20 +172,17 @@ fn main() { bar.inc(rsize); } else { completed_variants += rsize; + // check if the reader is finished so we can setup the pbar + let value_guard = cloned_num_variants.lock().unwrap(); + if *value_guard != 0 { + let t_bar = ProgressBar::new(*value_guard).with_style(sty.clone()); + t_bar.inc(completed_variants); + pbar = Some(t_bar); + } } phase_group += 1; } } - // check if the reader is finished so we can setup the pbar - { - let mut value_guard = cloned_num_variants.lock().unwrap(); - if *value_guard != 0 { - let t_bar = ProgressBar::new(*value_guard).with_style(sty.clone()); - t_bar.inc(completed_variants); - pbar = Some(t_bar); - *value_guard = 0; - } - } } }); From 53b810cdebf93dc3737761efdc8c1dac99bee357 Mon Sep 17 00:00:00 2001 From: Adam English Date: Wed, 22 May 2024 15:38:47 -0400 Subject: [PATCH 10/47] minor clean --- src/main.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/main.rs b/src/main.rs index d064a20..70ee2a3 100644 --- a/src/main.rs +++ b/src/main.rs @@ -154,6 +154,7 @@ fn main() { let mut pbar: Option = None; let mut phase_group: i32 = 0; let mut completed_variants: u64 = 0; + let mut m_writer = cloned_writer.lock().unwrap(); loop { match result_receiver.recv() { Ok(None) | Err(_) => { @@ -163,7 +164,6 @@ fn main() { } Ok(Some(result)) => { let mut rsize: u64 = 0; - let mut m_writer = cloned_writer.lock().unwrap(); for entry in result { m_writer.anno_write(entry, phase_group); rsize += 1; @@ -196,7 +196,6 @@ fn main() { if num_chunks == 0 { error!("No variants to be analyzed"); - // This might need to still join. 
std::process::exit(1); } @@ -206,7 +205,6 @@ fn main() { } info!("genotyping"); - // We now know how many variants will be parsed and can turn on the bar { let mut value_guard = num_variants.lock().unwrap(); @@ -216,7 +214,7 @@ fn main() { handle.join().unwrap(); } - // There will be no more results to be made + // There will be no more results made result_sender.send(None).unwrap(); // Join on the tasks From 8ebe920160e3253148d317d7ddc3aa28092fe3e4 Mon Sep 17 00:00:00 2001 From: Adam English Date: Wed, 22 May 2024 15:56:04 -0400 Subject: [PATCH 11/47] code clean --- src/main.rs | 70 ++++++++++++++++++++--------------------------------- 1 file changed, 26 insertions(+), 44 deletions(-) diff --git a/src/main.rs b/src/main.rs index 70ee2a3..63a6991 100644 --- a/src/main.rs +++ b/src/main.rs @@ -17,8 +17,8 @@ use kplib::{ PathScore, Ploidy, PloidyRegions, Variants, VcfChunker, VcfWriter, }; -type InputType = Vec; -type OutputType = Vec; +type InputType = Option>; +type OutputType = Option>; fn main() { let args = ArgParser::parse(); @@ -48,47 +48,15 @@ fn main() { let ploidy = PloidyRegions::new(&args.io.ploidy_bed); - let (result_sender, result_receiver): ( - Sender>, - Receiver>, - ) = unbounded(); - /* - * Each Thread is given a result_sender - * They don't need to do anything different except - * But spawning them I do need to collect JoinHandlers - * The VcfWriter needs to start first - * Needs to be joinable - * Then VcfChunker is run, sending things to the Threads - * We then join the threads - * We then send a None to VcfWriter so it knows when to stop - * We then join the VcfWriter - * We are then finished - */ - // This needs a channel for results - // These results may be genotype annos or blanks. 
- // But I have to make them a single type, so I'll send filtered as genotype anno from - // take_annotated(&[], 0, Ploidy::Zero) - let writer = Arc::new(Mutex::new(VcfWriter::new( - &args.io.out, - input_header.clone(), - &args.io.sample, - ))); - - // This blanks work will need to be done by the chunker. So the chunker doesn't get - // the writer but the channel and it won't write_entry it will write_anno - // We send the writer to the reader so that we can pipe filtered variants forward - let mut m_input = VcfChunker::new( - input_vcf, - input_header.clone(), - tree, - args.kd.clone(), - result_sender.clone(), - ); - // Create channels for communication between threads - let (task_sender, task_receiver): (Sender>, Receiver>) = + let (task_sender, task_receiver): (Sender, Receiver) = unbounded(); + let (result_sender, result_receiver): ( + Sender, + Receiver, + ) = unbounded(); + info!("spawning {} threads", args.io.threads); let task_handles: Vec> = (0..args.io.threads) .map(|_| { @@ -141,9 +109,14 @@ fn main() { }) .collect(); - let num_variants = Arc::new(Mutex::new(0)); //Before we start the workers, we'll start the writer + let writer = Arc::new(Mutex::new(VcfWriter::new( + &args.io.out, + input_header.clone(), + &args.io.sample, + ))); let cloned_writer = writer.clone(); + let num_variants = Arc::new(Mutex::new(0)); let cloned_num_variants = num_variants.clone(); let write_handler = std::thread::spawn(move || { let sty = ProgressStyle::with_template( @@ -186,9 +159,17 @@ fn main() { } }); + info!("building variant graphs"); + let mut m_input = VcfChunker::new( + input_vcf, + input_header.clone(), + tree, + args.kd.clone(), + result_sender.clone(), + ); + // Send items to worker threads let mut num_chunks: u64 = 0; - info!("building variant graphs"); for i in &mut m_input { task_sender.send(Some(i)).unwrap(); num_chunks += 1; @@ -204,12 +185,13 @@ fn main() { task_sender.send(None).unwrap(); } - info!("genotyping"); // We now know how many variants will be 
parsed and can turn on the bar { let mut value_guard = num_variants.lock().unwrap(); *value_guard = m_input.call_count + m_input.skip_count; } + + info!("genotyping"); for handle in task_handles { handle.join().unwrap(); } @@ -217,7 +199,7 @@ fn main() { // There will be no more results made result_sender.send(None).unwrap(); - // Join on the tasks + // Wait on the writer write_handler.join().unwrap(); let writer = writer.lock().unwrap(); info!("genotype counts: {:#?}", writer.gtcounts); From 67fcb86016434eed88043844079db9c7f0769314 Mon Sep 17 00:00:00 2001 From: Adam English Date: Wed, 22 May 2024 19:06:57 -0400 Subject: [PATCH 12/47] code clean --- src/main.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/main.rs b/src/main.rs index 63a6991..138fffa 100644 --- a/src/main.rs +++ b/src/main.rs @@ -56,14 +56,14 @@ fn main() { Receiver, ) = unbounded(); - info!("spawning {} threads", args.io.threads); let task_handles: Vec> = (0..args.io.threads) .map(|_| { let m_args = args.clone(); let m_receiver = task_receiver.clone(); - let result_sender = result_sender.clone(); + let m_result_sender = result_sender.clone(); let m_ploidy = ploidy.clone(); + thread::spawn(move || { let mut m_bam = BamParser::new(m_args.io.bam, m_args.io.reference, m_args.kd.clone()); @@ -77,7 +77,7 @@ fn main() { let ploidy = m_ploidy.get_ploidy(&m_graph.chrom, m_graph.start); // For zero, we don't have to waste time going into the bam if ploidy == Ploidy::Zero { - result_sender + m_result_sender .send(Some(m_graph.take_annotated(&[], 0, &ploidy))) .unwrap(); continue; @@ -98,7 +98,7 @@ fn main() { .map(|h| m_graph.apply_coverage(h, &m_args.kd)) .collect(); - result_sender + m_result_sender .send(Some(m_graph.take_annotated(&paths, coverage, &ploidy))) .unwrap(); } @@ -115,9 +115,11 @@ fn main() { input_header.clone(), &args.io.sample, ))); + let cloned_writer = writer.clone(); let num_variants = Arc::new(Mutex::new(0)); let cloned_num_variants = 
num_variants.clone(); + let write_handler = std::thread::spawn(move || { let sty = ProgressStyle::with_template( " [{elapsed_precise}] {bar:44.cyan/blue} > {pos} completed", @@ -128,6 +130,7 @@ fn main() { let mut phase_group: i32 = 0; let mut completed_variants: u64 = 0; let mut m_writer = cloned_writer.lock().unwrap(); + loop { match result_receiver.recv() { Ok(None) | Err(_) => { @@ -141,6 +144,7 @@ fn main() { m_writer.anno_write(entry, phase_group); rsize += 1; } + if let Some(ref mut bar) = pbar { bar.inc(rsize); } else { From 3409abbdbef3c167aee85693c2a5c965f2a13eb6 Mon Sep 17 00:00:00 2001 From: Adam English Date: Wed, 22 May 2024 20:33:43 -0400 Subject: [PATCH 13/47] new --mem Attempts to throttle vcf reading to limit maximum memory usage --- Cargo.lock | 7 +++++++ Cargo.toml | 1 + experiments/bedtest.sh | 2 +- src/kplib/cli.rs | 11 +++++++++++ src/kplib/vcfreader.rs | 2 +- src/main.rs | 42 ++++++++++++++++++++++++++++++++---------- 6 files changed, 53 insertions(+), 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c7aac0b..99449cf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -458,6 +458,7 @@ dependencies = [ "noodles-vcf", "ordered-float", "page_size", + "peak_alloc", "petgraph", "pretty_env_logger", "rust-htslib", @@ -655,6 +656,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "peak_alloc" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29c4e8e2dd832fd76346468f822e4e600d30ba4e5aa545a128abf12cfae7ea3e" + [[package]] name = "percent-encoding" version = "2.3.1" diff --git a/Cargo.toml b/Cargo.toml index 932fe88..88b3069 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,6 +15,7 @@ log = { version = "0.4", features = ["std", "serde"] } noodles-vcf = { version = "0.49.0" } ordered-float = { version = "4.0", default-features = false } page_size = "0.6.0" +peak_alloc = "0.2.1" petgraph = { version = "0.6.2" } pretty_env_logger = { version = "0.4.0" } rust-htslib = { version = "0.46.0" } diff --git 
a/experiments/bedtest.sh b/experiments/bedtest.sh index b51862d..b0d9c73 100644 --- a/experiments/bedtest.sh +++ b/experiments/bedtest.sh @@ -12,7 +12,7 @@ create() { --sizesim 0.95 --seqsim 0.90 --threads 4 \ --maxpaths 20000 --mapq 5 --hapsim 0.98 \ --chunksize 100 --maxhom 5 \ - --sample doesthiswork \ + --sample doesthiswork --mem 1 \ -o test_rs/hc.vcf --bed $bed # --bed /Users/english/code/kanpig/test/GRCh38_HG002-T2TQ100-V1.0_stvar.benchmark.bed \ # --bam /Users/english/code/kanpig/experiments/test_rs/GIABHG002.bam \ diff --git a/src/kplib/cli.rs b/src/kplib/cli.rs index 7b3a40c..51d4311 100644 --- a/src/kplib/cli.rs +++ b/src/kplib/cli.rs @@ -46,6 +46,10 @@ pub struct IOParams { #[arg(long, default_value_t = 1)] pub threads: usize, + /// Maximum memory (in GB) - should be at least `2 * threads` + #[arg(long, default_value_t = 16.0)] + pub mem: f32, + /// Verbose logging #[arg(long, default_value_t = false)] pub debug: bool, @@ -165,6 +169,13 @@ impl ArgParser { warn!("--kmer above 8 becomes memory intensive"); } + if self.io.mem < (self.io.threads as f32 * 2.0) { + warn!( + "{} GB of memory may be too little for {} threads", + self.io.mem, self.io.threads + ); + } + is_ok } } diff --git a/src/kplib/vcfreader.rs b/src/kplib/vcfreader.rs index e236b0b..d485ce7 100644 --- a/src/kplib/vcfreader.rs +++ b/src/kplib/vcfreader.rs @@ -19,7 +19,7 @@ pub struct VcfChunker { // fits in the current chunk. 
We need to hold on to it for the // next chunk hold_entry: Option, - chunk_count: u64, + pub chunk_count: u64, pub call_count: u64, pub skip_count: u64, result_sender: Sender>>, diff --git a/src/main.rs b/src/main.rs index 138fffa..79b8768 100644 --- a/src/main.rs +++ b/src/main.rs @@ -7,9 +7,11 @@ use clap::Parser; use crossbeam_channel::{unbounded, Receiver, Sender}; use indicatif::{ProgressBar, ProgressStyle}; use noodles_vcf::{self as vcf}; +use peak_alloc::PeakAlloc; use std::sync::{Arc, Mutex}; use std::thread; use std::thread::JoinHandle; + mod kplib; use kplib::{ @@ -20,6 +22,9 @@ use kplib::{ type InputType = Option>; type OutputType = Option>; +#[global_allocator] +static PEAK_ALLOC: PeakAlloc = PeakAlloc; + fn main() { let args = ArgParser::parse(); let level = if args.io.debug { @@ -32,11 +37,11 @@ fn main() { .init(); info!("starting"); + info!("params: {:#?}", args); if !args.validate() { error!("please fix arguments"); std::process::exit(1); } - info!("params: {:#?}", args); let mut input_vcf = vcf::reader::Builder::default() .build_from_path(args.io.input.clone()) @@ -49,12 +54,8 @@ fn main() { let ploidy = PloidyRegions::new(&args.io.ploidy_bed); // Create channels for communication between threads - let (task_sender, task_receiver): (Sender, Receiver) = - unbounded(); - let (result_sender, result_receiver): ( - Sender, - Receiver, - ) = unbounded(); + let (task_sender, task_receiver): (Sender, Receiver) = unbounded(); + let (result_sender, result_receiver): (Sender, Receiver) = unbounded(); info!("spawning {} threads", args.io.threads); let task_handles: Vec> = (0..args.io.threads) @@ -173,13 +174,34 @@ fn main() { ); // Send items to worker threads - let mut num_chunks: u64 = 0; + let mut hang_tracker: i64 = 0; for i in &mut m_input { task_sender.send(Some(i)).unwrap(); - num_chunks += 1; + + // The reader can get way ahead of the tasks, so we monitor memory usage + // and let threads catch up + let mut num_waits = 0; + while 
PEAK_ALLOC.current_usage_as_gb() >= args.io.mem && num_waits < 10 { + std::thread::sleep(std::time::Duration::from_millis(100)); + warn!( + "throttling vcf reading with memory @ {}", + PEAK_ALLOC.current_usage_as_gb() + ); + num_waits += 1; + } + if num_waits != 0 { + hang_tracker += 1; + } else { + hang_tracker -= 1; + } + if hang_tracker >= 10 { + warn!("memory seems pretty full, consider setting a higher --mem"); + hang_tracker = 0; + } + hang_tracker = hang_tracker.max(-10); } - if num_chunks == 0 { + if m_input.chunk_count == 0 { error!("No variants to be analyzed"); std::process::exit(1); } From 0635d722e8ddcdb48ce914c25c481fe4c3d8c4a6 Mon Sep 17 00:00:00 2001 From: Adam English Date: Wed, 22 May 2024 20:25:58 -0500 Subject: [PATCH 14/47] tweaking memory limiting but it doesn't seem to be working, so I'm putting this in a feature and will revisit later --- src/main.rs | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/main.rs b/src/main.rs index 79b8768..51308eb 100644 --- a/src/main.rs +++ b/src/main.rs @@ -175,18 +175,23 @@ fn main() { // Send items to worker threads let mut hang_tracker: i64 = 0; + // PEAK_ALLOC seems to be off by about 2x on memory usage + let alcor = 2.2; for i in &mut m_input { task_sender.send(Some(i)).unwrap(); // The reader can get way ahead of the tasks, so we monitor memory usage // and let threads catch up let mut num_waits = 0; - while PEAK_ALLOC.current_usage_as_gb() >= args.io.mem && num_waits < 10 { - std::thread::sleep(std::time::Duration::from_millis(100)); - warn!( - "throttling vcf reading with memory @ {}", - PEAK_ALLOC.current_usage_as_gb() - ); + while (PEAK_ALLOC.current_usage_as_gb() * alcor) >= args.io.mem && num_waits < 10 { + let sleep_time = 100i64.max(hang_tracker * 100); + std::thread::sleep(std::time::Duration::from_millis(sleep_time as u64)); + if num_waits >= 5 { + warn!( + "throttling vcf reading with memory @ {}", + PEAK_ALLOC.current_usage_as_gb() * alcor + ); + } 
num_waits += 1; } if num_waits != 0 { From 701f91eccf97de03f7f86b94630b4cdd34ae35db Mon Sep 17 00:00:00 2001 From: Adam English Date: Wed, 22 May 2024 21:27:55 -0400 Subject: [PATCH 15/47] moving mem to feature measuring memory usage cross platform is difficult --- Cargo.toml | 1 - src/kplib/cli.rs | 11 ----------- src/main.rs | 32 -------------------------------- 3 files changed, 44 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 88b3069..932fe88 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,7 +15,6 @@ log = { version = "0.4", features = ["std", "serde"] } noodles-vcf = { version = "0.49.0" } ordered-float = { version = "4.0", default-features = false } page_size = "0.6.0" -peak_alloc = "0.2.1" petgraph = { version = "0.6.2" } pretty_env_logger = { version = "0.4.0" } rust-htslib = { version = "0.46.0" } diff --git a/src/kplib/cli.rs b/src/kplib/cli.rs index 51d4311..7b3a40c 100644 --- a/src/kplib/cli.rs +++ b/src/kplib/cli.rs @@ -46,10 +46,6 @@ pub struct IOParams { #[arg(long, default_value_t = 1)] pub threads: usize, - /// Maximum memory (in GB) - should be at least `2 * threads` - #[arg(long, default_value_t = 16.0)] - pub mem: f32, - /// Verbose logging #[arg(long, default_value_t = false)] pub debug: bool, @@ -169,13 +165,6 @@ impl ArgParser { warn!("--kmer above 8 becomes memory intensive"); } - if self.io.mem < (self.io.threads as f32 * 2.0) { - warn!( - "{} GB of memory may be too little for {} threads", - self.io.mem, self.io.threads - ); - } - is_ok } } diff --git a/src/main.rs b/src/main.rs index 51308eb..acbc734 100644 --- a/src/main.rs +++ b/src/main.rs @@ -7,7 +7,6 @@ use clap::Parser; use crossbeam_channel::{unbounded, Receiver, Sender}; use indicatif::{ProgressBar, ProgressStyle}; use noodles_vcf::{self as vcf}; -use peak_alloc::PeakAlloc; use std::sync::{Arc, Mutex}; use std::thread; use std::thread::JoinHandle; @@ -22,9 +21,6 @@ use kplib::{ type InputType = Option>; type OutputType = Option>; -#[global_allocator] -static PEAK_ALLOC: 
PeakAlloc = PeakAlloc; - fn main() { let args = ArgParser::parse(); let level = if args.io.debug { @@ -174,36 +170,8 @@ fn main() { ); // Send items to worker threads - let mut hang_tracker: i64 = 0; - // PEAK_ALLOC seems to be off by about 2x on memory usage - let alcor = 2.2; for i in &mut m_input { task_sender.send(Some(i)).unwrap(); - - // The reader can get way ahead of the tasks, so we monitor memory usage - // and let threads catch up - let mut num_waits = 0; - while (PEAK_ALLOC.current_usage_as_gb() * alcor) >= args.io.mem && num_waits < 10 { - let sleep_time = 100i64.max(hang_tracker * 100); - std::thread::sleep(std::time::Duration::from_millis(sleep_time as u64)); - if num_waits >= 5 { - warn!( - "throttling vcf reading with memory @ {}", - PEAK_ALLOC.current_usage_as_gb() * alcor - ); - } - num_waits += 1; - } - if num_waits != 0 { - hang_tracker += 1; - } else { - hang_tracker -= 1; - } - if hang_tracker >= 10 { - warn!("memory seems pretty full, consider setting a higher --mem"); - hang_tracker = 0; - } - hang_tracker = hang_tracker.max(-10); } if m_input.chunk_count == 0 { From 377e98e45ce57f701bed900f02b5502987a64598 Mon Sep 17 00:00:00 2001 From: Adam English Date: Thu, 23 May 2024 09:40:49 -0400 Subject: [PATCH 16/47] cleaning removing /reducing jupyter notebooks --- experiments/Analysis.ipynb | 301 ------- experiments/Development.ipynb | 1584 --------------------------------- 2 files changed, 1885 deletions(-) delete mode 100644 experiments/Analysis.ipynb diff --git a/experiments/Analysis.ipynb b/experiments/Analysis.ipynb deleted file mode 100644 index bf6b9d5..0000000 --- a/experiments/Analysis.ipynb +++ /dev/null @@ -1,301 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 154, - "id": "7db0c486-584d-4193-81ca-ff96212f8b8b", - "metadata": {}, - "outputs": [], - "source": [ - "import networkx as nx\n", - "\n", - "G = nx.DiGraph()\n", - "G.add_nodes_from([\n", - " ('src', {'size':0}),\n", - " ('A', {'size': 5}),\n", - " ('B', 
{'size': 10}),\n", - " ('C', {'size': 7}),\n", - " ('D', {'size': 8}),\n", - " ('E', {'size': 3}),\n", - " ('F', {'size': 20}),\n", - " ('G', {'size': -4}),\n", - " ('snk', {'size': 0}),\n", - "])\n", - "\n", - "for i in \"ABCDEFG\":\n", - " G.add_edge('src', i)\n", - " G.add_edge(i, 'snk')\n", - "# Add directed edges\n", - "G.add_edges_from([\n", - " ('A', 'B'),\n", - " ('A', 'C'),\n", - " ('A', 'D'),\n", - " ('A', 'E'),\n", - " ('A', 'F'),\n", - " ('A', 'G'),\n", - " ('B', 'C'),\n", - " ('B', 'D'),\n", - " ('B', 'E'),\n", - " ('B', 'F'),\n", - " ('B', 'G'),\n", - " ('C', 'D'),\n", - " ('C', 'E'),\n", - " ('C', 'F'),\n", - " ('C', 'G'),\n", - " ('D', 'E'),\n", - " ('D', 'F'),\n", - " ('D', 'G'),\n", - " ('E', 'F'),\n", - " ('E', 'G'),\n", - " ('F', 'G'),\n", - "])" - ] - }, - { - "cell_type": "code", - "execution_count": 163, - "id": "fa3986aa-8553-4fe2-9e28-559d41713c7e", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[1, 2, 4, 5] [2, 3, 5]\n", - "[1, 3, 4, 5] [2, 3, 5]\n", - "Closest path: [0, 2, 3, 4]\n", - "Path sizes: [1, 3, 4, 5]\n" - ] - } - ], - "source": [ - "\n", - "\n", - "def min_diff(a, b):\n", - " dp = [0] * (len(b) + 1)\n", - " for i in range(1, len(b) + 1):\n", - " dp[i] = float('inf')\n", - " for j in range(i):\n", - " dp[i] = min(dp[i], dp[j] + abs(a[j] - b[i - 1]))\n", - " print(a, b)\n", - " return dp[-1]\n", - "\n", - "def path_sum(graph, path):\n", - " return sum(graph.nodes[node]['size'] for node in path)\n", - "\n", - "def closest_path(graph, weights, start, end):\n", - " closest_path = None\n", - " min_diff_val = float('inf')\n", - "\n", - " for path in nx.all_simple_paths(graph, source=start, target=end):\n", - " path_weights = [graph.nodes[node]['size'] for node in path]\n", - " diff = min_diff(path_weights, weights)\n", - " if diff < min_diff_val:\n", - " min_diff_val = diff\n", - " closest_path = path\n", - "\n", - " return closest_path\n", - "\n", - "if __name__ == \"__main__\":\n", 
- " graph = nx.DiGraph()\n", - " graph.add_nodes_from([(0, {'size': 1}), (1, {'size': 2}), (2, {'size': 3}),\n", - " (3, {'size': 4}), (4, {'size': 5})])\n", - " graph.add_edges_from([(0, 1), (0, 2), (1, 3), (2, 3), (3, 4)])\n", - "\n", - " weights = [2, 3, 5]\n", - " start = 0\n", - " end = 4\n", - "\n", - " path = closest_path(graph, weights, start, end)\n", - " if path:\n", - " path_weights = [graph.nodes[node]['size'] for node in path]\n", - " print(\"Closest path:\", path)\n", - " print(\"Path sizes:\", path_weights)\n", - " else:\n", - " print(\"No path found\")" - ] - }, - { - "cell_type": "code", - "execution_count": 164, - "id": "ea8e6771-96db-4d84-b759-b97287d189b3", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[0, 5, 0] [15, 5]\n", - "[0, 5, 10, 0] [15, 5]\n", - "[0, 5, 10, 7, 0] [15, 5]\n", - "[0, 5, 10, 7, 8, 0] [15, 5]\n", - "[0, 5, 10, 7, 8, 3, 0] [15, 5]\n", - "[0, 5, 10, 7, 8, 3, 20, 0] [15, 5]\n", - "[0, 5, 10, 7, 8, 3, 20, -4, 0] [15, 5]\n", - "[0, 5, 10, 7, 8, 3, -4, 0] [15, 5]\n", - "[0, 5, 10, 7, 8, 20, 0] [15, 5]\n", - "[0, 5, 10, 7, 8, 20, -4, 0] [15, 5]\n", - "[0, 5, 10, 7, 8, -4, 0] [15, 5]\n", - "[0, 5, 10, 7, 3, 0] [15, 5]\n", - "[0, 5, 10, 7, 3, 20, 0] [15, 5]\n", - "[0, 5, 10, 7, 3, 20, -4, 0] [15, 5]\n", - "[0, 5, 10, 7, 3, -4, 0] [15, 5]\n", - "[0, 5, 10, 7, 20, 0] [15, 5]\n", - "[0, 5, 10, 7, 20, -4, 0] [15, 5]\n", - "[0, 5, 10, 7, -4, 0] [15, 5]\n", - "[0, 5, 10, 8, 0] [15, 5]\n", - "[0, 5, 10, 8, 3, 0] [15, 5]\n", - "[0, 5, 10, 8, 3, 20, 0] [15, 5]\n", - "[0, 5, 10, 8, 3, 20, -4, 0] [15, 5]\n", - "[0, 5, 10, 8, 3, -4, 0] [15, 5]\n", - "[0, 5, 10, 8, 20, 0] [15, 5]\n", - "[0, 5, 10, 8, 20, -4, 0] [15, 5]\n", - "[0, 5, 10, 8, -4, 0] [15, 5]\n", - "[0, 5, 10, 3, 0] [15, 5]\n", - "[0, 5, 10, 3, 20, 0] [15, 5]\n", - "[0, 5, 10, 3, 20, -4, 0] [15, 5]\n", - "[0, 5, 10, 3, -4, 0] [15, 5]\n", - "[0, 5, 10, 20, 0] [15, 5]\n", - "[0, 5, 10, 20, -4, 0] [15, 5]\n", - "[0, 5, 10, -4, 0] [15, 
5]\n", - "[0, 5, 7, 0] [15, 5]\n", - "[0, 5, 7, 8, 0] [15, 5]\n", - "[0, 5, 7, 8, 3, 0] [15, 5]\n", - "[0, 5, 7, 8, 3, 20, 0] [15, 5]\n", - "[0, 5, 7, 8, 3, 20, -4, 0] [15, 5]\n", - "[0, 5, 7, 8, 3, -4, 0] [15, 5]\n", - "[0, 5, 7, 8, 20, 0] [15, 5]\n", - "[0, 5, 7, 8, 20, -4, 0] [15, 5]\n", - "[0, 5, 7, 8, -4, 0] [15, 5]\n", - "[0, 5, 7, 3, 0] [15, 5]\n", - "[0, 5, 7, 3, 20, 0] [15, 5]\n", - "[0, 5, 7, 3, 20, -4, 0] [15, 5]\n", - "[0, 5, 7, 3, -4, 0] [15, 5]\n", - "[0, 5, 7, 20, 0] [15, 5]\n", - "[0, 5, 7, 20, -4, 0] [15, 5]\n", - "[0, 5, 7, -4, 0] [15, 5]\n", - "[0, 5, 8, 0] [15, 5]\n", - "[0, 5, 8, 3, 0] [15, 5]\n", - "[0, 5, 8, 3, 20, 0] [15, 5]\n", - "[0, 5, 8, 3, 20, -4, 0] [15, 5]\n", - "[0, 5, 8, 3, -4, 0] [15, 5]\n", - "[0, 5, 8, 20, 0] [15, 5]\n", - "[0, 5, 8, 20, -4, 0] [15, 5]\n", - "[0, 5, 8, -4, 0] [15, 5]\n", - "[0, 5, 3, 0] [15, 5]\n", - "[0, 5, 3, 20, 0] [15, 5]\n", - "[0, 5, 3, 20, -4, 0] [15, 5]\n", - "[0, 5, 3, -4, 0] [15, 5]\n", - "[0, 5, 20, 0] [15, 5]\n", - "[0, 5, 20, -4, 0] [15, 5]\n", - "[0, 5, -4, 0] [15, 5]\n", - "[0, 10, 0] [15, 5]\n", - "[0, 10, 7, 0] [15, 5]\n", - "[0, 10, 7, 8, 0] [15, 5]\n", - "[0, 10, 7, 8, 3, 0] [15, 5]\n", - "[0, 10, 7, 8, 3, 20, 0] [15, 5]\n", - "[0, 10, 7, 8, 3, 20, -4, 0] [15, 5]\n", - "[0, 10, 7, 8, 3, -4, 0] [15, 5]\n", - "[0, 10, 7, 8, 20, 0] [15, 5]\n", - "[0, 10, 7, 8, 20, -4, 0] [15, 5]\n", - "[0, 10, 7, 8, -4, 0] [15, 5]\n", - "[0, 10, 7, 3, 0] [15, 5]\n", - "[0, 10, 7, 3, 20, 0] [15, 5]\n", - "[0, 10, 7, 3, 20, -4, 0] [15, 5]\n", - "[0, 10, 7, 3, -4, 0] [15, 5]\n", - "[0, 10, 7, 20, 0] [15, 5]\n", - "[0, 10, 7, 20, -4, 0] [15, 5]\n", - "[0, 10, 7, -4, 0] [15, 5]\n", - "[0, 10, 8, 0] [15, 5]\n", - "[0, 10, 8, 3, 0] [15, 5]\n", - "[0, 10, 8, 3, 20, 0] [15, 5]\n", - "[0, 10, 8, 3, 20, -4, 0] [15, 5]\n", - "[0, 10, 8, 3, -4, 0] [15, 5]\n", - "[0, 10, 8, 20, 0] [15, 5]\n", - "[0, 10, 8, 20, -4, 0] [15, 5]\n", - "[0, 10, 8, -4, 0] [15, 5]\n", - "[0, 10, 3, 0] [15, 5]\n", - "[0, 10, 3, 20, 0] [15, 5]\n", - 
"[0, 10, 3, 20, -4, 0] [15, 5]\n", - "[0, 10, 3, -4, 0] [15, 5]\n", - "[0, 10, 20, 0] [15, 5]\n", - "[0, 10, 20, -4, 0] [15, 5]\n", - "[0, 10, -4, 0] [15, 5]\n", - "[0, 7, 0] [15, 5]\n", - "[0, 7, 8, 0] [15, 5]\n", - "[0, 7, 8, 3, 0] [15, 5]\n", - "[0, 7, 8, 3, 20, 0] [15, 5]\n", - "[0, 7, 8, 3, 20, -4, 0] [15, 5]\n", - "[0, 7, 8, 3, -4, 0] [15, 5]\n", - "[0, 7, 8, 20, 0] [15, 5]\n", - "[0, 7, 8, 20, -4, 0] [15, 5]\n", - "[0, 7, 8, -4, 0] [15, 5]\n", - "[0, 7, 3, 0] [15, 5]\n", - "[0, 7, 3, 20, 0] [15, 5]\n", - "[0, 7, 3, 20, -4, 0] [15, 5]\n", - "[0, 7, 3, -4, 0] [15, 5]\n", - "[0, 7, 20, 0] [15, 5]\n", - "[0, 7, 20, -4, 0] [15, 5]\n", - "[0, 7, -4, 0] [15, 5]\n", - "[0, 8, 0] [15, 5]\n", - "[0, 8, 3, 0] [15, 5]\n", - "[0, 8, 3, 20, 0] [15, 5]\n", - "[0, 8, 3, 20, -4, 0] [15, 5]\n", - "[0, 8, 3, -4, 0] [15, 5]\n", - "[0, 8, 20, 0] [15, 5]\n", - "[0, 8, 20, -4, 0] [15, 5]\n", - "[0, 8, -4, 0] [15, 5]\n", - "[0, 3, 0] [15, 5]\n", - "[0, 3, 20, 0] [15, 5]\n", - "[0, 3, 20, -4, 0] [15, 5]\n", - "[0, 3, -4, 0] [15, 5]\n", - "[0, 20, 0] [15, 5]\n", - "[0, 20, -4, 0] [15, 5]\n", - "[0, -4, 0] [15, 5]\n", - "Closest path: ['src', 'A', 'snk']\n", - "Path weights: [0, 5, 0]\n" - ] - } - ], - "source": [ - "weights = [15,5]\n", - "path = closest_path(G, weights, 'src', 'snk')\n", - "if path:\n", - " path_weights = [G.nodes[node]['size'] for node in path]\n", - " print(\"Closest path:\", path)\n", - " print(\"Path weights:\", path_weights)\n", - "else:\n", - " print(\"No path found\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ab2cb934-57f9-42a4-bcd5-2c47f124c31c", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - 
"pygments_lexer": "ipython3", - "version": "3.9.6" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/experiments/Development.ipynb b/experiments/Development.ipynb index ad3e139..d3c4e11 100644 --- a/experiments/Development.ipynb +++ b/experiments/Development.ipynb @@ -159,1590 +159,6 @@ "p = sb.scatterplot(data=data, x='unroll', y='pj', hue='bin_len')\n", "p.set(title=\"Kmer estimate of unroll sequence similarity\", xlabel=\"Unroll Sequence Similarity\", ylabel=\"Kmer Count Similarity\")" ] - }, - { - "cell_type": "code", - "execution_count": 134, - "id": "4e584018-0fe7-495f-8a1d-83cb1d1a52e3", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/english/py/lib/python3.9/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.\n", - " grouped_vals = vals.groupby(grouper)\n" - ] - }, - { - "data": { - "text/plain": [ - "[Text(0.5, 1.0, 'Cosine Sim by Sequence Similarity'), Text(0.5, 0, 'pj')]" - ] - }, - "execution_count": 134, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "p = sb.boxplot(data=data, x=\"bin_sim\", y=\"pj\") \n", - "p.set(title=\"Cosine Sim by Sequence Similarity\", xlabel=\"pj\")" - ] - }, - { - "cell_type": "code", - "execution_count": 99, - "id": "9a67123a-d155-4ce9-9e40-4e1fe0879a30", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/english/py/lib/python3.9/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.\n", - " grouped_vals = vals.groupby(grouper)\n" - ] - }, - { - "data": { - "text/plain": [ - "[Text(0.5, 1.0, 'Cosine Sim by Sequence Similarity (≥90% size sim)'),\n", - " Text(0.5, 0, 'seqsim')]" - ] - }, - "execution_count": 99, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "p = sb.boxplot(data=data[data['szsim'] >= 0.90], x=\"bin_sim\", y=\"cossim\") \n", - "p.set(title=\"Cosine Sim by Sequence Similarity (≥90% size sim)\", xlabel=\"seqsim\")" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "5ebf2ba9-b56c-41d6-9779-41b435130a5d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "8" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(list(nx.all_simple_paths(G, 'src', 'D')))" - ] - }, - { - "cell_type": "code", - "execution_count": 114, - "id": "1efe432f-6ff2-4403-a42e-a73bb50ad936", - "metadata": {}, - "outputs": [], - "source": [ - "import networkx as nx\n", - "import heapq\n", - "import random\n", - "\n", - "# Create a directed graph with node sizes\n", - "G = nx.DiGraph()\n", - "G.add_nodes_from([\n", - " ('src', {'size':0}),\n", - " ('A', {'size': 5}),\n", - " ('B', {'size': 10}),\n", - " ('C', {'size': 7}),\n", - " ('D', {'size': 8}),\n", - " ('E', {'size': 3}),\n", - " ('F', {'size': 20}),\n", - " ('G', {'size': -4}),\n", - "])\n", - "\n", - "for i in \"ABCDEFG\":\n", - " G.add_edge('src', i)\n", - " G.add_edge(i, 'snk')\n", - "# Add directed edges\n", - "G.add_edges_from([\n", - " ('A', 'B'),\n", - " ('A', 'C'),\n", - " ('A', 'D'),\n", - " ('A', 'E'),\n", - " ('A', 'F'),\n", - " ('A', 'G'),\n", - " ('B', 'C'),\n", - " ('B', 'D'),\n", - " ('B', 'E'),\n", - " ('B', 'F'),\n", - " ('B', 'G'),\n", - " ('C', 'D'),\n", - " ('C', 'E'),\n", - " ('C', 'F'),\n", - " ('C', 'G'),\n", - " ('D', 'E'),\n", - " ('D', 'F'),\n", - " ('D', 'G'),\n", - " ('E', 'F'),\n", - " ('E', 'G'),\n", - " ('F', 'G'),\n", - "])" - ] - }, - { - "cell_type": "code", - "execution_count": 111, - "id": "af556f07-f3df-4780-877d-0e3d4ce670fa", - "metadata": {}, - "outputs": [], - "source": [ - "def remove_edges_not_passing_through(graph, source, sink, kept_nodes):\n", - " visited 
= set()\n", - "\n", - " def dfs(node):\n", - " visited.add(node)\n", - " for predecessor in graph.predecessors(node):\n", - " if predecessor not in visited:\n", - " dfs(predecessor)\n", - "\n", - " dfs(sink)\n", - "\n", - " edges_to_remove = [(predecessor, node) for node in visited for predecessor in graph.predecessors(node) if node not in kept_nodes]\n", - " graph.remove_edges_from(edges_to_remove)" - ] - }, - { - "cell_type": "code", - "execution_count": 113, - "id": "d84299da-a980-4d1a-bd12-8d57bbdd4e96", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "nattempts 128\n", - "Path 1: ['src', 'A', 'C', 'D'], Total Size: 20, Size Difference from Target: 0\n", - "Path 2: ['src', 'B', 'C', 'E'], Total Size: 20, Size Difference from Target: 0\n", - "Path 3: ['src', 'F'], Total Size: 20, Size Difference from Target: 0\n" - ] - } - ], - "source": [ - "\n", - "def find_top_n_paths(graph, start_node, target_size, n):\n", - " top_paths = []\n", - " attempts = 0\n", - " threshold = float('inf')\n", - " def dfs(current_path, current_size):\n", - " nonlocal attempts, threshold\n", - " attempts += 1\n", - " sizediff = abs(current_size - target_size)\n", - "\n", - " if sizediff > threshold:\n", - " return\n", - " \n", - " heapq.heappush(top_paths, (sizediff, current_path))\n", - " \n", - " #threshold = top_paths[0][0] + 100000 \n", - " # Without: nattempts 128\n", - " # Path 1: ['src', 'A', 'C', 'D'], Total Size: 20, Size Difference from Target: 0\n", - " # Path 2: ['src', 'B', 'C', 'E'], Total Size: 20, Size Difference from Target: 0\n", - " # Path 3: ['src', 'F'], Total Size: 20, Size Difference from Target: 0\n", - " \n", - " #threshold = sorted(top_paths)[min(3, len(top_paths)-1)][0] + 9 # But 8 fails.\n", - " # With: nattempts 37\n", - " # Path 1: ['src', 'F'], Total Size: 20, Size Difference from Target: 0\n", - " # Path 2: ['src', 'A', 'B', 'C', 'E', 'G'], Total Size: 21, Size Difference from Target: 1\n", - " # Path 3: 
['src', 'A', 'B', 'D', 'G'], Total Size: 19, Size Difference from Target: 1\n", - " for successor in graph.successors(current_path[-1]):\n", - " next_path = current_path + [successor]\n", - " next_size = current_size + graph.nodes[successor]['size']\n", - " dfs(next_path, next_size)\n", - "\n", - " dfs([start_node], graph.nodes[start_node]['size'])\n", - " print('nattempts', attempts)\n", - " return sorted(top_paths)[:n]\n", - "\n", - "# Example usage\n", - "start_node = 'src'\n", - "target_size = 20\n", - "n = 3\n", - "top_paths = find_top_n_paths(G, start_node, target_size, n)\n", - "\n", - "for i, (size_diff, path) in enumerate(top_paths, 1):\n", - " total_size = sum(G.nodes[node]['size'] for node in path)\n", - " print(f\"Path {i}: {path}, Total Size: {total_size}, Size Difference from Target: {size_diff}\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": 115, - "id": "dd7614a5-6ac6-4722-a276-ec42cc018fec", - "metadata": {}, - "outputs": [], - "source": [ - "remove_edges_not_passing_through(G, 'src', 'snk', list(\"BEF\")) " - ] - }, - { - "cell_type": "code", - "execution_count": 116, - "id": "9c74b2a3-ac91-4e02-944f-ac2f93d83427", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "nattempts 8\n", - "Path 1: ['src', 'F'], Total Size: 20, Size Difference from Target: 0\n", - "Path 2: ['src', 'E', 'F'], Total Size: 23, Size Difference from Target: 3\n", - "Path 3: ['src', 'B', 'E'], Total Size: 13, Size Difference from Target: 7\n" - ] - } - ], - "source": [ - "top_paths = find_top_n_paths(G, start_node, target_size, n)\n", - "\n", - "for i, (size_diff, path) in enumerate(top_paths, 1):\n", - " total_size = sum(G.nodes[node]['size'] for node in path)\n", - " print(f\"Path {i}: {path}, Total Size: {total_size}, Size Difference from Target: {size_diff}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "68afbcec-a30e-42bb-a793-6a39549d20a0", - "metadata": {}, - "outputs": [], - 
"source": [] - }, - { - "cell_type": "code", - "execution_count": 53, - "id": "a6c14d06-3fc5-4ce0-b632-79d3f8d7e59f", - "metadata": {}, - "outputs": [], - "source": [ - "from dataclasses import dataclass\n", - "\n", - "@dataclass\n", - "class Node():\n", - " value: int = 0\n", - " def __hash__(self):\n", - " return self.value" - ] - }, - { - "cell_type": "code", - "execution_count": 69, - "id": "2f203f27-ead5-47ad-a086-4d9ee774bb34", - "metadata": {}, - "outputs": [], - "source": [ - "aNodes = [Node(1), Node(10), Node(20)]\n", - "import networkx as nx\n", - "g = nx.DiGraph()\n", - "one = Node(1)\n", - "nine = Node(9)\n", - "four = Node(4)\n", - "twenty = Node(20)\n", - "for i in [one, nine, four, twenty]:\n", - " g.add_edge('src', i)\n", - " g.add_edge(i, 'snk')\n", - "g.add_edge(one, nine)\n", - "g.add_edge(one, four)\n", - "g.add_edge(one, twenty)\n", - "g.add_edge(nine, twenty) # can only use one of nine/twenty\n", - "g.add_edge(four, twenty)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "33d47972-0432-407a-b60d-78641f9ea3ba", - "metadata": {}, - "outputs": [], - "source": [ - "import kdp\n", - "import logging\n", - "import pysam\n", - "import truvari\n", - "import networkx as nx\n", - "def vcf_haps(variants, kmer=4):\n", - " \"\"\"\n", - " Parse a set of phased variants and return the two Haplotypes\n", - " \"\"\"\n", - " h1 = []\n", - " h2 = []\n", - " for entry in variants:\n", - " m_hap = kdp.Haplotype.from_vcf(entry, kmer)\n", - " if entry.samples[0]['GT'][0] == 1:\n", - " h1.append(m_hap)\n", - " if len(entry.samples[0]['GT']) > 1 and entry.samples[0]['GT'][1] == 1:\n", - " h2.append(m_hap)\n", - " return h1, h2\n", - "\n", - "def vars_to_graph(variants, kmer=4):\n", - " \"\"\"\n", - " For a sorted set of variants, make a graph\n", - " Returns the digraph (and variants not used?)\n", - " \"\"\"\n", - " keep_vars = []\n", - " unused_vars = []\n", - " for entry in variants:\n", - " hap = kdp.Haplotype.from_vcf(entry, kmer)\n", - " if 
hap.size != 0:\n", - " keep_vars.append((truvari.entry_to_hash(entry), hap, entry))\n", - " else:\n", - " unused_vars.append(entry)\n", - "\n", - " graph = nx.DiGraph()\n", - " graph.add_node('src', hap=kdp.Haplotype.new(kmer), entry=None)\n", - " graph.add_node('snk', hap=kdp.Haplotype.new(kmer), entry=None)\n", - " for key, hap, entry in keep_vars:\n", - " logging.debug(\"%s %s\", key, str(v))\n", - " graph.add_node(key, hap=hap, entry=entry)\n", - " graph.add_edge('src', key)\n", - " graph.add_edge(key, 'snk')\n", - "\n", - " # link the variants\n", - " for i in range(len(keep_vars) - 1):\n", - " up_key, _, up_var = keep_vars[i]\n", - " up = truvari.entry_boundaries(up_var)\n", - " for j in range(i + 1, len(keep_vars)):\n", - " dn_key, _, dn_var = keep_vars[j]\n", - " dn = truvari.entry_boundaries(dn_var)\n", - " if not truvari.overlaps(up[0], up[1], dn[0], dn[1]):\n", - " graph.add_edge(up_key, dn_key)\n", - " return graph, unused_vars" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "8a755f39-00d9-4d81-b5cb-5c00149cac6b", - "metadata": {}, - "outputs": [], - "source": [ - "chrom, start, end = \"chr20\", 8031130, 8032374\n", - "\n", - "v = pysam.VariantFile(\"test/GRCh38_HG002-T2TQ100-V1.0_stvar.vcf.gz\")\n", - "hap1, hap2 = vcf_haps(v.fetch(chrom, start, end))\n", - "\n", - "v = pysam.VariantFile(\"/Users/english/code/aou_sv_merge/intermerge/hgsvc2/truvari_collapsed.vcf.gz\")\n", - "graph = vars_to_graph(v.fetch(chrom, start, end), 4)[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "4e8243d6-2adc-4a02-8e27-049a84d2f253", - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "THRESH = 0.90\n", - "def longest_common_subsequence(graph, path1, path2):\n", - " # Compute the length of the longest common subsequence between two paths\n", - " m, n = len(path1), len(path2)\n", - " dp = [[0] * (n + 1) for _ in range(m + 1)]\n", - "\n", - " for i in range(1, m + 1):\n", - " for j in range(1, n + 1):\n", 
- " ssim, _ = truvari.sizesim(graph.nodes[path1[i-1]]['hap'].size, path2[j - 1].size)\n", - " csim = kdp.weighted_cosinesim(graph.nodes[path1[i - 1]]['hap'].kfeat, path2[j - 1].kfeat)\n", - " if ssim >= THRESH and csim >= THRESH: # If they're over the similarity minimums\n", - " dp[i][j] = dp[i - 1][j - 1] + csim # Matched, add one to the previous match\n", - " else:\n", - " dp[i][j] = max(dp[i - 1][j], dp[i][j - 1]) # \n", - "\n", - " ret = dp[m][n]\n", - " #print(ret, '\\t\\t', \"\".join(path1), \"<---\\t--->\", \"\".join(path2))\n", - " return ret, None\n", - "\n", - "def dfs(g, target, cur_node=None, cur_len=0, path=None): \n", - " if not cur_node:\n", - " cur_node = 'src'\n", - " else:\n", - " if path is None:\n", - " path = []\n", - " path.append(cur_node)\n", - " cur_len += g.nodes[cur_node]['hap'].size\n", - " diffs = sorted([(abs(target - (cur_len + g.nodes[n]['hap'].size)), n)\n", - " for _, n in g.out_edges(cur_node)])\n", - " for _, next_node in diffs:\n", - " if next_node == 'snk' and cur_node != 'src':\n", - " yield list(path)\n", - " else:\n", - " n_path = list(path)\n", - " for sub_path in dfs(g, target, next_node, cur_len, n_path):\n", - " yield sub_path" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "483b6c4e-0ecd-47cf-91ab-29530b7bf7bc", - "metadata": {}, - "outputs": [], - "source": [ - "def align_to_graph(graph, haplotype, start_node='src', mincos=0.90, minsize=0.90, wcoslen=2000):\n", - " \"\"\"\n", - " Align a haplotype to a graph\n", - " Return the graph path and haplotype path used.\n", - " \"\"\"\n", - " best_path_hap = []\n", - " best_score = 0\n", - " best_path_graph = []\n", - " n_attempts = 0\n", - " target_len = sum(_.size for _ in haplotype)\n", - " for n_path in dfs(graph, target_len, start_node):\n", - " print(\"trying\", n_path)\n", - " n_attempts += 1\n", - " n_score, best_path_graph = longest_common_subsequence(graph, n_path, haplotype)\n", - " print(n_score, best_path_graph)\n", - " if n_score > 
best_score:\n", - " best_score = n_score\n", - " best_path_hap = n_path\n", - " elif n_score == best_score and (len(n_path) < len(best_path_hap)): #prefer fewer changes\n", - " best_path_hap = n_path\n", - " if n_attempts >= 100:\n", - " break\n", - " return best_path_graph, best_path_hap" - ] - }, - { - "cell_type": "code", - "execution_count": 122, - "id": "59f22d96-8583-4131-bb6f-2baeba0fce14", - "metadata": {}, - "outputs": [], - "source": [ - "import math\n", - "def genotyper(totCov, altCov, priors=None):\n", - " if totCov == 0:\n", - " return None\n", - "\n", - " # previously had avgCov\n", - " if priors is None:\n", - " priors = [0.05, 0.5, 0.95]\n", - "\n", - " # if len(priors) != 3: # raise exception?\n", - "\n", - " def log_choose(n, k):\n", - " \"\"\" swap for efficiency if k is more than half of n \"\"\"\n", - " r = 0.0\n", - " if k * 2 > n:\n", - " k = n - k\n", - "\n", - " for d in range(1, k + 1):\n", - " r += math.log(n, 10)\n", - " r -= math.log(d, 10)\n", - " n -= 1\n", - "\n", - " return r\n", - "\n", - " total = totCov # refCoverage + altCoverage if avgCov is None else avgCov\n", - " alt = altCov # int(spot.tags[\"szCount\"])\n", - " non_alt = total - alt\n", - "\n", - " gtList = []\n", - "\n", - " comb = log_choose(total, alt)\n", - " for p_alt in priors:\n", - " gtList.append(comb + alt * math.log(p_alt, 10) + non_alt * math.log(1 - p_alt, 10))\n", - "\n", - " return gtList" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bfb4fea5-8287-404d-a6d9-111b45b968b3", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 194, - "id": "df09471c-b2b5-4cd3-9846-b9521628f7ea", - "metadata": {}, - "outputs": [], - "source": [ - "def log_choose(n, k):\n", - " r = 0.0\n", - " # swap for efficiency if k is more than half of n\n", - " if k * 2 > n:\n", - " k = n - k\n", - "\n", - " for d in range(1,k+1):\n", - " r += math.log(n, 10)\n", - " r -= math.log(d, 10)\n", - " n -= 1\n", - "\n", 
- " return r\n", - "\n", - "def bayes_gt(ref, alt, is_dup):\n", - " # probability of seeing an alt read with true genotype of of hom_ref, het, hom_alt respectively\n", - " if is_dup: # specialized logic to handle non-destructive events such as duplications\n", - " p_alt = [1e-2, 0.2, 1/3.0]\n", - " else:\n", - " #p_alt = [1e-3, 0.5, 0.9]\n", - " p_alt = [0.05, 0.5, 0.95]\n", - "\n", - " total = ref + alt\n", - " log_combo = log_choose(total, alt)\n", - "\n", - " lp_homref = log_combo + alt * math.log(p_alt[0], 10) + ref * math.log(1 - p_alt[0], 10)\n", - " lp_het = log_combo + alt * math.log(p_alt[1], 10) + ref * math.log(1 - p_alt[1], 10)\n", - " lp_homalt = log_combo + alt * math.log(p_alt[2], 10) + ref * math.log(1 - p_alt[2], 10)\n", - "\n", - " return (lp_homref, lp_het, lp_homalt)\n", - "\n", - "def do_math(gt_lplist):\n", - " print(gt_lplist)\n", - " best, second_best = sorted([ (i, e) for i, e in enumerate(gt_lplist) ], key=(lambda x: x[1]), reverse=True)[0:2]\n", - "\n", - " gt_sum = 0\n", - " for gt in gt_lplist:\n", - " try:\n", - " gt_sum += 10**gt\n", - " except OverflowError:\n", - " print('no?')\n", - " gt_sum += 0\n", - "\n", - " gt_sum_log = math.log(gt_sum, 10)\n", - " # SQ - higher better?\n", - " sample_qual = min(abs(-10 * (gt_lplist[0] - gt_sum_log)),100) # phred-scaled probability site is non-reference in this sample\n", - " # GQ - higher better?\n", - " phred_gq = min(-10 * (second_best[1] - best[1]), 100)\n", - " return sample_qual, phred_gq" - ] - }, - { - "cell_type": "code", - "execution_count": 195, - "id": "7b44b09d-5fa4-4242-86e0-d92468bfbc51", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(-11.235308810520248, -1.7928160124259056, -1.0052800028976154)\n" - ] - }, - { - "data": { - "text/plain": [ - "(100, 7.875360095282902)" - ] - }, - "execution_count": 195, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "do_math(bayes_gt(2, 10, False))" - ] - }, - { 
- "cell_type": "code", - "execution_count": 138, - "id": "d9a2f480-0e53-4880-a8f7-c20f388e5515", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[-4.527426373031058, -0.806179973983887, -0.6911655701725709]\n" - ] - }, - { - "data": { - "text/plain": [ - "[3.2370816095851325, 0.06089080720808001, 0.042719299791641144]" - ] - }, - "execution_count": 138, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "x = genotyper(5, 4)\n", - "print(x)\n", - "[10 ** (math.log(abs(i)))/10 for i in x]" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "316cae30-dd3e-48b1-a734-a9e0bffd7e46", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "src None\n", - "snk None\n", - "1ab39dff06fc5af6366551bc222faf4d1142fb99 chr20\t8031512\tpbsv.INS.183\tT\tTATGTATGTATGTATGATGTATGTATGATGTATGTATGTATGATGTATGTATGATGTATGTATGATGTATGTATGTATGATGTATGTATGATGTATGTATGATGTATGTATGTATGATGTATGTATGATGTATGATGTATGTATGTATGATGTATGTATGTATGATGTATGTATGATGTATGTATGATGTATG\t3\tPASS\tSVTYPE=INS;SVLEN=192;SVANN=TANDEM;AC=1\tGT:AD:DP:SAC:SUPP\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t0/1:6,3:9:3,3,2,1:1\t./.:.:.:.:.\t./.:.:.:.:.\n", - "\n", - "9a4086d2c45a87751cebde7061f0c9705d6d8425 chr20\t8031517\tSniffles2.INS.50S13\tA\tATGTATGTATGATGTATGTATGATGTATGTATGTATGATGTATGTATGATGTATGTATGATGTATGTATGTATGATGTATGTATGATGTATGTATG\t2\tPASS\tPRECISE;SVTYPE=INS;SVLEN=95;SUPPORT=3;COVERAGE=8,9,9,9,9;STRAND=+-;AF=0.333;STDEV_LEN=0;STDEV_POS=0;SUPPORT_LONG=0;AC=1\tGT:GQ:DR:DV:SUPP\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t0/1:13:6:3:2\t./.:.:.:.:.\t./.:.:.:.:.\n", - "\n", - "30dbc4c3ecb41a50196a8dd592a77dbf6bef1aeb 
chr20\t8031531\tchr20-8031532-INS-124\tG\tGATGTATGTATGATATGTATGTATGTATGATGTATGTATTATGTATGTATGTATGATGTATGTATGATGTATGTATGATGTATGTATGATGTATGTATGTATGATGTATGTATGATGTATGATGT\t4\t.\tID=chr20-8031532-INS-124;SVTYPE=INS;SVLEN=124;TIG_REGION=h2tg006170l:2843-2966;QUERY_STRAND=-;HOM_REF=0,11;HOM_TIG=0,11;AC=1\tGT:SUPP\t./.:.\t./.:.\t0/1:4\t./.:.\t./.:.\t./.:.\t./.:.\t./.:.\t./.:.\t./.:.\t./.:.\t./.:.\t./.:.\t./.:.\n", - "\n", - "0cacd6858efe76ccf815b689378558bc98226ca6 chr20\t8031561\tpbsv.INS.170;chr20-8031562-INS-53\tG\tGATGTATGTATGATGTATGTATGTATGATGTATGTATGTATGTATGTATGATGT\t4\tPASS\tSVTYPE=INS;SVLEN=53;SVANN=TANDEM;ID=chr20-8031562-INS-53;TIG_REGION=h1tg004137l:95190-95242;QUERY_STRAND=-;HOM_REF=0,22;HOM_TIG=0,22;AC=1\tGT:AD:DP:SAC:SUPP\t./.:.:.:.:.\t0/1:4,6:10:1,3,4,2:5\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\n", - "\n", - "aef483aba32454fdae99e87dc6b14cc375ddb362 chr20\t8031562\tSniffles2.INS.31S13\tA\tTATGTATGATGTATGTATGATGTATGTATGATGTATGTATGATGTATGTATGTTGATGTATGTATGATGTATGATGTATGTATGTATGTATGATGT\t2\tPASS\tIMPRECISE;SVTYPE=INS;SVLEN=95;SUPPORT=9;COVERAGE=10,10,10,10,11;STRAND=+-;AF=0.9;STDEV_LEN=30.162;STDEV_POS=43.894;SUPPORT_LONG=0;AC=2\tGT:GQ:DR:DV:SUPP\t./.:.:.:.:.\t1/1:15:1:9:2\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\n", - "\n", - "4be2ce31efa8a1c8cf23c4e5e0b1d2aecb7c5510 chr20\t8031594\tpbsv.INS.238\tG\tGATGTATGTATGATGTATGATGTATGTATGTATGTATGATGTATGTATGTATGATGTATGTATGATGTATGTATGATGTATGATGTATGTATGTATGTATGATGTATGTATGTATGATGTATGTGTGTATGATGTATGTATGT\t3\tPASS\tSVTYPE=INS;SVLEN=142;SVANN=TANDEM;AC=1\tGT:AD:DP:SAC:SUPP\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t0/1:4,4:8:2,2,2,2:1\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\n", - "\n", - 
"3003b4b88dfa7ff1b8605d4d5439f6fe40092534 chr20\t8031594\tchr20-8031595-INS-61\tG\tGTATGTATGTATGATGTATGTATGATGTATGTATGTATGATGTATGTATGATGTATGTATGC\t4\t.\tID=chr20-8031595-INS-61;SVTYPE=INS;SVLEN=61;TIG_REGION=h1tg009636l:15255-15315,h2tg003757l:470716-470776;QUERY_STRAND=+,+;HOM_REF=0,0;HOM_TIG=0,0;NumCollapsed=2;NumConsolidated=2;CollapseId=27074.0;AC=1\tGT:AD:DP:DR:DV:GQ:SAC:SUPP\t./.:.:.:.:.:.:.:.\t./.:.:.:.:.:.:.:.\t./.:.:.:.:.:.:.:.\t./.:.:.:.:.:.:.:.\t0/1:2,5:7:2:5:6:1,1,1,4:7\t./.:.:.:.:.:.:.:.\t./.:.:.:.:.:.:.:.\t./.:.:.:.:.:.:.:.\t./.:.:.:.:.:.:.:.\t./.:.:.:.:.:.:.:.\t./.:.:.:.:.:.:.:.\t./.:.:.:.:.:.:.:.\t./.:.:.:.:.:.:.:.\t./.:.:.:.:.:.:.:.\n", - "\n", - "9b630a836027cac054e863df7c97f0f9fea19f9e chr20\t8031631\tchr20-8031632-INS-86\tG\tGTATGATGTATGTATGATGTATGTATGATGTATGATGTATGTATGTATGTATGATGTATGTATGTATGATGTATGTGTGTATGATGT\t4\t.\tID=chr20-8031632-INS-86;SVTYPE=INS;SVLEN=86;TIG_REGION=h2tg021944l:13667-13752;QUERY_STRAND=+;HOM_REF=0,0;HOM_TIG=0,0;NumCollapsed=1;NumConsolidated=1;CollapseId=28855.0;AC=2\tGT:SUPP:DR:DV:GQ\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t0/1:4:.:.:.\t./.:.:.:.:.\t0/1:6:8:9:54\t./.:.:.:.:.\n", - "\n", - "a6cae606c444f276efe66c46b4757d633a0b23db chr20\t8031631\tchr20-8031632-INS-132\tG\tGTATGATGTATGTATGATGTATGTATGATGTATGATGTATGTATGTATGTATGATGTATGTATGTATGATGTATGTGTGTATGATGTATGTATGTATGTATGATGTATGTATGTATGTATGATGTATGTATGT\t4\t.\tID=chr20-8031632-INS-132;SVTYPE=INS;SVLEN=132;TIG_REGION=h2tg006654l:15345-15476;QUERY_STRAND=-;HOM_REF=0,0;HOM_TIG=0,0;AC=1\tGT:SUPP\t0/1:4\t./.:.\t./.:.\t./.:.\t./.:.\t./.:.\t./.:.\t./.:.\t./.:.\t./.:.\t./.:.\t./.:.\t./.:.\t./.:.\n", - "\n", - "f2a285fa1cfc415fdaffbda752a0d1d90fb762c7 
chr20\t8031632\tSniffles2.INS.55S13\tA\tTATGATGTATGTATGATGTATGTATGATGTATGATGTATGTATGTATGTATGATGTATGTATGTATGATGTATGTGTGTATGATGTATGTATGTATGTATGATGT\t2\tPASS\tPRECISE;SVTYPE=INS;SVLEN=104;SUPPORT=4;COVERAGE=8,8,8,8,8;STRAND=+-;AF=0.5;STDEV_LEN=0;STDEV_POS=0;SUPPORT_LONG=0;AC=1\tGT:GQ:DR:DV:SUPP\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t0/1:28:4:4:2\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\n", - "\n", - "eb5ba19503f7d198a58b21b9538778e25d88840c chr20\t8031646\tchr20-8031647-INS-143\tG\tGTATGTATGATGTATGTATGTATGATGTATGTATGATGTATGTATGATGTATGATGTATGTATGTATGTATGATGTATGTATGTATGATGTATGTGTGTATGATGTATGTATGTATGTATGATGTATGTATGTATGTATGATGT\t4\t.\tID=chr20-8031647-INS-143;SVTYPE=INS;SVLEN=143;TIG_REGION=h2tg003866l:178061-178203;QUERY_STRAND=+;HOM_REF=0,0;HOM_TIG=0,0;NumCollapsed=1;NumConsolidated=1;CollapseId=83766.6;AC=1\tGT:DR:DV:GQ:SUPP\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t0/1:4:8:17:2\t./.:.:.:.:.\t0/1:.:.:.:.\t./.:.:.:.:.\n", - "\n", - "dcedc19956e539c5bbad3582ce6a1480c565cda2 chr20\t8031646\tchr20-8031647-INS-170\tG\tGTATGTATGTATGATGTATGTATGTATGATGTATGTATGATGTATGTATGATGTATGATGTATGTATGTATGTATGATGTATGTATGTATGATGTATGTGTGTATGATGTATGTATGTATGTATGATGTATGTATGTATGTATGATGTATGTATGTATGTATGATGTATGT\t4\t.\tID=chr20-8031647-INS-170;SVTYPE=INS;SVLEN=170;TIG_REGION=h1tg002840l:315234-315403;QUERY_STRAND=+;HOM_REF=0,0;HOM_TIG=0,0;NumCollapsed=2;NumConsolidated=2;CollapseId=26784.1;AC=2\tGT:AD:DP:DR:DV:GQ:SAC:SUPP\t./.:.:.:.:.:.:.:.\t./.:.:.:.:.:.:.:.\t1/1:0,3:3:1:2:4:0,0,0,3:7\t./.:.:.:.:.:.:.:.\t./.:.:.:.:.:.:.:.\t./.:.:.:.:.:.:.:.\t./.:.:.:.:.:.:.:.\t./.:.:.:.:.:.:.:.\t./.:.:.:.:.:.:.:.\t./.:.:.:.:.:.:.:.\t./.:.:.:.:.:.:.:.\t./.:.:.:.:.:.:.:.\t./.:.:.:.:.:.:.:.\t./.:.:.:.:.:.:.:.\n", - "\n", - "79667743877d64eb4ac1746e78ffe8deede061de 
chr20\t8031677\tSniffles2.INS.48S13\tA\tTATGTATGCGTGTATGACGTATGTATGTATGTATGATGTATGTATGTATGTATGATGTGTGTATGCATGTATGATGT\t2\tPASS\tPRECISE;SVTYPE=INS;SVLEN=76;SUPPORT=5;COVERAGE=10,11,11,12,12;STRAND=+;AF=0.455;STDEV_LEN=2.309;STDEV_POS=0;SUPPORT_LONG=0;AC=1\tGT:GQ:DR:DV:SUPP\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t0/1:33:6:5:2\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\n", - "\n", - "efba80a423c9a50fb92f5f47f2ca6d8a7ce8b2dd chr20\t8031689\tpbsv.INS.207\tA\tATGTATGATGTGTGTATGCATGTATGATGTCTGTATGTATGTGTGTATGATGTATGTATGTATGTG\t3\tPASS\tSVTYPE=INS;SVLEN=65;SVANN=TANDEM;AC=2\tGT:AD:DP:SAC:SUPP\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t1/1:0,3:3:0,0,0,3:1\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\n", - "\n", - "94aeca41ecea467f214c43fe57a50425c6e85a62 chr20\t8031712\tchr20-8031713-INS-61\tA\tATGATGTGTGTATGCATGTATGATGTCTGTATGTATGTGTGTATGATGTATGTATGTATGTG\t4\t.\tID=chr20-8031713-INS-61;SVTYPE=INS;SVLEN=61;TIG_REGION=h1tg009304l:96829-96889;QUERY_STRAND=-;HOM_REF=0,2;HOM_TIG=0,2;NumCollapsed=1;NumConsolidated=1;CollapseId=29397.0;AC=3\tGT:AD:DP:SAC:SUPP\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t1/1:0,11:11:0,0,5,6:5\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t0/1:8,5:13:3,5,2,3:5\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\n", - "\n", - "716c0317c70673f9f7edfadc04bded59891880dc 
chr20\t8031731\tpbsv.INS.174;pbsv.INS.182\tA\tATGATGTATGTGTGTATGATGTATGTATGTATGTATGATGTATGTATGTATGTATGATGTATGTATGTATGTATGATGTATGTATGTATGTATGTATGATGTATGTATGTATGATGTATGTATGTATGTATGTATGATGTATGTATGTATGTATGATGTATGTGTGTATGTATGATGTATGTG\t3\tPASS\tSVTYPE=INS;SVLEN=182;SVANN=TANDEM;NumCollapsed=1;NumConsolidated=3;CollapseId=83766.11;AC=2\tGT:AD:DP:SAC:DR:DV:GQ:SUPP\t0/1:.,.:.:.:.:.:.:.\t0/1:6,4:10:4,2,1,3:.:.:.:1\t./.:.,.:.:.:.:.:.:.\t0/1:.,.:.:.:.:.:.:.\t./.:.,.:.:.:.:.:.:.\t./.:.,.:.:.:.:.:.:.\t./.:.,.:.:.:.:.:.:.\t./.:.,.:.:.:.:.:.:.\t./.:.,.:.:.:.:.:.:.\t./.:.,.:.:.:.:.:.:.\t0/1:6,7:13:3,3,2,5:.:.:.:1\t./.:.,.:.:.:.:.:.:.\t1/1:0,17:17:0,0,9,8:.:.:.:1\t./.:.,.:.:.:.:.:.:.\n", - "\n" - ] - } - ], - "source": [ - "for k,n in graph.nodes.items():\n", - " print(k, n['entry'])" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "c9c18464-8fb5-4243-8d24-1fb480fe6d23", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[Haplotype(kfeat=array([-1., 6., 0., 0., 11., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 10., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 5.,\n", - " 5., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 
0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32), size=37, n=1, coverage=1),\n", - " Haplotype(kfeat=array([-1., 20., 0., 0., 25., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 25., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 20.,\n", - " 5., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32), size=95, n=1, coverage=1),\n", - " Haplotype(kfeat=array([-1., 4., 0., 0., 8., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 7., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 4.,\n", - " 3., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 
0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", - " 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32), size=26, n=1, coverage=1)]" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "hap2" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "24bf0450-83f2-43d8-aa3f-7daa17ea4dd7", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "None\n", - "chr20\t8031594\tpbsv.INS.238\tG\tGATGTATGTATGATGTATGATGTATGTATGTATGTATGATGTATGTATGTATGATGTATGTATGATGTATGTATGATGTATGATGTATGTATGTATGTATGATGTATGTATGTATGATGTATGTGTGTATGATGTATGTATGT\t3\tPASS\tSVTYPE=INS;SVLEN=142;SVANN=TANDEM;AC=1\tGT:AD:DP:SAC:SUPP\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t0/1:4,4:8:2,2,2,2:1\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\n", - "\n", - 
"chr20\t8031631\tchr20-8031632-INS-86\tG\tGTATGATGTATGTATGATGTATGTATGATGTATGATGTATGTATGTATGTATGATGTATGTATGTATGATGTATGTGTGTATGATGT\t4\t.\tID=chr20-8031632-INS-86;SVTYPE=INS;SVLEN=86;TIG_REGION=h2tg021944l:13667-13752;QUERY_STRAND=+;HOM_REF=0,0;HOM_TIG=0,0;NumCollapsed=1;NumConsolidated=1;CollapseId=28855.0;AC=2\tGT:SUPP:DR:DV:GQ\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t0/1:4:.:.:.\t./.:.:.:.:.\t0/1:6:8:9:54\t./.:.:.:.:.\n", - "\n", - "chr20\t8031632\tSniffles2.INS.55S13\tA\tTATGATGTATGTATGATGTATGTATGATGTATGATGTATGTATGTATGTATGATGTATGTATGTATGATGTATGTGTGTATGATGTATGTATGTATGTATGATGT\t2\tPASS\tPRECISE;SVTYPE=INS;SVLEN=104;SUPPORT=4;COVERAGE=8,8,8,8,8;STRAND=+-;AF=0.5;STDEV_LEN=0;STDEV_POS=0;SUPPORT_LONG=0;AC=1\tGT:GQ:DR:DV:SUPP\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t0/1:28:4:4:2\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\t./.:.:.:.:.\n", - "\n" - ] - } - ], - "source": [ - "for i in result[1]:\n", - " print(graph.nodes[i]['entry'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9392b520-094f-4e54-afa9-822390546366", - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "THRESH = 0.90\n", - "def longest_common_subsequence(path1, path2):\n", - " # Compute the length of the longest common subsequence between two paths\n", - " m, n = len(path1), len(path2)\n", - " dp = [[0] * (n + 1) for _ in range(m + 1)]\n", - "\n", - " for i in range(1, m + 1):\n", - " for j in range(1, n + 1):\n", - " if path1[i - 1] == path2[j - 1]: # If they're over the similarity minimums\n", - " dp[i][j] = dp[i - 1][j - 1] + 1 # Matched, add one to the previous match\n", - " else:\n", - " dp[i][j] = max(dp[i - 1][j], dp[i][j - 1]) # \n", - "\n", - " ret = dp[m][n]\n", - " #print(ret, '\\t\\t', \"\".join(path1), \"<---\\t--->\", \"\".join(path2))\n", - " return ret\n", - "\n", - "def dfs(g, 
target, cur_len=0, cur_node=None, path=None): \n", - " if not cur_node:\n", - " cur_node = 'src'\n", - " else:\n", - " if path is None:\n", - " path = []\n", - " path.append(cur_node)\n", - " cur_len += g.nodes[cur_node]['hap'].size\n", - " diffs = sorted([(abs(target - (cur_len + g.nodes[n]['hap'].size)), n)\n", - " for _, n in g.out_edges(cur_node)])\n", - " for next_node in diffs:\n", - " if next_node == 'snk' and cur_node != 'src':\n", - " yield list(path)\n", - " else:\n", - " n_path = list(path)\n", - " for sub_path in dfs(g, next_node, n_path):\n", - " yield sub_path" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "id": "240a6589-c0aa-48b7-8c0c-e587ef12c295", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "trying ['B', 'D', 'E', 'F', 'G']\n", - "trying ['B', 'D', 'E', 'F']\n", - "trying ['B', 'D', 'E', 'G']\n", - "trying ['B', 'D', 'E']\n", - "trying ['B', 'D', 'F', 'G']\n", - "trying ['B', 'D', 'F']\n", - "trying ['B', 'D', 'G']\n", - "trying ['B', 'D']\n", - "trying ['B', 'E', 'F', 'G']\n", - "trying ['B', 'E', 'F']\n", - "trying ['B', 'E', 'G']\n", - "trying ['B', 'E']\n", - "trying ['B', 'F', 'G']\n", - "trying ['B', 'F']\n", - "trying ['B', 'G']\n", - "trying ['B']\n", - "['B', 'D', 'G'] 16\n" - ] - } - ], - "source": [ - "# Example usage:\n", - "import itertools\n", - "graph = {\n", - " 'src': ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'snk'],\n", - " 'A': ['B', 'C', 'D', 'E', 'F', 'G', 'snk'],\n", - " # B/C overlapping\n", - " 'B': ['D', 'E', 'F', 'G', 'snk'],\n", - " 'C': ['D', 'E', 'F', 'G', 'snk'],\n", - " 'D': ['E', 'F', 'G', 'snk'],\n", - " 'E': ['F', 'G', 'snk'],\n", - " 'F': ['G', 'snk'],\n", - " 'G': ['snk'],\n", - " 'snk': [],\n", - "}\n", - "\n", - "start_node = 'src'\n", - "target_path = ['Do', 'lamma', 'B', 'R', 'D', 'X', 'Y', 'G'] # Example target path with extra nodes\n", - "\n", - "graph_nodes = list(graph.keys())\n", - "t_idx = 0\n", - "g_idx = 0\n", - "\n", - "# Find the 
first graph node that matches to the target path\n", - "# If no graph node matches t_idx[0], then increase t_idx\n", - "# Pick the starting point\n", - "for t_start, start_node in itertools.product(target_path, graph_nodes[1:-1]):\n", - " if t_start == start_node:\n", - " break\n", - "target_path = target_path[target_path.index(t_start):] \n", - "\n", - "# Iterate all the paths, trying to find the best.\n", - "best_path = []\n", - "best_score = 0\n", - "n_attempts = 0\n", - "for n_path in dfs(graph, start_node):\n", - " print(\"trying\", n_path)\n", - " n_attempts += 1\n", - " n_score = longest_common_subsequence(n_path, target_path)\n", - " if n_score > best_score:\n", - " best_score = n_score\n", - " best_path = n_path\n", - " elif n_score == best_score and (len(n_path) < len(best_path)): #prefer fewer changes\n", - " best_path = n_path\n", - " \n", - "print(best_path, n_attempts)\n", - "\n", - "# So now I need to figure out what happens when we don't have an anchor?\n", - "# Like, If \n" - ] - }, - { - "cell_type": "code", - "execution_count": 143, - "id": "7082da1b-4dea-4e42-a9fa-c62504abbfe1", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['D'] ['B', 'R', 'D', 'X', 'Y', 'G'] 1\n", - "['E'] ['B', 'R', 'D', 'X', 'Y', 'G'] 0\n", - "['F'] ['B', 'R', 'D', 'X', 'Y', 'G'] 0\n", - "['G'] ['B', 'R', 'D', 'X', 'Y', 'G'] 1\n", - "['E'] ['B', 'R', 'D', 'X', 'Y', 'G'] 0\n", - "['F'] ['B', 'R', 'D', 'X', 'Y', 'G'] 0\n", - "['G'] ['B', 'R', 'D', 'X', 'Y', 'G'] 1\n", - "Most similar path: []\n" - ] - } - ], - "source": [ - "# Example usage:\n", - "# Try again, but this time have sizes\n", - "# Also, move this to a DiGraph\n", - "import itertools\n", - "import networkx as nx\n", - "\n", - "g = nx.DiGraph()\n", - "g.add_edge('src', 'snk')\n", - "variants = ['A', 'B', 'C', 'D', 'E', 'F', 'G']\n", - "sizes = [40, 100, 125, \n", - "for i in \n", - "for i in range(len(variants) - 1):\n", - " for j in range(i + 1, 
len(variants)):\n", - " g.add_node\n", - "for i in \n", - "graph = {\n", - " 'src': ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'snk'],\n", - " 'A': ['B', 'C', 'D', 'E', 'F', 'G', 'snk'],\n", - " # B/C overlapping\n", - " 'B': ['D', 'E', 'F', 'G', 'snk'],\n", - " 'C': ['D', 'E', 'F', 'G', 'snk'],\n", - " 'D': ['E', 'F', 'G', 'snk'],\n", - " 'E': ['F', 'G', 'snk'],\n", - " 'F': ['G', 'snk'],\n", - " 'G': ['snk'],\n", - " 'snk': [],\n", - "}\n", - "\n", - "start_node = 'src'\n", - "target_path = ['D', 'lamma', 'B', 'R', 'D', 'X', 'Y', 'G'] # Example target path with extra nodes\n", - "\n", - "graph_nodes = list(graph.keys())\n", - "t_idx = 0\n", - "g_idx = 0\n", - "\n", - "# Find the first graph node that matches to the target path\n", - "# If no graph node matches t_idx[0], then increase t_idx\n", - "# Pick the starting point\n", - "for t_start, start_node in itertools.product(target_path, graph_nodes[1:-1]):\n", - " if t_start == start_node:\n", - " break\n", - "target_path = target_path[target_path.index(t_start):] \n", - "\n", - "# Iterate all the paths, trying to find the best.\n", - "best_path = []\n", - "best_score = 0\n", - "n_attempts = 0\n", - "for n_path in dfs(graph, start_node):\n", - " print(\"trying\", n_path)\n", - " n_attempts += 1\n", - " n_score = longest_common_subsequence(n_path, target_path)\n", - " if n_score > best_score:\n", - " best_score = n_score\n", - " best_path = n_path\n", - " elif n_score == best_score and (len(n_path) < len(best_path)): #prefer fewer changes\n", - " best_path = n_path\n", - " \n", - "print(best_path, n_attempts)\n", - "\n", - "# So now I need to figure out what happens when we don't have an anchor?\n", - "# Like, If \n" - ] - }, - { - "cell_type": "code", - "execution_count": 102, - "id": "8876c26d-bd94-42ed-9088-aa2ffa2fffd2", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 118, - "id": "6acb6272-1eea-4ebb-b56d-a9ed43d62dd8", - "metadata": {}, - "outputs": [ - { - 
"data": { - "text/plain": [ - "3.7390368499084894" - ] - }, - "execution_count": 118, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "10**((5.727597453449266)/10)" - ] - }, - { - "cell_type": "code", - "execution_count": 187, - "id": "2a4ed4eb-b9e3-4a92-85b5-3e2dc7f22d96", - "metadata": {}, - "outputs": [], - "source": [ - "import pysam\n", - "bam = pysam.AlignmentFile(\"/Users/english/code/kfdphase/kdprs/test/GIABHG002.bam\")\n", - "aln = next(bam)" - ] - }, - { - "cell_type": "code", - "execution_count": 197, - "id": "badf2789-718b-4076-b744-c43e610cb7cf", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['__class__',\n", - " '__delattr__',\n", - " '__dir__',\n", - " '__doc__',\n", - " '__enter__',\n", - " '__eq__',\n", - " '__exit__',\n", - " '__format__',\n", - " '__ge__',\n", - " '__getattribute__',\n", - " '__gt__',\n", - " '__hash__',\n", - " '__init__',\n", - " '__init_subclass__',\n", - " '__iter__',\n", - " '__le__',\n", - " '__lt__',\n", - " '__ne__',\n", - " '__new__',\n", - " '__next__',\n", - " '__pyx_vtable__',\n", - " '__reduce__',\n", - " '__reduce_ex__',\n", - " '__repr__',\n", - " '__setattr__',\n", - " '__setstate__',\n", - " '__sizeof__',\n", - " '__str__',\n", - " '__subclasshook__',\n", - " '_open',\n", - " 'add_hts_options',\n", - " 'category',\n", - " 'check_index',\n", - " 'check_truncation',\n", - " 'close',\n", - " 'closed',\n", - " 'compression',\n", - " 'count',\n", - " 'count_coverage',\n", - " 'description',\n", - " 'duplicate_filehandle',\n", - " 'fetch',\n", - " 'filename',\n", - " 'find_introns',\n", - " 'find_introns_slow',\n", - " 'format',\n", - " 'get_index_statistics',\n", - " 'get_reference_length',\n", - " 'get_reference_name',\n", - " 'get_tid',\n", - " 'getrname',\n", - " 'gettid',\n", - " 'has_index',\n", - " 'head',\n", - " 'header',\n", - " 'index_filename',\n", - " 'is_bam',\n", - " 'is_bcf',\n", - " 'is_closed',\n", - " 'is_cram',\n", - " 'is_open',\n", - " 
'is_read',\n", - " 'is_remote',\n", - " 'is_sam',\n", - " 'is_stream',\n", - " 'is_valid_reference_name',\n", - " 'is_valid_tid',\n", - " 'is_vcf',\n", - " 'is_write',\n", - " 'lengths',\n", - " 'mapped',\n", - " 'mate',\n", - " 'mode',\n", - " 'nocoordinate',\n", - " 'nreferences',\n", - " 'parse_region',\n", - " 'pileup',\n", - " 'reference_filename',\n", - " 'references',\n", - " 'reset',\n", - " 'seek',\n", - " 'tell',\n", - " 'text',\n", - " 'threads',\n", - " 'unmapped',\n", - " 'version',\n", - " 'write']" - ] - }, - "execution_count": 197, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dir(bam)" - ] - }, - { - "cell_type": "code", - "execution_count": 227, - "id": "a842e7bf-c2db-4539-8a44-c20866d54034", - "metadata": {}, - "outputs": [], - "source": [ - "x = pysam.AlignedSegment(bam.header)" - ] - }, - { - "cell_type": "code", - "execution_count": 211, - "id": "1fd1dcee-9902-4232-832e-09d2551ac09c", - "metadata": {}, - "outputs": [], - "source": [ - "x.tid = 0" - ] - }, - { - "cell_type": "code", - "execution_count": 223, - "id": "a392f415-66bf-44c1-9b53-058dc8da3a51", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['__class__',\n", - " '__copy__',\n", - " '__deepcopy__',\n", - " '__delattr__',\n", - " '__dir__',\n", - " '__doc__',\n", - " '__eq__',\n", - " '__format__',\n", - " '__ge__',\n", - " '__getattribute__',\n", - " '__gt__',\n", - " '__hash__',\n", - " '__init__',\n", - " '__init_subclass__',\n", - " '__le__',\n", - " '__lt__',\n", - " '__ne__',\n", - " '__new__',\n", - " '__pyx_vtable__',\n", - " '__reduce__',\n", - " '__reduce_ex__',\n", - " '__repr__',\n", - " '__setattr__',\n", - " '__setstate__',\n", - " '__sizeof__',\n", - " '__str__',\n", - " '__subclasshook__',\n", - " 'aend',\n", - " 'alen',\n", - " 'aligned_pairs',\n", - " 'bin',\n", - " 'blocks',\n", - " 'cigar',\n", - " 'cigarstring',\n", - " 'cigartuples',\n", - " 'compare',\n", - " 'flag',\n", - " 'from_dict',\n", - " 
'fromstring',\n", - " 'get_aligned_pairs',\n", - " 'get_blocks',\n", - " 'get_cigar_stats',\n", - " 'get_forward_qualities',\n", - " 'get_forward_sequence',\n", - " 'get_overlap',\n", - " 'get_reference_positions',\n", - " 'get_reference_sequence',\n", - " 'get_tag',\n", - " 'get_tags',\n", - " 'has_tag',\n", - " 'header',\n", - " 'infer_query_length',\n", - " 'infer_read_length',\n", - " 'inferred_length',\n", - " 'is_duplicate',\n", - " 'is_forward',\n", - " 'is_mapped',\n", - " 'is_paired',\n", - " 'is_proper_pair',\n", - " 'is_qcfail',\n", - " 'is_read1',\n", - " 'is_read2',\n", - " 'is_reverse',\n", - " 'is_secondary',\n", - " 'is_supplementary',\n", - " 'is_unmapped',\n", - " 'isize',\n", - " 'mapping_quality',\n", - " 'mapq',\n", - " 'mate_is_forward',\n", - " 'mate_is_mapped',\n", - " 'mate_is_reverse',\n", - " 'mate_is_unmapped',\n", - " 'modified_bases',\n", - " 'modified_bases_forward',\n", - " 'mpos',\n", - " 'mrnm',\n", - " 'next_reference_id',\n", - " 'next_reference_name',\n", - " 'next_reference_start',\n", - " 'opt',\n", - " 'overlap',\n", - " 'pnext',\n", - " 'pos',\n", - " 'positions',\n", - " 'qend',\n", - " 'qlen',\n", - " 'qname',\n", - " 'qqual',\n", - " 'qstart',\n", - " 'qual',\n", - " 'query',\n", - " 'query_alignment_end',\n", - " 'query_alignment_length',\n", - " 'query_alignment_qualities',\n", - " 'query_alignment_sequence',\n", - " 'query_alignment_start',\n", - " 'query_length',\n", - " 'query_name',\n", - " 'query_qualities',\n", - " 'query_sequence',\n", - " 'reference_end',\n", - " 'reference_id',\n", - " 'reference_length',\n", - " 'reference_name',\n", - " 'reference_start',\n", - " 'rlen',\n", - " 'rname',\n", - " 'rnext',\n", - " 'seq',\n", - " 'setTag',\n", - " 'set_tag',\n", - " 'set_tags',\n", - " 'tags',\n", - " 'template_length',\n", - " 'tid',\n", - " 'tlen',\n", - " 'to_dict',\n", - " 'to_string',\n", - " 'tostring']" - ] - }, - "execution_count": 223, - "metadata": {}, - "output_type": "execute_result" - } - ], - 
"source": [ - "pysam.Align" - ] - }, - { - "cell_type": "code", - "execution_count": 267, - "id": "14d55b1d-ac5b-47ac-b3d5-c1194303924a", - "metadata": {}, - "outputs": [ - { - "ename": "TypeError", - "evalue": "sequence item 3: expected str instance, int found", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn [267], line 52\u001b[0m\n\u001b[1;32m 50\u001b[0m input_bam_file \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/Users/english/code/kfdphase/kdprs/test/GIABHG002.bam\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 51\u001b[0m output_bam_file \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124moutput.sam\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m---> 52\u001b[0m \u001b[43msplit_alignments\u001b[49m\u001b[43m(\u001b[49m\u001b[43minput_bam_file\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput_bam_file\u001b[49m\u001b[43m)\u001b[49m\n", - "Cell \u001b[0;32mIn [267], line 45\u001b[0m, in \u001b[0;36msplit_alignments\u001b[0;34m(input_bam_path, output_bam_path, max_length)\u001b[0m\n\u001b[1;32m 42\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m pysam\u001b[38;5;241m.\u001b[39mAlignmentFile(input_bam_path, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrb\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m input_bam, \\\n\u001b[1;32m 43\u001b[0m pysam\u001b[38;5;241m.\u001b[39mAlignmentFile(output_bam_path, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mw\u001b[39m\u001b[38;5;124m\"\u001b[39m, header\u001b[38;5;241m=\u001b[39minput_bam\u001b[38;5;241m.\u001b[39mheader) \u001b[38;5;28;01mas\u001b[39;00m output_bam:\n\u001b[1;32m 44\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m alignment \u001b[38;5;129;01min\u001b[39;00m input_bam:\n\u001b[0;32m---> 45\u001b[0m sub_alignments 
\u001b[38;5;241m=\u001b[39m \u001b[43msplit_alignment\u001b[49m\u001b[43m(\u001b[49m\u001b[43malignment\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmax_length\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minput_bam\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mheader\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 46\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m sub_align \u001b[38;5;129;01min\u001b[39;00m sub_alignments:\n\u001b[1;32m 47\u001b[0m output_bam\u001b[38;5;241m.\u001b[39mwrite(sub_align)\n", - "Cell \u001b[0;32mIn [267], line 26\u001b[0m, in \u001b[0;36msplit_alignment\u001b[0;34m(alignment, max_length, header)\u001b[0m\n\u001b[1;32m 23\u001b[0m cur_align[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mseq\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m curd[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mseq\u001b[39m\u001b[38;5;124m'\u001b[39m][start:start\u001b[38;5;241m+\u001b[39m\u001b[38;5;241m50\u001b[39m]\n\u001b[1;32m 24\u001b[0m cur_align[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mname\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcurd[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mname\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00midx\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m---> 26\u001b[0m sub_align \u001b[38;5;241m=\u001b[39m \u001b[43mpysam\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mAlignedSegment\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_dict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcur_align\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mheader\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mheader\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 27\u001b[0m sub_align\u001b[38;5;241m.\u001b[39mfrom_dict(cur_align)\n\u001b[1;32m 29\u001b[0m 
\u001b[38;5;66;03m#sub_align = pysam.AlignedSegment(header=header, \u001b[39;00m\n\u001b[1;32m 30\u001b[0m \u001b[38;5;66;03m# reference_name=alignment.reference_name,\u001b[39;00m\n\u001b[1;32m 31\u001b[0m \u001b[38;5;66;03m# pos=start)\u001b[39;00m\n", - "File \u001b[0;32m~/py/lib/python3.9/site-packages/pysam/libcalignedsegment.pyx:1142\u001b[0m, in \u001b[0;36mpysam.libcalignedsegment.AlignedSegment.from_dict\u001b[0;34m()\u001b[0m\n", - "\u001b[0;31mTypeError\u001b[0m: sequence item 3: expected str instance, int found" - ] - } - ], - "source": [ - "import pysam\n", - "import copy\n", - "def split_alignment(alignment, max_length, header):\n", - " \"\"\"\n", - " Split a single alignment into multiple alignments with a maximum length.\n", - " \"\"\"\n", - " if alignment.query_alignment_length <= max_length:\n", - " return [alignment]\n", - " \n", - " curd = alignment.to_dict()\n", - " sub_alignments = []\n", - " num_splits = (alignment.query_alignment_length + max_length - 1) // max_length\n", - " for idx,i in enumerate(range(num_splits)):\n", - " start = i * max_length\n", - " end = min(start + max_length, alignment.query_alignment_length)\n", - "\n", - " # dict_keys(['name', 'flag', 'ref_name', 'ref_pos', 'map_quality', 'cigar', 'next_ref_name', \n", - " # 'next_ref_pos', 'length', 'seq', 'qual', 'tags'])\n", - "\n", - " cur_align = copy.copy(curd)\n", - " cur_align['ref_pos'] = start\n", - " cur_align['cigar'] = None\n", - " cur_align['seq'] = curd['seq'][start:start+50]\n", - " cur_align['name'] = f\"{curd['name']}_{idx}\"\n", - "\n", - " sub_align = pysam.AlignedSegment.from_dict(cur_align, header=header)\n", - " sub_align.from_dict(cur_align)\n", - "\n", - " #sub_align = pysam.AlignedSegment(header=header, \n", - " # reference_name=alignment.reference_name,\n", - " # pos=start)\n", - " if alignment.has_tag(\"MD\"):\n", - " sub_align.set_tag(\"MD\", alignment.get_tag(\"MD\")[start:end])\n", - " print(sub_align)\n", - " sub_alignments.append(sub_align)\n", - 
" return sub_alignments\n", - "\n", - "def split_alignments(input_bam_path, output_bam_path, max_length=50000):\n", - " \"\"\"\n", - " Split alignments longer than max_length into multiple alignments.\n", - " \"\"\"\n", - " with pysam.AlignmentFile(input_bam_path, \"rb\") as input_bam, \\\n", - " pysam.AlignmentFile(output_bam_path, \"w\", header=input_bam.header) as output_bam:\n", - " for alignment in input_bam:\n", - " sub_alignments = split_alignment(alignment, max_length, input_bam.header)\n", - " for sub_align in sub_alignments:\n", - " output_bam.write(sub_align)\n", - "\n", - "# Example usage\n", - "input_bam_file = \"/Users/english/code/kfdphase/kdprs/test/GIABHG002.bam\"\n", - "output_bam_file = \"output.sam\"\n", - "split_alignments(input_bam_file, output_bam_file)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 262, - "id": "183f7a1c-9a5c-4a28-9af6-7c1b60296cc3", - "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'sub_align' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn [262], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m help(\u001b[43msub_align\u001b[49m)\n", - "\u001b[0;31mNameError\u001b[0m: name 'sub_align' is not defined" - ] - } - ], - "source": [ - "help(sub_align)" - ] - }, - { - "cell_type": "code", - "execution_count": 289, - "id": "c39e5c24-cb31-4d69-8221-1fffd29febf7", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['name', 'flag', 'ref_name', 'ref_pos', 'map_quality', 'cigar', 'next_ref_name', 'next_ref_pos', 'length', 'seq', 'qual', 'tags'])" - ] - }, - "execution_count": 289, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "x.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 290, - "id": 
"e29d5248-5e28-4a36-943e-570aea03ecd7", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "18165H610M3I284M236D917M2I333M3D15M1D87M1I1024M1I214\n", - "2048 *\n" - ] - } - ], - "source": [ - "x = aln.to_dict()\n", - "print(x['cigar'][:52])\n", - "print(x['flag'], x['qual'])\n" - ] - }, - { - "cell_type": "code", - "execution_count": 286, - "id": "3e21d70c-14c1-4ca4-9d80-7894aaa7aa1e", - "metadata": {}, - "outputs": [], - "source": [ - "x = aln.to_dict()\n", - "x['seq'] = x['seq'][:10]\n", - "x['cigar'] = \"10M\" # x['cigar'][:10]\n", - "x['tags'] = []" - ] - }, - { - "cell_type": "code", - "execution_count": 288, - "id": "f837cfe9-e2ee-4510-8fdc-edb2420b84be", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "chr20_PATERNAL\t2048\t#0\t89154\t59\t10M\t*\t0\t0\tACACTGGTTG\tNone\t[]\n" - ] - } - ], - "source": [ - "print(pysam.AlignedSegment.from_dict(x, s.header))" - ] - }, - { - "cell_type": "code", - "execution_count": 264, - "id": "475966aa-3a4a-43c7-b55f-769d8d191e66", - "metadata": {}, - "outputs": [], - "source": [ - "s = pysam.AlignmentFile(input_bam_file, \"rb\")" - ] - }, - { - "cell_type": "code", - "execution_count": 266, - "id": "41956276-0288-4a1f-81f8-fa07b9628329", - "metadata": {}, - "outputs": [ - { - "ename": "TypeError", - "evalue": "from_dict() takes exactly 2 positional arguments (0 given)", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn [266], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mpysam\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mAlignedSegment\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_dict\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", - "File 
\u001b[0;32m~/py/lib/python3.9/site-packages/pysam/libcalignedsegment.pyx:1128\u001b[0m, in \u001b[0;36mpysam.libcalignedsegment.AlignedSegment.from_dict\u001b[0;34m()\u001b[0m\n", - "\u001b[0;31mTypeError\u001b[0m: from_dict() takes exactly 2 positional arguments (0 given)" - ] - } - ], - "source": [ - "pysam.AlignedSegment.from_dict()" - ] - }, - { - "cell_type": "code", - "execution_count": 248, - "id": "a6c8310b-446a-4055-b846-96a576ca2ad8", - "metadata": {}, - "outputs": [ - { - "ename": "TypeError", - "evalue": "from_dict() takes exactly 2 positional arguments (1 given)", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn [248], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m x[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mseq\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[43maln\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_dict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/py/lib/python3.9/site-packages/pysam/libcalignedsegment.pyx:1128\u001b[0m, in \u001b[0;36mpysam.libcalignedsegment.AlignedSegment.from_dict\u001b[0;34m()\u001b[0m\n", - "\u001b[0;31mTypeError\u001b[0m: from_dict() takes exactly 2 positional arguments (1 given)" - ] - } - ], - "source": [ - "x['seq'] = None\n", - "aln.from_dict(x, input)" - ] - }, - { - "cell_type": "code", - "execution_count": 249, - "id": "4ffe19ee-0a10-4c5a-9ac9-d39f7b9768ef", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Help on method from_dict in module pysam.libcalignedsegment:\n", - "\n", - "from_dict(sam_dict, header) method of builtins.type instance\n", - " AlignedSegment.from_dict(cls, sam_dict, AlignmentHeader header)\n", - " parses 
a dictionary representation of the aligned segment.\n", - " \n", - " Parameters\n", - " ----------\n", - " sam_dict:\n", - " dictionary of alignment values, keys corresponding to output from\n", - " :meth:`todict()`.\n", - "\n" - ] - } - ], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c2408709-5bf1-4107-92a0-fd36d07320af", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { From cd79a9b66b4aa08eca055142fa84e695c5f3bf07 Mon Sep 17 00:00:00 2001 From: Adam English Date: Fri, 24 May 2024 20:01:19 -0400 Subject: [PATCH 17/47] closer to moving over I just have to move to the new Samples or whatever instead of Genotypes --- Cargo.lock | 23 +++------ Cargo.toml | 2 +- src/kplib/annotator.rs | 23 +++++---- src/kplib/regions.rs | 4 +- src/kplib/vargraph.rs | 2 +- src/kplib/vcf_traits.rs | 111 ++++++++++------------------------------ src/kplib/vcfreader.rs | 14 ++--- src/kplib/vcfwriter.rs | 85 +++++++++++++++--------------- src/main.rs | 3 +- 9 files changed, 105 insertions(+), 162 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5cf4171..392ae79 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -468,7 +468,6 @@ dependencies = [ "noodles-vcf", "ordered-float", "page_size", - "peak_alloc", "petgraph", "pretty_env_logger", "rust-htslib", @@ -552,9 +551,9 @@ dependencies = [ [[package]] name = "noodles-bgzf" -version = "0.29.0" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7dba1c82e9f92c00b23538359e5d191dff7ccb300cf659ee3a835af65c3cd143" +checksum = "13f54d4840fd26ed94103ded9524aa5fdd757255a556f24653d162c0a45c47e8" dependencies = [ "byteorder", "bytes", @@ -573,9 +572,9 @@ dependencies = [ [[package]] name = "noodles-csi" -version = "0.34.0" +version = "0.35.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad09737d94ec2674361219fb3d46a81561a15773585805de807cab323a15648a" +checksum = 
"beb1618ca2aa88662d387197a188686105d6b5e25f6959c52b766276cbfc4620" dependencies = [ "bit-vec", "byteorder", @@ -586,9 +585,9 @@ dependencies = [ [[package]] name = "noodles-tabix" -version = "0.40.0" +version = "0.41.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f8da8182c56b64d28d0330ce209857ba2a5a4981c6925838e4d4ffeea82db09" +checksum = "0cc110f78cb406f69f42c482d1986526c590b7295f37f0e37f1fc380413400ef" dependencies = [ "bit-vec", "byteorder", @@ -600,9 +599,9 @@ dependencies = [ [[package]] name = "noodles-vcf" -version = "0.56.0" +version = "0.57.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2576e5b1e12d729c93d30ba25676b4a3efb19134f14f30fa30ae70a064d12eb2" +checksum = "4df6e162ec2e898581b5ccf5e8972e376a7c41807061b66152280dea2c53a989" dependencies = [ "indexmap", "memchr", @@ -669,12 +668,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "peak_alloc" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29c4e8e2dd832fd76346468f822e4e600d30ba4e5aa545a128abf12cfae7ea3e" - [[package]] name = "percent-encoding" version = "2.3.1" diff --git a/Cargo.toml b/Cargo.toml index fc4b1d7..18742c8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,7 +12,7 @@ indicatif = "0.17.8" itertools = { version = "0.12.1" } lazy_static = "1.4.0" log = { version = "0.4", features = ["std", "serde"] } -noodles-vcf = { version = "0.56.0" } +noodles-vcf = { version = "0.57.0" } ordered-float = { version = "4.0", default-features = false } page_size = "0.6.0" petgraph = { version = "0.6.2" } diff --git a/src/kplib/annotator.rs b/src/kplib/annotator.rs index 57d3d47..2011ad3 100644 --- a/src/kplib/annotator.rs +++ b/src/kplib/annotator.rs @@ -3,7 +3,10 @@ use bitflags::bitflags; use petgraph::graph::NodeIndex; -use noodles_vcf::{self as vcf, record::sample::Value}; +use noodles_vcf::{ + self as vcf, + variant::record_buf::samples::sample::value::{Array, Value}, +}; bitflags! 
{ pub struct FiltFlags: u32 { @@ -50,15 +53,15 @@ impl GenotypeAnno { // These are tied to VcfWriter.keys pub fn make_fields(&self, phase_group: i32) -> Vec> { vec![ - Some(Value::from(self.gt.clone())), - Some(Value::from(self.filt.bits() as i32)), - Some(Value::from(self.sq)), - Some(Value::from(self.gq)), - Some(Value::from(phase_group)), - Some(Value::from(self.dp)), - Some(Value::from(self.ad.clone())), - Some(Value::from(self.zs.clone())), - Some(Value::from(self.ss.clone())), + Some(Value::Genotype(self.gt.parse().expect("Should have made GT correctly"))), + Some(Value::Integer(self.filt.bits() as i32)), + Some(Value::Integer(self.sq)), + Some(Value::Integer(self.gq)), + Some(Value::Integer(phase_group)), + Some(Value::Integer(self.dp)), + Some(Value::Array(Array::Integer(self.ad.clone()))), + Some(Value::Array(Array::Integer(self.zs.clone()))), + Some(Value::Array(Array::Integer(self.ss.clone()))), ] } } diff --git a/src/kplib/regions.rs b/src/kplib/regions.rs index 13bedd7..f1e960f 100644 --- a/src/kplib/regions.rs +++ b/src/kplib/regions.rs @@ -1,9 +1,9 @@ use indexmap::IndexMap; -use noodles_vcf::header::record::value::{map::contig::Name, map::Contig, Map}; +use noodles_vcf::header::record::value::{map::contig::Contig, Map}; use std::collections::HashMap; use std::collections::VecDeque; -pub type ContigMap = IndexMap>; +pub type ContigMap = IndexMap>; pub type Regions = HashMap>; use crate::kplib::BedParser; diff --git a/src/kplib/vargraph.rs b/src/kplib/vargraph.rs index 7b748f2..076fe47 100644 --- a/src/kplib/vargraph.rs +++ b/src/kplib/vargraph.rs @@ -113,7 +113,7 @@ impl Variants { /// Again, TR aware, we need to set the bounds for doing the pileup /// to the TR boundaries. 
fn get_region(entries: &[vcf::Record]) -> (String, u64, u64) { - let chrom = entries[0].chromosome().to_string(); + let chrom = entries[0].reference_sequence_name().to_string(); let (min_start, max_end) = entries.iter().fold((u64::MAX, 0), |acc, e| { let (start, end) = e.boundaries(); diff --git a/src/kplib/vcf_traits.rs b/src/kplib/vcf_traits.rs index f49d9e5..4bc8c02 100644 --- a/src/kplib/vcf_traits.rs +++ b/src/kplib/vcf_traits.rs @@ -1,10 +1,5 @@ use crate::kplib::seq_to_kmer; -use noodles_vcf::{ - self as vcf, - variant::record::info::field::value::Value, - variant::record::info::field::key::Key, -}; -use std::cmp::Ordering; +use noodles_vcf::{self as vcf, variant::record::AlternateBases}; use std::str::FromStr; #[derive(Debug, Hash, PartialEq, Eq, Clone)] @@ -38,20 +33,16 @@ pub trait KdpVcf { fn boundaries(&self) -> (u64, u64); fn size(&self) -> u64; fn is_filtered(&self) -> bool; - fn variant_type(&self) -> Svtype; fn is_symbolic(&self) -> bool; fn valid_alt(&self) -> bool; + fn get_alt(&self) -> String; } impl KdpVcf for vcf::Record { /// Convert variant sequence to Kfeat fn to_kfeat(&self, kmer: u8, maxhom: usize) -> (Vec, i64) { let ref_seq = self.reference_bases().to_string(); - let alt_seq = self - .alternate_bases() - .first() - .expect("Can only work on sequence resolved variants") - .to_string(); + let alt_seq = self.get_alt(); let size = alt_seq.len() as i64 - ref_seq.len() as i64; @@ -69,37 +60,18 @@ impl KdpVcf for vcf::Record { /// start and end positions of an entry fn boundaries(&self) -> (u64, u64) { - let start: u64 = u64::try_from(usize::from(self.position())).unwrap() - 1; - let end: u64 = u64::try_from(usize::from(self.end().expect("No Variant End"))).unwrap(); + let start = match self.variant_start().and_then(|res| res.ok()) { + Some(pos) => pos.get() as u64 - 1, + None => panic!("Variant doesn't have a start"), + }; + let end: u64 = start + self.reference_bases().len() as u64; (start, end) } /// grab entry's length from either SVLEN 
field or infer it from the REF ALT fields fn size(&self) -> u64 { - let svlen = self - .info() - .get(&Key::from_str("SVLEN").unwrap_or_else(|_| panic!("No SVLEN INFO"))); - - if let Some(Some(Value::Integer(svlen))) = svlen { - return svlen.unsigned_abs() as u64; - } else if let Some(Some(Value::Array(field::value::Array::Integer(svlen)))) = svlen { - return svlen - .first() - .unwrap_or_else(|| panic!("Bad SVLEN")) - .unwrap() - .unsigned_abs() as u64; - } - let r_len: u64 = self.reference_bases().len() as u64; - let a_len: u64 = if self.is_symbolic() { - let (start, end) = self.boundaries(); - start.abs_diff(end) + 1 - } else { - match self.alternate_bases().first() { - Some(alt) => alt.len(), - None => 0 - } - }; + let a_len: u64 = self.get_alt().len() as u64; if r_len == a_len { if r_len == 1 { @@ -114,61 +86,34 @@ impl KdpVcf for vcf::Record { /// checks if an entry's FILTER is '.' or PASS, true if it is filtered fn is_filtered(&self) -> bool { - match &self.filters() { - Some(map) => **map != Filters::Pass, - None => false, - } + let binding = self.filters(); + let mfilt = binding.as_ref(); + mfilt == "." || mfilt == "PASS" } /// Alternate sequence isn't '.' or '*' fn valid_alt(&self) -> bool { - match self.alternate_bases().first() { - Some(alt) => { - let alt = alt.to_string(); - alt != "." 
&& alt != "*" && !alt.contains(':') - } - _ => false, - } - } - - /// return the Svtype of a vcf entry - fn variant_type(&self) -> Svtype { - match self - .info() - .get(&Key::from_str("SVTYPE").expect("Unable to make key")) - { - // INFO/SVTYPE - Some(Some(Value::String(svtype))) => svtype.parse().expect("Bad SVTYPE"), - Some(Some(Value::Array(field::value::Array::String(svtype)))) => svtype - .first() - .cloned() - .unwrap_or_else(|| panic!("Bad SVTYPE")) - .expect("parsed") - .parse() - .unwrap(), - // Direct from REF/ALT - _ => match self.alternate_bases().first() { - Some(allele::Allele::Bases(alt)) => { - match alt.len().cmp(&self.reference_bases().len()) { - Ordering::Greater => Svtype::Ins, - Ordering::Less => Svtype::Del, - Ordering::Equal if alt.len() == 1 => Svtype::Snp, - _ => Svtype::Unk, - } - } - Some(allele::Allele::Symbol(alt)) => Svtype::from_str(&alt.to_string()) - .unwrap_or_else(|_| panic!("Bad Symbolic Alt")), - _ => Svtype::Unk, - }, - } + let alt = self.get_alt(); + alt != "." && alt != "*" && !alt.contains(':') } /// Checks if its a symbolic allele e.g. /// Returns false if its a monozygotic reference fn is_symbolic(&self) -> bool { - match self.alternate_bases().first() { - Some(alt) => alt.contains('<'), - None => false + self.get_alt().contains('<') + } + + /// Returns the first alternate allele or a blank string with '.' 
if there isn't any + fn get_alt(&self) -> String { + let alts = self.alternate_bases(); + match alts.len() { + 0 => ".".to_string(), + _ => alts + .iter() + .next() + .expect("I just checked") + .unwrap() + .to_string(), // I don't like all this String when str should be simplier } } } diff --git a/src/kplib/vcfreader.rs b/src/kplib/vcfreader.rs index 8994e02..eda268f 100644 --- a/src/kplib/vcfreader.rs +++ b/src/kplib/vcfreader.rs @@ -8,7 +8,7 @@ use std::io::BufRead; /// Takes a vcf and filtering parameters to create in iterable which will /// return chunks of variants in the same neighborhood pub struct VcfChunker { - pub m_vcf: vcf::reader::Reader, + pub m_vcf: vcf::io::Reader, pub m_header: vcf::Header, regions: Regions, params: KDParams, @@ -55,12 +55,12 @@ impl VcfChunker { return false; } - let size = entry.size(); - if self.params.sizemin > size || self.params.sizemax < size { + if !entry.valid_alt() { return false; } - if !entry.valid_alt() { + let size = entry.size(); + if self.params.sizemin > size || self.params.sizemax < size { return false; } @@ -68,7 +68,7 @@ impl VcfChunker { let mut default = VecDeque::new(); let m_coords = self .regions - .get_mut(&entry.chromosome().to_string()) + .get_mut(&entry.reference_sequence_name().to_string()) .unwrap_or(&mut default); if m_coords.is_empty() { @@ -102,7 +102,7 @@ impl VcfChunker { let mut entry = vcf::Record::default(); loop { - match self.m_vcf.read_record(&self.m_header, &mut entry) { + match self.m_vcf.read_record(&mut entry) { Ok(0) => return None, Err(e) => { error!("skipping invalid VCF entry {:?}", e); @@ -132,7 +132,7 @@ impl VcfChunker { /// cur_end but also the TR catalog. We want to chunk all TR changes together /// regardless of their distance. 
fn entry_in_chunk(&mut self, entry: &vcf::Record) -> bool { - let check_chrom = entry.chromosome().to_string(); + let check_chrom = entry.reference_sequence_name().to_string(); let new_chrom = !self.cur_chrom.is_empty() && check_chrom != self.cur_chrom; let (start, end) = entry.boundaries(); diff --git a/src/kplib/vcfwriter.rs b/src/kplib/vcfwriter.rs index 17589fd..27fbc90 100644 --- a/src/kplib/vcfwriter.rs +++ b/src/kplib/vcfwriter.rs @@ -5,10 +5,8 @@ use std::io::BufWriter; use std::path::PathBuf; use noodles_vcf::{ - self as vcf, - header::record::value::map::format, - header::record::value::Map, - record::genotypes::{keys::Keys, sample::Value, Genotypes}, + self as vcf, header::record::value::map::format, header::record::value::Map, + header::record::Value, variant::record_buf::samples::keys::Keys, }; pub struct VcfWriter { @@ -47,81 +45,84 @@ impl VcfWriter { // Setup FORMAT header definitions // Overwrites existing definitions let all_formats = header.formats_mut(); - - let keys: Keys = "GT:FT:SQ:GQ:PG:DP:AD:ZS:SS".parse().unwrap(); + let new_fmts: Vec = "GT:FT:SQ:GQ:PG:DP:AD:ZS:SS" + .split(':') + .map(String::from) + .collect(); + let keys: Keys = Keys::from_iter(new_fmts); // GT - let gtid = keys[0].clone(); - let mut gtfmt = Map::::from(>id); - *gtfmt.number_mut() = vcf::header::Number::Count(1); + let gtid = "GT"; + let mut gtfmt = Map::::from(gtid); + *gtfmt.number_mut() = format::Number::Count(1); *gtfmt.type_mut() = format::Type::String; *gtfmt.description_mut() = "Kanplug genotype".to_string(); - all_formats.insert(gtid, gtfmt); + all_formats.insert(gtid.to_string(), gtfmt); // FT - let ftid = keys[1].clone(); - let mut ftfmt = Map::::from(&ftid); - *ftfmt.number_mut() = vcf::header::Number::Count(1); + let ftid = "FT"; + let mut ftfmt = Map::::from(ftid); + *ftfmt.number_mut() = format::Number::Count(1); *ftfmt.type_mut() = format::Type::Integer; *ftfmt.description_mut() = "Kanpig filter".to_string(); - all_formats.insert(ftid, ftfmt); + 
all_formats.insert(ftid.to_string(), ftfmt); // SQ - let sqid = keys[2].clone(); - let mut sqfmt = Map::::from(&sqid); - *sqfmt.number_mut() = vcf::header::Number::Count(1); + let sqid = "SQ"; + let mut sqfmt = Map::::from(sqid); + *sqfmt.number_mut() = format::Number::Count(1); *sqfmt.type_mut() = format::Type::Integer; *sqfmt.description_mut() = "Phred scaled quality of sample being non-ref at this variant".to_string(); - all_formats.insert(sqid, sqfmt); + all_formats.insert(sqid.to_string(), sqfmt); // GQ - let gqid = keys[3].clone(); - let mut gqfmt = Map::::from(&gqid); - *gqfmt.number_mut() = vcf::header::Number::Count(1); + let gqid = "GQ"; + let mut gqfmt = Map::::from(gqid); + *gqfmt.number_mut() = format::Number::Count(1); *gqfmt.type_mut() = format::Type::Integer; *gqfmt.description_mut() = "Phred scaled quality of genotype".to_string(); - all_formats.insert(gqid, gqfmt); + all_formats.insert(gqid.to_string(), gqfmt); // PG - let pgid = keys[4].clone(); - let mut pgfmt = Map::::from(&pgid); - *pgfmt.number_mut() = vcf::header::Number::Count(1); + let pgid = "PG"; + let mut pgfmt = Map::::from(pgid); + *pgfmt.number_mut() = format::Number::Count(1); *pgfmt.type_mut() = format::Type::Integer; *pgfmt.description_mut() = "Local phase group of entries".to_string(); - all_formats.insert(pgid, pgfmt); + all_formats.insert(pgid.to_string(), pgfmt); // DP - let dpid = keys[5].clone(); - let mut dpfmt = Map::::from(&dpid); - *dpfmt.number_mut() = vcf::header::Number::Count(1); + let dpid = "DP"; + let mut dpfmt = Map::::from(dpid); + *dpfmt.number_mut() = format::Number::Count(1); *dpfmt.type_mut() = format::Type::Integer; *dpfmt.description_mut() = "Coverage over region".to_string(); - all_formats.insert(dpid, dpfmt); + all_formats.insert(dpid.to_string(), dpfmt); // AD - let adid = keys[6].clone(); - let mut adfmt = Map::::from(&adid); - *adfmt.number_mut() = vcf::header::Number::R; + let adid = "AD"; + let mut adfmt = Map::::from(adid); + *adfmt.number_mut() = 
format::Number::ReferenceAlternateBases; *adfmt.type_mut() = format::Type::Integer; *adfmt.description_mut() = "Coverage for reference and alternate alleles".to_string(); - all_formats.insert(adid, adfmt); + all_formats.insert(adid.to_string(), adfmt); // ZS - let zsid = keys[7].clone(); - let mut zsfmt = Map::::from(&zsid); - *zsfmt.number_mut() = vcf::header::Number::R; + let zsid = "ZS"; + let mut zsfmt = Map::::from(zsid); + *zsfmt.number_mut() = format::Number::ReferenceAlternateBases; *zsfmt.type_mut() = format::Type::Integer; *zsfmt.description_mut() = "Size similarity of path to entry".to_string(); - all_formats.insert(zsid, zsfmt); + all_formats.insert(zsid.to_string(), zsfmt); // SS - let ssid = keys[8].clone(); - let mut ssfmt = Map::::from(&ssid); - *ssfmt.number_mut() = vcf::header::Number::R; + let ssid = "SS"; + let mut ssfmt = Map::::from(ssid); + *ssfmt.number_mut() = format::Number::ReferenceAlternateBases; *ssfmt.type_mut() = format::Type::Integer; *ssfmt.description_mut() = "Sequence similarity of path to entry".to_string(); - all_formats.insert(ssid, ssfmt); + all_formats.insert(ssid.to_string(), ssfmt); // Ready to make files let out_buf = BufWriter::with_capacity( diff --git a/src/main.rs b/src/main.rs index acbc734..045cd3d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -60,6 +60,7 @@ fn main() { let m_receiver = task_receiver.clone(); let m_result_sender = result_sender.clone(); let m_ploidy = ploidy.clone(); + let m_header = input_header.clone(); thread::spawn(move || { let mut m_bam = @@ -69,7 +70,7 @@ fn main() { Ok(None) | Err(_) => break, Ok(Some(chunk)) => { let mut m_graph = - Variants::new(chunk, m_args.kd.kmer, m_args.kd.maxhom); + Variants::new(chunk, m_args.kd.kmer, m_args.kd.maxhom, &m_header); let ploidy = m_ploidy.get_ploidy(&m_graph.chrom, m_graph.start); // For zero, we don't have to waste time going into the bam From 66e0f451cdacf2b92ae1fcfdd00ab27c6e0c4518 Mon Sep 17 00:00:00 2001 From: Adam English Date: Fri, 24 May 2024 
21:39:43 -0400 Subject: [PATCH 18/47] this might not be possible there is no more genotypes_mut. genotypes is now samples, but no samples_mut. I'm going to have to move to rust-htslib --- src/kplib/vcfwriter.rs | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/src/kplib/vcfwriter.rs b/src/kplib/vcfwriter.rs index 27fbc90..61fc5b6 100644 --- a/src/kplib/vcfwriter.rs +++ b/src/kplib/vcfwriter.rs @@ -6,7 +6,7 @@ use std::path::PathBuf; use noodles_vcf::{ self as vcf, header::record::value::map::format, header::record::value::Map, - header::record::Value, variant::record_buf::samples::keys::Keys, + variant::record_buf::samples::{Sample, keys::Keys}, }; pub struct VcfWriter { @@ -142,20 +142,9 @@ impl VcfWriter { pub fn anno_write(&mut self, mut annot: GenotypeAnno, phase_group: i32) { *self.gtcounts.entry(annot.gt_state).or_insert(0) += 1; - *annot.entry.genotypes_mut() = - Genotypes::new(self.keys.clone(), vec![annot.make_fields(phase_group)]); + let sample = annot.entry.samples().get_index(0).unwrap(); + //Sample::new(&self.keys, annot.make_fields(phase_group).as_slice()); let _result = self.writer.write_record(&self.header, &annot.entry); } - - pub fn __write_entry(&mut self, mut entry: vcf::Record) { - *entry.genotypes_mut() = Genotypes::new( - "GT".parse().unwrap(), - vec![vec![ - Some(Value::from("./.")), // GT - ]], - ); - *self.gtcounts.entry(GTstate::Non).or_insert(0) += 1; - let _result = self.writer.write_record(&self.header, &entry); - } } From 76c08bbc071a193c28cd4709cfea89559518c315 Mon Sep 17 00:00:00 2001 From: Adam English Date: Sat, 25 May 2024 00:16:56 -0400 Subject: [PATCH 19/47] updating noodles .......... 
--- experiments/bedtest.sh | 4 ++-- src/kplib/annotator.rs | 16 +++++++++------- src/kplib/vargraph.rs | 8 ++++---- src/kplib/vcf_traits.rs | 32 ++++++++++++-------------------- src/kplib/vcfreader.rs | 18 +++++++++--------- src/kplib/vcfwriter.rs | 13 ++++++++----- src/main.rs | 8 +++----- 7 files changed, 47 insertions(+), 52 deletions(-) diff --git a/experiments/bedtest.sh b/experiments/bedtest.sh index 0d644f1..44e5417 100644 --- a/experiments/bedtest.sh +++ b/experiments/bedtest.sh @@ -3,7 +3,7 @@ bed=test_rs/test.chr20.bed create() { #../target/release/kanpig \ - time cargo run -- \ + time cargo run --release -- \ --input test_rs/test2.vcf.gz \ --bam /Users/english/code/kanpig/experiments/test_rs/NA24385.chr20.bam \ --reference /Users/english/code/references/grch38/GRCh38_1kg_mainchrs.fa \ @@ -11,7 +11,7 @@ create() { --sizesim 0.95 --seqsim 0.90 --threads 4 \ --maxpaths 20000 --mapq 5 --hapsim 0.98 \ --chunksize 100 --maxhom 5 \ - --sample doesthiswork --mem 1 \ + --sample doesthiswork \ -o test_rs/hc.vcf --bed $bed # --bed /Users/english/code/kanpig/test/GRCh38_HG002-T2TQ100-V1.0_stvar.benchmark.bed \ # --bam /Users/english/code/kanpig/experiments/test_rs/GIABHG002.bam \ diff --git a/src/kplib/annotator.rs b/src/kplib/annotator.rs index 2011ad3..0a8b65a 100644 --- a/src/kplib/annotator.rs +++ b/src/kplib/annotator.rs @@ -4,7 +4,7 @@ use bitflags::bitflags; use petgraph::graph::NodeIndex; use noodles_vcf::{ - self as vcf, + self as vcf, variant::record_buf::samples::sample::value::{Array, Value}, }; @@ -23,7 +23,7 @@ bitflags! 
{ //Format Integer Type Number = G type IntG = Vec>; pub struct GenotypeAnno { - pub entry: vcf::Record, + pub entry: vcf::variant::RecordBuf, pub gt: String, pub filt: FiltFlags, pub sq: i32, @@ -37,7 +37,7 @@ pub struct GenotypeAnno { impl GenotypeAnno { pub fn new( - entry: vcf::Record, + entry: vcf::variant::RecordBuf, var_idx: &NodeIndex, paths: &[PathScore], coverage: u64, @@ -53,7 +53,9 @@ impl GenotypeAnno { // These are tied to VcfWriter.keys pub fn make_fields(&self, phase_group: i32) -> Vec> { vec![ - Some(Value::Genotype(self.gt.parse().expect("Should have made GT correctly"))), + Some(Value::Genotype( + self.gt.parse().expect("Should have made GT correctly"), + )), Some(Value::Integer(self.filt.bits() as i32)), Some(Value::Integer(self.sq)), Some(Value::Integer(self.gq)), @@ -68,7 +70,7 @@ impl GenotypeAnno { /// For annotating a variant in diploid regions fn diploid( - entry: vcf::Record, + entry: vcf::variant::RecordBuf, var_idx: &NodeIndex, paths: &[PathScore], coverage: u64, @@ -146,7 +148,7 @@ fn diploid( } /// For annotating a variant in a zero ploidy region -fn zero(entry: vcf::Record, coverage: u64) -> GenotypeAnno { +fn zero(entry: vcf::variant::RecordBuf, coverage: u64) -> GenotypeAnno { GenotypeAnno { entry, gt: "./.".to_string(), @@ -163,7 +165,7 @@ fn zero(entry: vcf::Record, coverage: u64) -> GenotypeAnno { /// For annotating a variant in a one ploidy region fn haploid( - entry: vcf::Record, + entry: vcf::variant::RecordBuf, var_idx: &NodeIndex, paths: &[PathScore], coverage: u64, diff --git a/src/kplib/vargraph.rs b/src/kplib/vargraph.rs index 076fe47..7ff6b35 100644 --- a/src/kplib/vargraph.rs +++ b/src/kplib/vargraph.rs @@ -17,12 +17,12 @@ pub struct VarNode { pub coverage: (Option, Option), pub seqsim: (Option, Option), pub sizesim: (Option, Option), - pub entry: Option, + pub entry: Option, pub kfeat: Vec, } impl VarNode { - pub fn new(entry: vcf::Record, kmer: u8, maxhom: usize) -> Self { + pub fn new(entry: vcf::variant::RecordBuf, 
kmer: u8, maxhom: usize) -> Self { // Want to make a hash for these names for debugging, I think. let name = "".to_string(); let (start, end) = entry.boundaries(); @@ -71,7 +71,7 @@ pub struct Variants { /// The graph has an upstream 'src' node that point to every variant node /// The graph has a dnstream 'snk' node that is pointed to by every variant node and 'src' impl Variants { - pub fn new(mut variants: Vec, kmer: u8, maxhom: usize) -> Self { + pub fn new(mut variants: Vec, kmer: u8, maxhom: usize) -> Self { if variants.is_empty() { panic!("Cannot create a graph from no variants"); } @@ -112,7 +112,7 @@ impl Variants { /// Again, TR aware, we need to set the bounds for doing the pileup /// to the TR boundaries. - fn get_region(entries: &[vcf::Record]) -> (String, u64, u64) { + fn get_region(entries: &[vcf::variant::RecordBuf]) -> (String, u64, u64) { let chrom = entries[0].reference_sequence_name().to_string(); let (min_start, max_end) = entries.iter().fold((u64::MAX, 0), |acc, e| { diff --git a/src/kplib/vcf_traits.rs b/src/kplib/vcf_traits.rs index 4bc8c02..392b3d9 100644 --- a/src/kplib/vcf_traits.rs +++ b/src/kplib/vcf_traits.rs @@ -1,5 +1,5 @@ use crate::kplib::seq_to_kmer; -use noodles_vcf::{self as vcf, variant::record::AlternateBases}; +use noodles_vcf::{self as vcf, variant::record::AlternateBases, variant::record::Filters}; use std::str::FromStr; #[derive(Debug, Hash, PartialEq, Eq, Clone)] @@ -32,13 +32,12 @@ pub trait KdpVcf { fn to_kfeat(&self, kmer: u8, maxhom: usize) -> (Vec, i64); fn boundaries(&self) -> (u64, u64); fn size(&self) -> u64; - fn is_filtered(&self) -> bool; - fn is_symbolic(&self) -> bool; + fn is_filtered(&self, header: &vcf::Header) -> bool; fn valid_alt(&self) -> bool; fn get_alt(&self) -> String; } -impl KdpVcf for vcf::Record { +impl KdpVcf for vcf::variant::RecordBuf { /// Convert variant sequence to Kfeat fn to_kfeat(&self, kmer: u8, maxhom: usize) -> (Vec, i64) { let ref_seq = self.reference_bases().to_string(); @@ -60,10 
+59,7 @@ impl KdpVcf for vcf::Record { /// start and end positions of an entry fn boundaries(&self) -> (u64, u64) { - let start = match self.variant_start().and_then(|res| res.ok()) { - Some(pos) => pos.get() as u64 - 1, - None => panic!("Variant doesn't have a start"), - }; + let start: u64 = u64::try_from(usize::from(self.variant_start().unwrap())).unwrap() - 1; let end: u64 = start + self.reference_bases().len() as u64; (start, end) } @@ -85,22 +81,18 @@ impl KdpVcf for vcf::Record { } /// checks if an entry's FILTER is '.' or PASS, true if it is filtered - fn is_filtered(&self) -> bool { - let binding = self.filters(); - let mfilt = binding.as_ref(); - mfilt == "." || mfilt == "PASS" + fn is_filtered(&self, header: &vcf::Header) -> bool { + !(self.filters().is_empty() + || self.filters().iter(header).any(|res| match res { + Ok(s) => s == "PASS", + Err(_) => false, + })) } - /// Alternate sequence isn't '.' or '*' + /// Alternate sequence isn't '.' or '*' or bnd or symbolic fn valid_alt(&self) -> bool { let alt = self.get_alt(); - alt != "." && alt != "*" && !alt.contains(':') - } - - /// Checks if its a symbolic allele e.g. - /// Returns false if its a monozygotic reference - fn is_symbolic(&self) -> bool { - self.get_alt().contains('<') + alt != "." && alt != "*" && !alt.contains(':') && !alt.contains('<') } /// Returns the first alternate allele or a blank string with '.' if there isn't any diff --git a/src/kplib/vcfreader.rs b/src/kplib/vcfreader.rs index eda268f..e94356e 100644 --- a/src/kplib/vcfreader.rs +++ b/src/kplib/vcfreader.rs @@ -18,7 +18,7 @@ pub struct VcfChunker { // When iterating, we will encounter a variant that no longer // fits in the current chunk. 
We need to hold on to it for the // next chunk - hold_entry: Option, + hold_entry: Option, pub chunk_count: u64, pub call_count: u64, pub skip_count: u64, @@ -50,8 +50,8 @@ impl VcfChunker { /// Checks if entry passes all parameter conditions including /// within --bed regions, passing, and within expected size - fn filter_entry(&mut self, entry: &vcf::Record) -> bool { - if self.params.passonly & entry.is_filtered() { + fn filter_entry(&mut self, entry: &vcf::variant::RecordBuf) -> bool { + if self.params.passonly & entry.is_filtered(&self.m_header) { return false; } @@ -68,7 +68,7 @@ impl VcfChunker { let mut default = VecDeque::new(); let m_coords = self .regions - .get_mut(&entry.reference_sequence_name().to_string()) + .get_mut(entry.reference_sequence_name()) .unwrap_or(&mut default); if m_coords.is_empty() { @@ -98,11 +98,11 @@ impl VcfChunker { } /// Return the next vcf entry which passes parameter conditions - fn get_next_entry(&mut self) -> Option { - let mut entry = vcf::Record::default(); + fn get_next_entry(&mut self) -> Option { + let mut entry = vcf::variant::RecordBuf::default(); loop { - match self.m_vcf.read_record(&mut entry) { + match self.m_vcf.read_record_buf(&self.m_header, &mut entry) { Ok(0) => return None, Err(e) => { error!("skipping invalid VCF entry {:?}", e); @@ -131,7 +131,7 @@ impl VcfChunker { /// If we wanted to be TR aware, when checking new_chunk, we don't just look at /// cur_end but also the TR catalog. We want to chunk all TR changes together /// regardless of their distance. 
- fn entry_in_chunk(&mut self, entry: &vcf::Record) -> bool { + fn entry_in_chunk(&mut self, entry: &vcf::variant::RecordBuf) -> bool { let check_chrom = entry.reference_sequence_name().to_string(); let new_chrom = !self.cur_chrom.is_empty() && check_chrom != self.cur_chrom; @@ -150,7 +150,7 @@ impl VcfChunker { } impl Iterator for VcfChunker { - type Item = Vec; + type Item = Vec; fn next(&mut self) -> Option { let mut ret = self.hold_entry.take().into_iter().collect::>(); diff --git a/src/kplib/vcfwriter.rs b/src/kplib/vcfwriter.rs index 61fc5b6..89abd6f 100644 --- a/src/kplib/vcfwriter.rs +++ b/src/kplib/vcfwriter.rs @@ -5,8 +5,11 @@ use std::io::BufWriter; use std::path::PathBuf; use noodles_vcf::{ - self as vcf, header::record::value::map::format, header::record::value::Map, - variant::record_buf::samples::{Sample, keys::Keys}, + self as vcf, + header::record::value::map::format, + header::record::value::Map, + variant::io::Write, + variant::record_buf::samples::{keys::Keys, Samples}, }; pub struct VcfWriter { @@ -142,9 +145,9 @@ impl VcfWriter { pub fn anno_write(&mut self, mut annot: GenotypeAnno, phase_group: i32) { *self.gtcounts.entry(annot.gt_state).or_insert(0) += 1; - let sample = annot.entry.samples().get_index(0).unwrap(); - //Sample::new(&self.keys, annot.make_fields(phase_group).as_slice()); + *annot.entry.samples_mut() = + Samples::new(self.keys.clone(), vec![annot.make_fields(phase_group)]); - let _result = self.writer.write_record(&self.header, &annot.entry); + let _result = self.writer.write_variant_record(&self.header, &annot.entry); } } diff --git a/src/main.rs b/src/main.rs index 045cd3d..7af7272 100644 --- a/src/main.rs +++ b/src/main.rs @@ -18,7 +18,7 @@ use kplib::{ PathScore, Ploidy, PloidyRegions, Variants, VcfChunker, VcfWriter, }; -type InputType = Option>; +type InputType = Option>; type OutputType = Option>; fn main() { @@ -38,8 +38,7 @@ fn main() { error!("please fix arguments"); std::process::exit(1); } - - let mut input_vcf = 
vcf::reader::Builder::default() + let mut input_vcf = vcf::io::reader::Builder::default() .build_from_path(args.io.input.clone()) .expect("Unable to parse vcf"); let input_header = input_vcf.read_header().expect("Unable to parse vcf header"); @@ -60,7 +59,6 @@ fn main() { let m_receiver = task_receiver.clone(); let m_result_sender = result_sender.clone(); let m_ploidy = ploidy.clone(); - let m_header = input_header.clone(); thread::spawn(move || { let mut m_bam = @@ -70,7 +68,7 @@ fn main() { Ok(None) | Err(_) => break, Ok(Some(chunk)) => { let mut m_graph = - Variants::new(chunk, m_args.kd.kmer, m_args.kd.maxhom, &m_header); + Variants::new(chunk, m_args.kd.kmer, m_args.kd.maxhom); let ploidy = m_ploidy.get_ploidy(&m_graph.chrom, m_graph.start); // For zero, we don't have to waste time going into the bam From 7c3c9b595a60d15ff1ca21550ed3b683db136fef Mon Sep 17 00:00:00 2001 From: Adam English Date: Sat, 25 May 2024 01:10:25 -0400 Subject: [PATCH 20/47] code clean --- Cargo.toml | 12 ++++++------ experiments/bedtest.sh | 2 +- src/kplib/annotator.rs | 12 ++++++------ src/kplib/vargraph.rs | 10 +++++----- src/kplib/vcf_traits.rs | 11 ++++++----- src/kplib/vcfreader.rs | 10 +++++----- 6 files changed, 29 insertions(+), 28 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 18742c8..a274f41 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,21 +4,21 @@ version = "0.2.1-dev" edition = "2021" [dependencies] -bitflags = "2.5.0" +bitflags = { version = "2.5.0" } clap = { version = "4.0", features = ["derive"] } crossbeam-channel = { version = "0.5.12" } -indexmap = "2.2.3" -indicatif = "0.17.8" +indexmap = { version = "2.2.3" } +indicatif = { version = "0.17.8" } itertools = { version = "0.12.1" } -lazy_static = "1.4.0" +lazy_static = { version = "1.4.0" } log = { version = "0.4", features = ["std", "serde"] } noodles-vcf = { version = "0.57.0" } ordered-float = { version = "4.0", default-features = false } -page_size = "0.6.0" +page_size = { version = "0.6.0" } petgraph = 
{ version = "0.6.2" } pretty_env_logger = { version = "0.4.0" } rust-htslib = { version = "0.46.0" } -rust-lapper = "1.1.0" +rust-lapper = { version = "1.1.0" } [profile.release] opt-level = 3 diff --git a/experiments/bedtest.sh b/experiments/bedtest.sh index 44e5417..8028892 100644 --- a/experiments/bedtest.sh +++ b/experiments/bedtest.sh @@ -9,7 +9,7 @@ create() { --reference /Users/english/code/references/grch38/GRCh38_1kg_mainchrs.fa \ --sizemin 50 \ --sizesim 0.95 --seqsim 0.90 --threads 4 \ - --maxpaths 20000 --mapq 5 --hapsim 0.98 \ + --maxpaths 1000 --mapq 5 --hapsim 0.98 \ --chunksize 100 --maxhom 5 \ --sample doesthiswork \ -o test_rs/hc.vcf --bed $bed diff --git a/src/kplib/annotator.rs b/src/kplib/annotator.rs index 0a8b65a..fa22e54 100644 --- a/src/kplib/annotator.rs +++ b/src/kplib/annotator.rs @@ -4,8 +4,8 @@ use bitflags::bitflags; use petgraph::graph::NodeIndex; use noodles_vcf::{ - self as vcf, variant::record_buf::samples::sample::value::{Array, Value}, + variant::RecordBuf, }; bitflags! { @@ -23,7 +23,7 @@ bitflags! 
{ //Format Integer Type Number = G type IntG = Vec>; pub struct GenotypeAnno { - pub entry: vcf::variant::RecordBuf, + pub entry: RecordBuf, pub gt: String, pub filt: FiltFlags, pub sq: i32, @@ -37,7 +37,7 @@ pub struct GenotypeAnno { impl GenotypeAnno { pub fn new( - entry: vcf::variant::RecordBuf, + entry: RecordBuf, var_idx: &NodeIndex, paths: &[PathScore], coverage: u64, @@ -70,7 +70,7 @@ impl GenotypeAnno { /// For annotating a variant in diploid regions fn diploid( - entry: vcf::variant::RecordBuf, + entry: RecordBuf, var_idx: &NodeIndex, paths: &[PathScore], coverage: u64, @@ -148,7 +148,7 @@ fn diploid( } /// For annotating a variant in a zero ploidy region -fn zero(entry: vcf::variant::RecordBuf, coverage: u64) -> GenotypeAnno { +fn zero(entry: RecordBuf, coverage: u64) -> GenotypeAnno { GenotypeAnno { entry, gt: "./.".to_string(), @@ -165,7 +165,7 @@ fn zero(entry: vcf::variant::RecordBuf, coverage: u64) -> GenotypeAnno { /// For annotating a variant in a one ploidy region fn haploid( - entry: vcf::variant::RecordBuf, + entry: RecordBuf, var_idx: &NodeIndex, paths: &[PathScore], coverage: u64, diff --git a/src/kplib/vargraph.rs b/src/kplib/vargraph.rs index 7ff6b35..4d8d6fd 100644 --- a/src/kplib/vargraph.rs +++ b/src/kplib/vargraph.rs @@ -4,7 +4,7 @@ use crate::kplib::{ Ploidy, }; use itertools::Itertools; -use noodles_vcf::{self as vcf}; +use noodles_vcf::variant::RecordBuf; use petgraph::graph::{DiGraph, NodeIndex}; /// Every --input variant is placed inside a node is turned into a graph. @@ -17,12 +17,12 @@ pub struct VarNode { pub coverage: (Option, Option), pub seqsim: (Option, Option), pub sizesim: (Option, Option), - pub entry: Option, + pub entry: Option, pub kfeat: Vec, } impl VarNode { - pub fn new(entry: vcf::variant::RecordBuf, kmer: u8, maxhom: usize) -> Self { + pub fn new(entry: RecordBuf, kmer: u8, maxhom: usize) -> Self { // Want to make a hash for these names for debugging, I think. 
let name = "".to_string(); let (start, end) = entry.boundaries(); @@ -71,7 +71,7 @@ pub struct Variants { /// The graph has an upstream 'src' node that point to every variant node /// The graph has a dnstream 'snk' node that is pointed to by every variant node and 'src' impl Variants { - pub fn new(mut variants: Vec, kmer: u8, maxhom: usize) -> Self { + pub fn new(mut variants: Vec, kmer: u8, maxhom: usize) -> Self { if variants.is_empty() { panic!("Cannot create a graph from no variants"); } @@ -112,7 +112,7 @@ impl Variants { /// Again, TR aware, we need to set the bounds for doing the pileup /// to the TR boundaries. - fn get_region(entries: &[vcf::variant::RecordBuf]) -> (String, u64, u64) { + fn get_region(entries: &[RecordBuf]) -> (String, u64, u64) { let chrom = entries[0].reference_sequence_name().to_string(); let (min_start, max_end) = entries.iter().fold((u64::MAX, 0), |acc, e| { diff --git a/src/kplib/vcf_traits.rs b/src/kplib/vcf_traits.rs index 392b3d9..857b1b1 100644 --- a/src/kplib/vcf_traits.rs +++ b/src/kplib/vcf_traits.rs @@ -1,5 +1,7 @@ use crate::kplib::seq_to_kmer; -use noodles_vcf::{self as vcf, variant::record::AlternateBases, variant::record::Filters}; +use noodles_vcf::{ + variant::record::AlternateBases, variant::record::Filters, variant::RecordBuf, Header, +}; use std::str::FromStr; #[derive(Debug, Hash, PartialEq, Eq, Clone)] @@ -27,17 +29,16 @@ impl FromStr for Svtype { } } -/// Convert vcf::Record to kfeat pub trait KdpVcf { fn to_kfeat(&self, kmer: u8, maxhom: usize) -> (Vec, i64); fn boundaries(&self) -> (u64, u64); fn size(&self) -> u64; - fn is_filtered(&self, header: &vcf::Header) -> bool; + fn is_filtered(&self, header: &Header) -> bool; fn valid_alt(&self) -> bool; fn get_alt(&self) -> String; } -impl KdpVcf for vcf::variant::RecordBuf { +impl KdpVcf for RecordBuf { /// Convert variant sequence to Kfeat fn to_kfeat(&self, kmer: u8, maxhom: usize) -> (Vec, i64) { let ref_seq = self.reference_bases().to_string(); @@ -81,7 +82,7 @@ 
impl KdpVcf for vcf::variant::RecordBuf { } /// checks if an entry's FILTER is '.' or PASS, true if it is filtered - fn is_filtered(&self, header: &vcf::Header) -> bool { + fn is_filtered(&self, header: &Header) -> bool { !(self.filters().is_empty() || self.filters().iter(header).any(|res| match res { Ok(s) => s == "PASS", diff --git a/src/kplib/vcfreader.rs b/src/kplib/vcfreader.rs index e94356e..db0f473 100644 --- a/src/kplib/vcfreader.rs +++ b/src/kplib/vcfreader.rs @@ -1,6 +1,6 @@ use crate::kplib::{GenotypeAnno, KDParams, KdpVcf, Ploidy, Regions}; use crossbeam_channel::Sender; -use noodles_vcf::{self as vcf}; +use noodles_vcf::{self as vcf, variant::RecordBuf}; use petgraph::graph::NodeIndex; use std::collections::VecDeque; use std::io::BufRead; @@ -98,8 +98,8 @@ impl VcfChunker { } /// Return the next vcf entry which passes parameter conditions - fn get_next_entry(&mut self) -> Option { - let mut entry = vcf::variant::RecordBuf::default(); + fn get_next_entry(&mut self) -> Option { + let mut entry = RecordBuf::default(); loop { match self.m_vcf.read_record_buf(&self.m_header, &mut entry) { @@ -131,7 +131,7 @@ impl VcfChunker { /// If we wanted to be TR aware, when checking new_chunk, we don't just look at /// cur_end but also the TR catalog. We want to chunk all TR changes together /// regardless of their distance. 
- fn entry_in_chunk(&mut self, entry: &vcf::variant::RecordBuf) -> bool { + fn entry_in_chunk(&mut self, entry: &RecordBuf) -> bool { let check_chrom = entry.reference_sequence_name().to_string(); let new_chrom = !self.cur_chrom.is_empty() && check_chrom != self.cur_chrom; @@ -150,7 +150,7 @@ impl VcfChunker { } impl Iterator for VcfChunker { - type Item = Vec; + type Item = Vec; fn next(&mut self) -> Option { let mut ret = self.hold_entry.take().into_iter().collect::>(); From f0095bd1182e0a124adce12727d7d4157c3382b7 Mon Sep 17 00:00:00 2001 From: Adam English Date: Sat, 25 May 2024 02:14:58 -0400 Subject: [PATCH 21/47] clean --- src/kplib/vcf_traits.rs | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/kplib/vcf_traits.rs b/src/kplib/vcf_traits.rs index 857b1b1..79cc9df 100644 --- a/src/kplib/vcf_traits.rs +++ b/src/kplib/vcf_traits.rs @@ -35,7 +35,7 @@ pub trait KdpVcf { fn size(&self) -> u64; fn is_filtered(&self, header: &Header) -> bool; fn valid_alt(&self) -> bool; - fn get_alt(&self) -> String; + fn get_alt(&self) -> &str; } impl KdpVcf for RecordBuf { @@ -97,16 +97,12 @@ impl KdpVcf for RecordBuf { } /// Returns the first alternate allele or a blank string with '.' 
if there isn't any - fn get_alt(&self) -> String { + fn get_alt(&self) -> &str { let alts = self.alternate_bases(); match alts.len() { - 0 => ".".to_string(), - _ => alts - .iter() - .next() - .expect("I just checked") - .unwrap() - .to_string(), // I don't like all this String when str should be simplier + 0 => ".", + _ => alts.iter().next().expect("I just checked").unwrap(), + //.to_string(), // I don't like all this String when str should be simplier } } } From 80ec08725cdf2b64e5eef08dc98c360337d7f2db Mon Sep 17 00:00:00 2001 From: Adam English Date: Sat, 25 May 2024 08:33:12 -0400 Subject: [PATCH 22/47] saftey around edge case shouldn't happen, but still --- src/kplib/cluster.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/kplib/cluster.rs b/src/kplib/cluster.rs index 5c0e898..85a9f2b 100644 --- a/src/kplib/cluster.rs +++ b/src/kplib/cluster.rs @@ -129,6 +129,9 @@ pub fn diploid_haplotypes( if hap1.n == 0 { // HOMALT vec![hap2.clone(), hap2] + } else if hap2.n == 0 { + // Doesn't happen? 
+ vec![hap1.clone(), hap1] } else { // Compound Het vec![hap1, hap2] From 0a54d35509fb92554cd4cc7e4bbda56203a12617 Mon Sep 17 00:00:00 2001 From: Adam English Date: Sat, 25 May 2024 09:18:57 -0400 Subject: [PATCH 23/47] small clean --- experiments/bedtest.sh | 4 ++-- src/kplib/kmer.rs | 9 ++++----- src/kplib/vcfreader.rs | 4 ++-- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/experiments/bedtest.sh b/experiments/bedtest.sh index 8028892..88434b4 100644 --- a/experiments/bedtest.sh +++ b/experiments/bedtest.sh @@ -9,8 +9,8 @@ create() { --reference /Users/english/code/references/grch38/GRCh38_1kg_mainchrs.fa \ --sizemin 50 \ --sizesim 0.95 --seqsim 0.90 --threads 4 \ - --maxpaths 1000 --mapq 5 --hapsim 0.98 \ - --chunksize 100 --maxhom 5 \ + --maxpaths 20000 --mapq 5 --hapsim 0.98 \ + --chunksize 100 --maxhom 0 \ --sample doesthiswork \ -o test_rs/hc.vcf --bed $bed # --bed /Users/english/code/kanpig/test/GRCh38_HG002-T2TQ100-V1.0_stvar.benchmark.bed \ diff --git a/src/kplib/kmer.rs b/src/kplib/kmer.rs index 005ddae..9bf92e9 100644 --- a/src/kplib/kmer.rs +++ b/src/kplib/kmer.rs @@ -11,11 +11,10 @@ fn encode_nuc(nuc: u8) -> u64 { /// Count kmers in a sequence pub fn seq_to_kmer(sequence: &[u8], kmer: u8, negative: bool, maxhom: usize) -> Vec { - let sequence = if maxhom != 0 { - compress_homopolymer(sequence, maxhom) - } else { - sequence.to_vec() - }; + if maxhom != 0 { + return seq_to_kmer(&compress_homopolymer(sequence, maxhom), kmer, negative, 0) + } + let ukmer = kmer as usize; let mut kcounts = vec![0f32; 1 << (2 * ukmer)]; let cnt = if negative { -1.0 } else { 1.0 }; diff --git a/src/kplib/vcfreader.rs b/src/kplib/vcfreader.rs index db0f473..2efa4be 100644 --- a/src/kplib/vcfreader.rs +++ b/src/kplib/vcfreader.rs @@ -18,7 +18,7 @@ pub struct VcfChunker { // When iterating, we will encounter a variant that no longer // fits in the current chunk. 
We need to hold on to it for the // next chunk - hold_entry: Option, + hold_entry: Option, pub chunk_count: u64, pub call_count: u64, pub skip_count: u64, @@ -50,7 +50,7 @@ impl VcfChunker { /// Checks if entry passes all parameter conditions including /// within --bed regions, passing, and within expected size - fn filter_entry(&mut self, entry: &vcf::variant::RecordBuf) -> bool { + fn filter_entry(&mut self, entry: &RecordBuf) -> bool { if self.params.passonly & entry.is_filtered(&self.m_header) { return false; } From ad21904eb9c03604788f1fe1c094f3cd7c69b8c1 Mon Sep 17 00:00:00 2001 From: Adam English Date: Sat, 25 May 2024 11:20:26 -0400 Subject: [PATCH 24/47] Update README.md --- README.md | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 688b9dc..e40852c 100644 --- a/README.md +++ b/README.md @@ -51,12 +51,11 @@ of an upstream window's variants at least `chunksize` base-pairs away from the n This chunksize also determines the region over which read pileups are generated. Only reads with at least `mapq` mapping quality, passing the `mapflag` filter, and which fully span the minimum variant start and maximum variant end are considered. -This is an important parameter because too small of a `chunksize` may not recruit read pileups that support variants but are -further away. Similarly, too large of a value may create windows with many SVs which are also too large for reads to fully-span. +This is an important parameter because too small of a `chunksize` may not recruit distant read pileups which support variants. Similarly, +too large of a value may create windows with many SVs which are also too large for reads to fully-span. ### `--sizemin` and `--sizemax` -Variant sizes are determined by `INFO/SVLEN`. If `INFO/SVLEN` tag is not in the VCF entry, the variant's size is set as -`abs(length(ALT) - length(REF))`. Genotypes of variants not within the size boundaries are set to missing (`./.`). 
+Variant sizes are determined by `abs(length(ALT) - length(REF))`. Genotypes of variants not within the size boundaries are set to missing (`./.`). ### `--sizesim` and `--seqsim` When applying a haplotype to a variant graph, only paths above these two thresholds are allowed. If there are multiple @@ -85,7 +84,7 @@ The `SAMPLE` column fields populated by kanpig are: | **FT** | Bit flag for properties of the variant's genotyping. Flags == 0 are considered PASS. | | **SQ** | Phred scaled likelihood variant alternate is present in the sample | | **GQ** | Phred scale difference between most and second-most likely genotypes | -| **PG** | Each chunk of variants is assigned a phase group | +| **PS** | Each chunk of variants is assigned a phase set | | **DP** | Read coverage over the region | | **AD** | Read coverage supporting the reference and alternate alleles. | | **SZ** | Size similarity of the two haplotypes to this variant | From 62dc256e5394f1f6785d500d5e8e358fefb7c3c3 Mon Sep 17 00:00:00 2001 From: Adam English Date: Sat, 25 May 2024 11:53:27 -0400 Subject: [PATCH 25/47] change PG to PS that's the vcf spec name --- src/kplib/kmer.rs | 3 +-- src/kplib/vcfwriter.rs | 16 ++++++++-------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/kplib/kmer.rs b/src/kplib/kmer.rs index 9bf92e9..7d0722e 100644 --- a/src/kplib/kmer.rs +++ b/src/kplib/kmer.rs @@ -12,7 +12,7 @@ fn encode_nuc(nuc: u8) -> u64 { /// Count kmers in a sequence pub fn seq_to_kmer(sequence: &[u8], kmer: u8, negative: bool, maxhom: usize) -> Vec { if maxhom != 0 { - return seq_to_kmer(&compress_homopolymer(sequence, maxhom), kmer, negative, 0) + return seq_to_kmer(&compress_homopolymer(sequence, maxhom), kmer, negative, 0); } let ukmer = kmer as usize; @@ -39,7 +39,6 @@ pub fn seq_to_kmer(sequence: &[u8], kmer: u8, negative: bool, maxhom: usize) -> // rolling sum masks off first nuc and adds the next one let mask: u64 = (1 << (2 * (kmer - 1) as usize)) - 1; - //for i in 
sequence[1..(sequence.len() - ukmer + 1)].iter() { for i in sequence.iter().skip(ukmer) { let f_nuc = encode_nuc(*i); f_result = ((f_result & mask) << 2) + f_nuc; diff --git a/src/kplib/vcfwriter.rs b/src/kplib/vcfwriter.rs index 89abd6f..4090823 100644 --- a/src/kplib/vcfwriter.rs +++ b/src/kplib/vcfwriter.rs @@ -48,7 +48,7 @@ impl VcfWriter { // Setup FORMAT header definitions // Overwrites existing definitions let all_formats = header.formats_mut(); - let new_fmts: Vec = "GT:FT:SQ:GQ:PG:DP:AD:ZS:SS" + let new_fmts: Vec = "GT:FT:SQ:GQ:PS:DP:AD:ZS:SS" .split(':') .map(String::from) .collect(); @@ -87,13 +87,13 @@ impl VcfWriter { *gqfmt.description_mut() = "Phred scaled quality of genotype".to_string(); all_formats.insert(gqid.to_string(), gqfmt); - // PG - let pgid = "PG"; - let mut pgfmt = Map::::from(pgid); - *pgfmt.number_mut() = format::Number::Count(1); - *pgfmt.type_mut() = format::Type::Integer; - *pgfmt.description_mut() = "Local phase group of entries".to_string(); - all_formats.insert(pgid.to_string(), pgfmt); + // PS + let psid = "PS"; + let mut psfmt = Map::::from(psid); + *psfmt.number_mut() = format::Number::Count(1); + *psfmt.type_mut() = format::Type::Integer; + *psfmt.description_mut() = "Local phase group of entries".to_string(); + all_formats.insert(psid.to_string(), psfmt); // DP let dpid = "DP"; From 976115c2a2f962531f071fcb8e25d219162e31c8 Mon Sep 17 00:00:00 2001 From: Adam English Date: Sat, 25 May 2024 12:08:53 -0400 Subject: [PATCH 26/47] write refactor --- experiments/bedtest.sh | 2 +- src/main.rs | 24 ++++++++++++------------ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/experiments/bedtest.sh b/experiments/bedtest.sh index 88434b4..f848ec3 100644 --- a/experiments/bedtest.sh +++ b/experiments/bedtest.sh @@ -39,7 +39,7 @@ bench_full() { } create -#bcftools sort -O z -o test_rs/hc.vcf.gz test_rs/hc.vcf +bcftools sort -O z -o test_rs/hc.vcf.gz test_rs/hc.vcf #tabix test_rs/hc.vcf.gz #bench_lite #bench_medium diff --git 
a/src/main.rs b/src/main.rs index 7af7272..efb3226 100644 --- a/src/main.rs +++ b/src/main.rs @@ -106,16 +106,13 @@ fn main() { .collect(); //Before we start the workers, we'll start the writer - let writer = Arc::new(Mutex::new(VcfWriter::new( - &args.io.out, - input_header.clone(), - &args.io.sample, - ))); - - let cloned_writer = writer.clone(); + // This is the semaphore for the progress bar that communicates between main and writer let num_variants = Arc::new(Mutex::new(0)); - let cloned_num_variants = num_variants.clone(); + let wthread_num_variants = num_variants.clone(); + let wthread_io = args.io.clone(); + let wthread_header = input_header.clone(); + let write_handler = std::thread::spawn(move || { let sty = ProgressStyle::with_template( " [{elapsed_precise}] {bar:44.cyan/blue} > {pos} completed", @@ -125,7 +122,11 @@ fn main() { let mut pbar: Option = None; let mut phase_group: i32 = 0; let mut completed_variants: u64 = 0; - let mut m_writer = cloned_writer.lock().unwrap(); + let mut m_writer = VcfWriter::new( + &wthread_io.out, + wthread_header.clone(), + &wthread_io.sample, + ); loop { match result_receiver.recv() { @@ -146,7 +147,7 @@ fn main() { } else { completed_variants += rsize; // check if the reader is finished so we can setup the pbar - let value_guard = cloned_num_variants.lock().unwrap(); + let value_guard = wthread_num_variants.lock().unwrap(); if *value_guard != 0 { let t_bar = ProgressBar::new(*value_guard).with_style(sty.clone()); t_bar.inc(completed_variants); @@ -157,6 +158,7 @@ fn main() { } } } + info!("genotype counts: {:#?}", m_writer.gtcounts); }); info!("building variant graphs"); @@ -199,7 +201,5 @@ fn main() { // Wait on the writer write_handler.join().unwrap(); - let writer = writer.lock().unwrap(); - info!("genotype counts: {:#?}", writer.gtcounts); info!("finished"); } From 4d9d6b823167b86446f9e895f860e0506a9541b0 Mon Sep 17 00:00:00 2001 From: Adam English Date: Sat, 25 May 2024 14:35:32 -0400 Subject: [PATCH 27/47] panic 
on write fail better than writing a corrupted vcf --- src/kplib/vcfwriter.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/kplib/vcfwriter.rs b/src/kplib/vcfwriter.rs index 4090823..fefd5bf 100644 --- a/src/kplib/vcfwriter.rs +++ b/src/kplib/vcfwriter.rs @@ -148,6 +148,9 @@ impl VcfWriter { *annot.entry.samples_mut() = Samples::new(self.keys.clone(), vec![annot.make_fields(phase_group)]); - let _result = self.writer.write_variant_record(&self.header, &annot.entry); + match self.writer.write_variant_record(&self.header, &annot.entry) { + Ok(_) => {} + Err(error) => panic!("Couldn't write record {:?}", error), + } } } From d1f255a3c73fe935d54c1a8cfcbd972f9188cf4e Mon Sep 17 00:00:00 2001 From: Adam English Date: Sat, 25 May 2024 17:16:36 -0400 Subject: [PATCH 28/47] iupac translator --- src/kplib/vcfwriter.rs | 54 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) diff --git a/src/kplib/vcfwriter.rs b/src/kplib/vcfwriter.rs index fefd5bf..29c34f0 100644 --- a/src/kplib/vcfwriter.rs +++ b/src/kplib/vcfwriter.rs @@ -147,10 +147,62 @@ impl VcfWriter { *self.gtcounts.entry(annot.gt_state).or_insert(0) += 1; *annot.entry.samples_mut() = Samples::new(self.keys.clone(), vec![annot.make_fields(phase_group)]); - + //*annot.entry.reference_bases_mut() = replace_iupac(annot.entry.reference_bases()); + replace_iupac_inplace(annot.entry.reference_bases_mut()); match self.writer.write_variant_record(&self.header, &annot.entry) { Ok(_) => {} Err(error) => panic!("Couldn't write record {:?}", error), } } } + +lazy_static::lazy_static! 
{ + static ref IUPAC: [u8; 128] = { + let mut arr = [0u8; 128]; + arr[b'R' as usize] = b'A'; // A or G + arr[b'Y' as usize] = b'C'; // C or T + arr[b'S' as usize] = b'C'; // C or G + arr[b'W' as usize] = b'A'; // A or T + arr[b'K' as usize] = b'G'; // G or T + arr[b'M' as usize] = b'A'; // A or C + arr[b'B' as usize] = b'C'; // C, G, or T + arr[b'D' as usize] = b'A'; // A, G, or T + arr[b'H' as usize] = b'A'; // A, C, or T + arr[b'V' as usize] = b'A'; // A, C, or G + arr[b'N' as usize] = b'A'; // Any base (A, C, G, or T) + arr[b'r' as usize] = b'a'; // a or g + arr[b'y' as usize] = b'c'; // c or t + arr[b's' as usize] = b'c'; // c or g + arr[b'w' as usize] = b'a'; // a or t + arr[b'k' as usize] = b'g'; // g or t + arr[b'm' as usize] = b'a'; // a or c + arr[b'b' as usize] = b'c'; // c, g, or t + arr[b'd' as usize] = b'a'; // a, g, or t + arr[b'h' as usize] = b'a'; // a, c, or t + arr[b'v' as usize] = b'a'; // a, c, or g + arr[b'n' as usize] = b'a'; // any base (a, c, g, or t) + arr + }; +} + +fn replace_iupac_inplace(sequence: &mut String) { + unsafe { + let bytes = sequence.as_bytes_mut(); + bytes.iter_mut().for_each(|b| { + let t = IUPAC[*b as usize]; + if t != 0u8 { + *b = t; + } + }); + } +} + +/*fn replace_iupac(sequence: &str) -> String { + let mut result = String::with_capacity(sequence.len()); + + for c in sequence.chars() { + result.push(REPLACEMENTS[c as usize]); + } + + result +}*/ From 80039c77a971ea39c8b92af701b875ca4592c2b8 Mon Sep 17 00:00:00 2001 From: Adam English Date: Sat, 25 May 2024 17:35:16 -0400 Subject: [PATCH 29/47] default stdout allows pipining into e.g. 
bcftools sort --- experiments/bedtest.sh | 5 +++-- src/kplib/cli.rs | 4 ++-- src/kplib/vcfwriter.rs | 28 ++++++++++++++++++---------- src/main.rs | 9 +++------ 4 files changed, 26 insertions(+), 20 deletions(-) diff --git a/experiments/bedtest.sh b/experiments/bedtest.sh index f848ec3..862c79d 100644 --- a/experiments/bedtest.sh +++ b/experiments/bedtest.sh @@ -9,10 +9,11 @@ create() { --reference /Users/english/code/references/grch38/GRCh38_1kg_mainchrs.fa \ --sizemin 50 \ --sizesim 0.95 --seqsim 0.90 --threads 4 \ - --maxpaths 20000 --mapq 5 --hapsim 0.98 \ + --maxpaths 1000 --mapq 5 --hapsim 0.98 \ --chunksize 100 --maxhom 0 \ --sample doesthiswork \ - -o test_rs/hc.vcf --bed $bed + --bed $bed -o test_rs/hc.vcf.gz + #| bcftools sort -O z -o test_rs/hc.vcf.gz # --bed /Users/english/code/kanpig/test/GRCh38_HG002-T2TQ100-V1.0_stvar.benchmark.bed \ # --bam /Users/english/code/kanpig/experiments/test_rs/GIABHG002.bam \ } diff --git a/src/kplib/cli.rs b/src/kplib/cli.rs index 7b3a40c..bf0daeb 100644 --- a/src/kplib/cli.rs +++ b/src/kplib/cli.rs @@ -26,9 +26,9 @@ pub struct IOParams { #[arg(short = 'f', long)] pub reference: std::path::PathBuf, - /// Output vcf (unsorted) + /// Output vcf (unsorted, uncompressed, default stdout) #[arg(short, long)] - pub out: std::path::PathBuf, + pub out: Option, /// Regions to analyze #[arg(long)] diff --git a/src/kplib/vcfwriter.rs b/src/kplib/vcfwriter.rs index 29c34f0..cdc20e2 100644 --- a/src/kplib/vcfwriter.rs +++ b/src/kplib/vcfwriter.rs @@ -1,19 +1,19 @@ use crate::kplib::{metrics::GTstate, GenotypeAnno}; use std::collections::HashMap; use std::fs::File; -use std::io::BufWriter; +use std::io::{BufWriter, Write}; use std::path::PathBuf; use noodles_vcf::{ self as vcf, header::record::value::map::format, header::record::value::Map, - variant::io::Write, + variant::io::Write as vcfWrite, variant::record_buf::samples::{keys::Keys, Samples}, }; pub struct VcfWriter { - writer: vcf::io::Writer>, + writer: vcf::io::Writer>, header: 
vcf::Header, keys: Keys, pub gtcounts: HashMap, @@ -21,7 +21,11 @@ pub struct VcfWriter { impl VcfWriter { /// Given a path and a header, setup a new output vcf - pub fn new(out_path: &PathBuf, mut header: vcf::Header, sample: &Option) -> Self { + pub fn new( + out_path: &Option, + mut header: vcf::Header, + sample: &Option, + ) -> Self { // Ensure sample is correctly setup let sample_name = match sample { Some(name) => name.clone(), @@ -128,10 +132,14 @@ impl VcfWriter { all_formats.insert(ssid.to_string(), ssfmt); // Ready to make files - let out_buf = BufWriter::with_capacity( - page_size::get() * 500, - File::create(out_path).expect("Error Creating Output File"), - ); + let m_page = page_size::get() * 1000; + let out_buf: Box = match out_path { + Some(ref path) => { + let file = File::create(path).expect("Error Creating Output File"); + Box::new(BufWriter::with_capacity(m_page, file)) + } + None => Box::new(BufWriter::with_capacity(m_page, std::io::stdout())), + }; let mut writer = vcf::io::Writer::new(out_buf); let _ = writer.write_header(&header); @@ -185,11 +193,11 @@ lazy_static::lazy_static! 
{ }; } -fn replace_iupac_inplace(sequence: &mut String) { +fn replace_iupac_inplace(sequence: &mut str) { unsafe { let bytes = sequence.as_bytes_mut(); bytes.iter_mut().for_each(|b| { - let t = IUPAC[*b as usize]; + let t = IUPAC[*b as usize]; if t != 0u8 { *b = t; } diff --git a/src/main.rs b/src/main.rs index efb3226..1c30b26 100644 --- a/src/main.rs +++ b/src/main.rs @@ -112,7 +112,7 @@ fn main() { let wthread_num_variants = num_variants.clone(); let wthread_io = args.io.clone(); let wthread_header = input_header.clone(); - + let write_handler = std::thread::spawn(move || { let sty = ProgressStyle::with_template( " [{elapsed_precise}] {bar:44.cyan/blue} > {pos} completed", @@ -122,11 +122,8 @@ fn main() { let mut pbar: Option = None; let mut phase_group: i32 = 0; let mut completed_variants: u64 = 0; - let mut m_writer = VcfWriter::new( - &wthread_io.out, - wthread_header.clone(), - &wthread_io.sample, - ); + let mut m_writer = + VcfWriter::new(&wthread_io.out, wthread_header.clone(), &wthread_io.sample); loop { match result_receiver.recv() { From ded265729a926587e48d3fd9858d3d87b5f69532 Mon Sep 17 00:00:00 2001 From: Adam English Date: Sat, 25 May 2024 18:08:19 -0400 Subject: [PATCH 30/47] cleaning --- src/kplib/vcfwriter.rs | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/src/kplib/vcfwriter.rs b/src/kplib/vcfwriter.rs index cdc20e2..adc7064 100644 --- a/src/kplib/vcfwriter.rs +++ b/src/kplib/vcfwriter.rs @@ -155,8 +155,9 @@ impl VcfWriter { *self.gtcounts.entry(annot.gt_state).or_insert(0) += 1; *annot.entry.samples_mut() = Samples::new(self.keys.clone(), vec![annot.make_fields(phase_group)]); - //*annot.entry.reference_bases_mut() = replace_iupac(annot.entry.reference_bases()); + replace_iupac_inplace(annot.entry.reference_bases_mut()); + match self.writer.write_variant_record(&self.header, &annot.entry) { Ok(_) => {} Err(error) => panic!("Couldn't write record {:?}", error), @@ -204,13 +205,3 @@ fn 
replace_iupac_inplace(sequence: &mut str) { }); } } - -/*fn replace_iupac(sequence: &str) -> String { - let mut result = String::with_capacity(sequence.len()); - - for c in sequence.chars() { - result.push(REPLACEMENTS[c as usize]); - } - - result -}*/ From d2c5f362d371b93e4ffacb73695bcb0d53456061 Mon Sep 17 00:00:00 2001 From: Adam English Date: Sat, 25 May 2024 22:37:28 -0400 Subject: [PATCH 31/47] added iupac warning users will be informed once if there are any iupac bases changed in the output --- src/kplib/vcfwriter.rs | 13 +++++++++++-- src/main.rs | 2 +- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/kplib/vcfwriter.rs b/src/kplib/vcfwriter.rs index adc7064..e613b98 100644 --- a/src/kplib/vcfwriter.rs +++ b/src/kplib/vcfwriter.rs @@ -17,6 +17,7 @@ pub struct VcfWriter { header: vcf::Header, keys: Keys, pub gtcounts: HashMap, + iupac_warned: bool, } impl VcfWriter { @@ -148,6 +149,7 @@ impl VcfWriter { header, keys, gtcounts: HashMap::new(), + iupac_warned: false, } } @@ -156,7 +158,11 @@ impl VcfWriter { *annot.entry.samples_mut() = Samples::new(self.keys.clone(), vec![annot.make_fields(phase_group)]); - replace_iupac_inplace(annot.entry.reference_bases_mut()); + let changed = replace_iupac_inplace(annot.entry.reference_bases_mut()); + if changed && !self.iupac_warned { + warn!("Some IUPAC codes in REF sequences have been fixed"); + self.iupac_warned = true; + } match self.writer.write_variant_record(&self.header, &annot.entry) { Ok(_) => {} @@ -194,14 +200,17 @@ lazy_static::lazy_static! 
{ }; } -fn replace_iupac_inplace(sequence: &mut str) { +fn replace_iupac_inplace(sequence: &mut str) -> bool { + let mut any_change = false; unsafe { let bytes = sequence.as_bytes_mut(); bytes.iter_mut().for_each(|b| { let t = IUPAC[*b as usize]; if t != 0u8 { + any_change = true; *b = t; } }); } + any_change } diff --git a/src/main.rs b/src/main.rs index 1c30b26..51e2f05 100644 --- a/src/main.rs +++ b/src/main.rs @@ -109,9 +109,9 @@ fn main() { // This is the semaphore for the progress bar that communicates between main and writer let num_variants = Arc::new(Mutex::new(0)); - let wthread_num_variants = num_variants.clone(); let wthread_io = args.io.clone(); let wthread_header = input_header.clone(); + let wthread_num_variants = num_variants.clone(); let write_handler = std::thread::spawn(move || { let sty = ProgressStyle::with_template( From 5568247a9b791e049121217bdd257e4dcd3e117b Mon Sep 17 00:00:00 2001 From: Adam English Date: Sun, 26 May 2024 00:11:07 -0400 Subject: [PATCH 32/47] iupac fix Ns are allowed. 
warning printed at the end as to not interrupt progress bar --- src/kplib/vcfwriter.rs | 11 +++-------- src/main.rs | 3 +++ 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/kplib/vcfwriter.rs b/src/kplib/vcfwriter.rs index e613b98..c54b91a 100644 --- a/src/kplib/vcfwriter.rs +++ b/src/kplib/vcfwriter.rs @@ -17,7 +17,7 @@ pub struct VcfWriter { header: vcf::Header, keys: Keys, pub gtcounts: HashMap, - iupac_warned: bool, + pub iupac_fixed: bool, } impl VcfWriter { @@ -149,7 +149,7 @@ impl VcfWriter { header, keys, gtcounts: HashMap::new(), - iupac_warned: false, + iupac_fixed: false, } } @@ -159,10 +159,7 @@ impl VcfWriter { Samples::new(self.keys.clone(), vec![annot.make_fields(phase_group)]); let changed = replace_iupac_inplace(annot.entry.reference_bases_mut()); - if changed && !self.iupac_warned { - warn!("Some IUPAC codes in REF sequences have been fixed"); - self.iupac_warned = true; - } + self.iupac_fixed = self.iupac_fixed | changed; match self.writer.write_variant_record(&self.header, &annot.entry) { Ok(_) => {} @@ -184,7 +181,6 @@ lazy_static::lazy_static! { arr[b'D' as usize] = b'A'; // A, G, or T arr[b'H' as usize] = b'A'; // A, C, or T arr[b'V' as usize] = b'A'; // A, C, or G - arr[b'N' as usize] = b'A'; // Any base (A, C, G, or T) arr[b'r' as usize] = b'a'; // a or g arr[b'y' as usize] = b'c'; // c or t arr[b's' as usize] = b'c'; // c or g @@ -195,7 +191,6 @@ lazy_static::lazy_static! 
{ arr[b'd' as usize] = b'a'; // a, g, or t arr[b'h' as usize] = b'a'; // a, c, or t arr[b'v' as usize] = b'a'; // a, c, or g - arr[b'n' as usize] = b'a'; // any base (a, c, g, or t) arr }; } diff --git a/src/main.rs b/src/main.rs index 51e2f05..b72d2cc 100644 --- a/src/main.rs +++ b/src/main.rs @@ -155,6 +155,9 @@ fn main() { } } } + if m_writer.iupac_fixed { + warn!("Some IUPAC codes in REF sequences have been fixed in output"); + } info!("genotype counts: {:#?}", m_writer.gtcounts); }); From 8203bbfc56223fff5ef989c0fb756cfef553da1d Mon Sep 17 00:00:00 2001 From: Adam English Date: Sun, 26 May 2024 16:00:02 -0400 Subject: [PATCH 33/47] clean --- src/main.rs | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/main.rs b/src/main.rs index b72d2cc..043c102 100644 --- a/src/main.rs +++ b/src/main.rs @@ -109,22 +109,22 @@ fn main() { // This is the semaphore for the progress bar that communicates between main and writer let num_variants = Arc::new(Mutex::new(0)); - let wthread_io = args.io.clone(); - let wthread_header = input_header.clone(); - let wthread_num_variants = num_variants.clone(); + let wt_io = args.io.clone(); + let wt_header = input_header.clone(); + let wt_num_variants = num_variants.clone(); let write_handler = std::thread::spawn(move || { + let mut m_writer = VcfWriter::new(&wt_io.out, wt_header.clone(), &wt_io.sample); + + let mut pbar: Option = None; let sty = ProgressStyle::with_template( " [{elapsed_precise}] {bar:44.cyan/blue} > {pos} completed", ) .unwrap() .progress_chars("##-"); - let mut pbar: Option = None; + let mut phase_group: i32 = 0; let mut completed_variants: u64 = 0; - let mut m_writer = - VcfWriter::new(&wthread_io.out, wthread_header.clone(), &wthread_io.sample); - loop { match result_receiver.recv() { Ok(None) | Err(_) => { @@ -144,9 +144,9 @@ fn main() { } else { completed_variants += rsize; // check if the reader is finished so we can setup the pbar - let value_guard = 
wthread_num_variants.lock().unwrap(); - if *value_guard != 0 { - let t_bar = ProgressBar::new(*value_guard).with_style(sty.clone()); + let value = *wt_num_variants.lock().unwrap(); + if value != 0 { + let t_bar = ProgressBar::new(value).with_style(sty.clone()); t_bar.inc(completed_variants); pbar = Some(t_bar); } From b411f5ac1c370240d38fdf6ae7b86fc775db55eb Mon Sep 17 00:00:00 2001 From: Adam English Date: Sun, 26 May 2024 19:56:27 -0400 Subject: [PATCH 34/47] Fix #1 invalid reference base errors are caught and fixed --- experiments/bedtest.sh | 7 ++++--- src/kplib/vcfwriter.rs | 24 ++++++++++++++++++------ 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/experiments/bedtest.sh b/experiments/bedtest.sh index 862c79d..4ef2cef 100644 --- a/experiments/bedtest.sh +++ b/experiments/bedtest.sh @@ -2,13 +2,14 @@ set -e bed=test_rs/test.chr20.bed create() { - #../target/release/kanpig \ + #time kanpig-v0.2.0-x86_64-apple-darwin/kanpig \ + #time ../target/release/kanpig \ time cargo run --release -- \ --input test_rs/test2.vcf.gz \ --bam /Users/english/code/kanpig/experiments/test_rs/NA24385.chr20.bam \ --reference /Users/english/code/references/grch38/GRCh38_1kg_mainchrs.fa \ --sizemin 50 \ - --sizesim 0.95 --seqsim 0.90 --threads 4 \ + --sizesim 0.95 --seqsim 0.90 --threads 5 \ --maxpaths 1000 --mapq 5 --hapsim 0.98 \ --chunksize 100 --maxhom 0 \ --sample doesthiswork \ @@ -40,7 +41,7 @@ bench_full() { } create -bcftools sort -O z -o test_rs/hc.vcf.gz test_rs/hc.vcf +#bcftools sort -O z -o test_rs/hc.vcf.gz test_rs/hc.vcf #tabix test_rs/hc.vcf.gz #bench_lite #bench_medium diff --git a/src/kplib/vcfwriter.rs b/src/kplib/vcfwriter.rs index c54b91a..99443b1 100644 --- a/src/kplib/vcfwriter.rs +++ b/src/kplib/vcfwriter.rs @@ -18,6 +18,7 @@ pub struct VcfWriter { keys: Keys, pub gtcounts: HashMap, pub iupac_fixed: bool, + buf: Vec, } impl VcfWriter { @@ -150,6 +151,7 @@ impl VcfWriter { keys, gtcounts: HashMap::new(), iupac_fixed: false, + buf: vec![], } } @@ 
-158,12 +160,22 @@ impl VcfWriter { *annot.entry.samples_mut() = Samples::new(self.keys.clone(), vec![annot.make_fields(phase_group)]); - let changed = replace_iupac_inplace(annot.entry.reference_bases_mut()); - self.iupac_fixed = self.iupac_fixed | changed; - - match self.writer.write_variant_record(&self.header, &annot.entry) { - Ok(_) => {} - Err(error) => panic!("Couldn't write record {:?}", error), + self.buf.clear(); + let mut tmp = vcf::io::Writer::new(&mut self.buf); + // Let noodles check it, first + match tmp.write_variant_record(&self.header, &annot.entry) { + Ok(_) => { + let _ = self.writer.get_mut().write_all(&self.buf); + } + Err(_) => { + let changed = replace_iupac_inplace(annot.entry.reference_bases_mut()); + self.iupac_fixed |= changed; + // Assuming it will work now + match self.writer.write_variant_record(&self.header, &annot.entry) { + Ok(_) => {} + Err(error) => panic!("Couldn't write record {:?}", error), + } + } } } } From 171d0b9c293c21f7b4a64431ca5c6231b0269c0c Mon Sep 17 00:00:00 2001 From: Adam English Date: Sun, 26 May 2024 21:59:57 -0400 Subject: [PATCH 35/47] documentation --- src/kplib/annotator.rs | 20 +++++- src/kplib/kmer.rs | 40 ++++++++++- src/kplib/metrics.rs | 82 ++++++++++++++++++++--- src/kplib/mod.rs | 4 +- src/kplib/pathscore.rs | 21 ------ src/kplib/regions.rs | 3 +- src/kplib/traverse.rs | 10 +-- src/kplib/{vcf_traits.rs => vcftraits.rs} | 0 8 files changed, 136 insertions(+), 44 deletions(-) rename src/kplib/{vcf_traits.rs => vcftraits.rs} (100%) diff --git a/src/kplib/annotator.rs b/src/kplib/annotator.rs index fa22e54..31132e0 100644 --- a/src/kplib/annotator.rs +++ b/src/kplib/annotator.rs @@ -36,6 +36,17 @@ pub struct GenotypeAnno { } impl GenotypeAnno { + /// Constructs a new `GenotypeAnno` instance based on the provided parameters. + /// + /// # Parameters + /// - `entry`: A `RecordBuf` representing the variant record. + /// - `var_idx`: A reference to the node index. 
+ /// - `paths`: A slice of `PathScore` representing the paths. + /// - `coverage`: An unsigned 64-bit integer representing the coverage. + /// - `ploidy`: A reference to the `Ploidy` enum representing the ploidy level. + /// + /// # Returns + /// A `GenotypeAnno` instance initialized based on the provided parameters. pub fn new( entry: RecordBuf, var_idx: &NodeIndex, @@ -50,7 +61,14 @@ impl GenotypeAnno { } } - // These are tied to VcfWriter.keys + /// Generates fields for the `GenotypeAnno` instance. + /// These fields correspond to the keys defined in `VcfWriter`. + /// + /// # Parameters + /// - `phase_group`: An integer representing the phase group. + /// + /// # Returns + /// A vector containing optional `Value` instances representing the fields of the `GenotypeAnno`. pub fn make_fields(&self, phase_group: i32) -> Vec> { vec![ Some(Value::Genotype( diff --git a/src/kplib/kmer.rs b/src/kplib/kmer.rs index 7d0722e..140c10d 100644 --- a/src/kplib/kmer.rs +++ b/src/kplib/kmer.rs @@ -1,3 +1,11 @@ +/// Encodes a nucleotide character into its 2-bit representation. +/// +/// # Parameters +/// - `nuc`: A byte representing the nucleotide character to encode. +/// +/// # Returns +/// A 64-bit unsigned integer representing the binary encoding of the nucleotide. +/// ``` #[inline] fn encode_nuc(nuc: u8) -> u64 { match nuc.to_ascii_uppercase() { @@ -9,7 +17,28 @@ fn encode_nuc(nuc: u8) -> u64 { } } -/// Count kmers in a sequence +/// Converts a DNA sequence into k-mer counts. +/// Optionally compresses homopolymers before counting k-mers. +/// The k-mers are counted as either positive or negative counts based on the `negative` flag. +/// +/// # Parameters +/// - `sequence`: A slice of bytes representing the DNA sequence. +/// - `kmer`: The length of the k-mers to count. +/// - `negative`: If true, the k-mer counts are negative. +/// - `maxhom`: The maximum length of homopolymers; if non-zero, compresses homopolymers before counting k-mers. 
+/// +/// # Returns +/// A vector of k-mer counts represented as floats. +/// +/// # Example +/// ``` +/// let sequence = b"ACGTACGTAC"; +/// let kmer = 3; +/// let negative = false; +/// let maxhom = 2; +/// let kmer_counts = kanpig::seq_to_kmer(sequence, kmer, negative, maxhom); +/// assert_eq!(kmer_counts.len(), 64); // Example length for k=3 +/// ``` pub fn seq_to_kmer(sequence: &[u8], kmer: u8, negative: bool, maxhom: usize) -> Vec { if maxhom != 0 { return seq_to_kmer(&compress_homopolymer(sequence, maxhom), kmer, negative, 0); @@ -51,6 +80,15 @@ pub fn seq_to_kmer(sequence: &[u8], kmer: u8, negative: bool, maxhom: usize) -> kcounts } +/// Compresses sequences of repeated bytes (homopolymers) in the input vector. +/// Limits the length of any sequence of repeated bytes to `maxspan`. +/// +/// # Parameters +/// - `vector`: A slice of bytes to be compressed. +/// - `maxspan`: The maximum allowable length for sequences of repeated bytes. +/// +/// # Returns +/// A vector of bytes with homopolymer lengths limited to `maxspan`. pub fn compress_homopolymer(vector: &[u8], maxspan: usize) -> Vec { let mut result = Vec::new(); let mut count = 0; diff --git a/src/kplib/metrics.rs b/src/kplib/metrics.rs index 434b8bd..7614223 100644 --- a/src/kplib/metrics.rs +++ b/src/kplib/metrics.rs @@ -1,6 +1,17 @@ use ordered_float::OrderedFloat; -/// Canberra distance of featurized kmers +/// Computes the Canberra distance similarity between two featurized k-mer vectors. +/// The similarity is calculated as 1 minus the Canberra distance, providing a measure of similarity between 0 and 1. +/// +/// # Parameters +/// - `a`: A slice of floating-point numbers representing the first k-mer vector. +/// - `b`: A slice of floating-point numbers representing the second k-mer vector. +/// - `mink`: A floating-point threshold below which differences are ignored. 
+/// +/// # Returns +/// A floating-point value representing the similarity between the two vectors: +/// - 1.0 indicates identical vectors. +/// - 0.0 indicates no kmers or maximum dissimilarity. pub fn seqsim(a: &[f32], b: &[f32], mink: f32) -> f32 { let mut deno: f32 = 0.0; let mut neum: f32 = 0.0; @@ -14,12 +25,10 @@ pub fn seqsim(a: &[f32], b: &[f32], mink: f32) -> f32 { neum += (x - y).abs(); } - // no kmers if deno == 0.0 { return 0.0; } - // identical if neum == 0.0 { return 1.0; } @@ -27,8 +36,18 @@ pub fn seqsim(a: &[f32], b: &[f32], mink: f32) -> f32 { 1.0 - (neum / deno) } -/// size similarity of two variant sizes -/// sizes must be positive +/// Computes size similarity +/// The similarity is defined as the ratio of the smaller size to the larger size, +/// with special handling for cases where either size is zero. +/// +/// # Parameters +/// - `size_a`: The first size as a 64-bit unsigned integer. +/// - `size_b`: The second size as a 64-bit unsigned integer. +/// +/// # Returns +/// A floating-point value representing the similarity score between the two sizes. +/// - If both sizes are zero, the function returns 1.0. +/// - Otherwise, the similarity is calculated as the ratio of the smaller size to the larger size. pub fn sizesim(size_a: u64, size_b: u64) -> f32 { debug!("sz: {} <-> {}", size_a, size_b); if ((size_a == 0) || (size_b == 0)) && size_a == size_b { @@ -38,7 +57,18 @@ pub fn sizesim(size_a: u64, size_b: u64) -> f32 { / std::cmp::max(std::cmp::max(size_a, size_b), 1) as f32 } -/// do two intervals overlap +/// Determines if two intervals overlap. +/// Each interval is defined by a start and an end position. +/// The intervals overlap if the maximum of the start positions is less than the minimum of the end positions. +/// +/// # Parameters +/// - `s1`: The start position of the first interval as a 64-bit unsigned integer. +/// - `e1`: The end position of the first interval as a 64-bit unsigned integer. 
+/// - `s2`: The start position of the second interval as a 64-bit unsigned integer. +/// - `e2`: The end position of the second interval as a 64-bit unsigned integer. +/// +/// # Returns +/// A boolean value indicating whether the intervals overlap (`true`) or not (`false`). pub fn overlaps(s1: u64, e1: u64, s2: u64, e2: u64) -> bool { std::cmp::max(s1, s2) < std::cmp::min(e1, e2) } @@ -52,7 +82,19 @@ pub enum GTstate { //Hemi should be a thing } -/// Generate genotypes given observed allele coverages +/// Determines the genotype state based on coverage values for two alternate alleles. +/// The genotype state can be one of three: reference (Ref), heterozygous (Het), or homozygous (Hom). +/// If both coverage values are zero, the state is `Non`. +/// +/// # Parameters +/// - `alt1_cov`: The coverage value for the first alternate allele as a floating-point number. +/// - `alt2_cov`: The coverage value for the second alternate allele as a floating-point number. +/// +/// # Returns +/// A `GTstate` enum value representing the genotype state +/// +/// # Panics +/// This function will panic if an invalid state is encountered, which should be impossible under normal circumstances. pub fn genotyper(alt1_cov: f64, alt2_cov: f64) -> GTstate { if (alt1_cov + alt2_cov) == 0.0 { return GTstate::Non; @@ -72,7 +114,19 @@ pub fn genotyper(alt1_cov: f64, alt2_cov: f64) -> GTstate { ret } -/// Probabilities of each genotype given the allele coveages +/// Calculates genotype scores for three possible genotypes (reference, heterozygous, homozygous) +/// based on the coverage values for two alternate alleles. +/// The scores are adjusted based on the total coverage to account for lower coverage scenarios. +/// +/// # Parameters +/// - `alt1_cov`: The coverage value for the first alternate allele as a floating-point number. +/// - `alt2_cov`: The coverage value for the second alternate allele as a floating-point number. 
+/// +/// # Returns +/// An array of three floating-point values representing the log-probabilities for each genotype: +/// - The first value corresponds to the reference genotype. +/// - The second value corresponds to the heterozygous genotype. +/// - The third value corresponds to the homozygous genotype. fn genotype_scores(alt1_cov: f64, alt2_cov: f64) -> [f64; 3] { // Needs to be more pure for lower coverage let p_alt: &[f64] = if alt1_cov + alt2_cov < 10.0 { @@ -91,8 +145,16 @@ fn genotype_scores(alt1_cov: f64, alt2_cov: f64) -> [f64; 3] { ] } -/// Genotype quality: confidence in the assigned genotype -/// Sample quality: confidence that there is non-reference present +/// Calculates genotype quality (GQ) and sample quality (SQ) based on the coverage values for reference and alternate alleles. +/// +/// # Parameters +/// - `ref_cov`: The coverage value for the reference allele as a floating-point number. +/// - `alt_cov`: The coverage value for the alternate allele as a floating-point number. +/// +/// # Returns +/// A tuple containing two floating-point values: +/// - The first value is the genotype quality (GQ). +/// - The second value is the sample quality (SQ). 
pub fn genotype_quals(ref_cov: f64, alt_cov: f64) -> (f64, f64) { let mut gt_lplist = genotype_scores(ref_cov, alt_cov); gt_lplist.sort_by(|a, b| b.partial_cmp(a).unwrap()); diff --git a/src/kplib/mod.rs b/src/kplib/mod.rs index a9ecda4..3451793 100644 --- a/src/kplib/mod.rs +++ b/src/kplib/mod.rs @@ -42,8 +42,8 @@ pub use crate::kplib::traverse::brute_force_find_path; mod vargraph; pub use crate::kplib::vargraph::{VarNode, Variants}; -mod vcf_traits; -pub use crate::kplib::vcf_traits::{KdpVcf, Svtype}; +mod vcftraits; +pub use crate::kplib::vcftraits::{KdpVcf, Svtype}; mod vcfreader; pub use crate::kplib::vcfreader::VcfChunker; diff --git a/src/kplib/pathscore.rs b/src/kplib/pathscore.rs index 3cfeec1..1b259e6 100644 --- a/src/kplib/pathscore.rs +++ b/src/kplib/pathscore.rs @@ -37,27 +37,6 @@ impl Ord for PathScore { other_ordering => other_ordering, } } - // Sort by size then sequence - /* fn cmp(&self, other: &Self) -> Ordering { - match self - .align_pct - .partial_cmp(&other.align_pct) - .unwrap_or(Ordering::Equal) - { - Ordering::Equal => match self - .sizesim - .partial_cmp(&other.sizesim) - .unwrap_or(Ordering::Equal) - { - Ordering::Equal => self - .seqsim - .partial_cmp(&other.seqsim) - .unwrap_or(Ordering::Equal), - other_ordering => other_ordering, - }, - other_ordering => other_ordering, - } - }*/ } impl PartialOrd for PathScore { diff --git a/src/kplib/regions.rs b/src/kplib/regions.rs index f1e960f..26bf736 100644 --- a/src/kplib/regions.rs +++ b/src/kplib/regions.rs @@ -3,9 +3,10 @@ use noodles_vcf::header::record::value::{map::contig::Contig, Map}; use std::collections::HashMap; use std::collections::VecDeque; +use crate::kplib::BedParser; + pub type ContigMap = IndexMap>; pub type Regions = HashMap>; -use crate::kplib::BedParser; /// create a HashMap with keys of chromsome names and /// values a list of start, end positions with regions diff --git a/src/kplib/traverse.rs b/src/kplib/traverse.rs index 9df4820..fa183a6 100644 --- 
a/src/kplib/traverse.rs +++ b/src/kplib/traverse.rs @@ -52,10 +52,11 @@ pub fn brute_force_find_path( path: vec![], }; let mut stack: BinaryHeap = BinaryHeap::new(); - //let mut stack: Vec = Vec::new(); stack.push(start_path); + let mut best_path = PathScore::default(); let mut npaths = 0; + let partial_haps = target.partial_haplotypes(params.kmer); let snk_node = NodeIndex::new(graph.node_count() - 1); @@ -63,7 +64,6 @@ pub fn brute_force_find_path( // Throw all of cur_node's neighbors on the stack // Except snk_node, which is an indicator that the // current path has ended - //let mut any_push = false; for next_node in graph.edges(cur_path.node).filter_map(|edge| { if skip_edges.contains(&edge.id()) { None @@ -83,7 +83,6 @@ pub fn brute_force_find_path( npaths += 1; } else { let nsize = cur_path.size + graph.node_weight(next_node).unwrap().size; - //any_push = true; let mut npath = cur_path.path.clone(); npath.push(next_node); stack.push(PathNodeState { @@ -95,10 +94,6 @@ pub fn brute_force_find_path( } } - /*if any_push { - stack.sort_by_key(|node| std::cmp::Reverse(node.dist)); - }*/ - if npaths > params.maxpaths { break; } @@ -128,7 +123,6 @@ pub fn get_one_to_one( } else { None } - //(node.size >= size_range_lower) & (node.size <= size_range_upper) & ( }) .collect() } diff --git a/src/kplib/vcf_traits.rs b/src/kplib/vcftraits.rs similarity index 100% rename from src/kplib/vcf_traits.rs rename to src/kplib/vcftraits.rs From dae201b6874eb86513ad2ebea18b422d544c1825 Mon Sep 17 00:00:00 2001 From: Adam English Date: Mon, 27 May 2024 09:57:10 -0400 Subject: [PATCH 36/47] speed boost path-searching, only create path_kfeat if ever needed and reuse the PathNodeState size instead of recomputing. ~8% faster. 
--- experiments/bedtest.sh | 4 ++-- src/kplib/pathscore.rs | 39 ++++++++++++++++++++++----------------- src/kplib/traverse.rs | 20 ++++++++++---------- 3 files changed, 34 insertions(+), 29 deletions(-) diff --git a/experiments/bedtest.sh b/experiments/bedtest.sh index 4ef2cef..5b8abaa 100644 --- a/experiments/bedtest.sh +++ b/experiments/bedtest.sh @@ -3,8 +3,8 @@ bed=test_rs/test.chr20.bed create() { #time kanpig-v0.2.0-x86_64-apple-darwin/kanpig \ - #time ../target/release/kanpig \ - time cargo run --release -- \ + #time cargo run --release -- \ + time ../target/release/kanpig \ --input test_rs/test2.vcf.gz \ --bam /Users/english/code/kanpig/experiments/test_rs/NA24385.chr20.bam \ --reference /Users/english/code/references/grch38/GRCh38_1kg_mainchrs.fa \ diff --git a/src/kplib/pathscore.rs b/src/kplib/pathscore.rs index 1b259e6..2fbebfb 100644 --- a/src/kplib/pathscore.rs +++ b/src/kplib/pathscore.rs @@ -61,26 +61,12 @@ impl PathScore { pub fn new( graph: &DiGraph, path: Vec, + path_size: i64, targets: &Vec, target_size: i64, params: &KDParams, ) -> Self { - let path_size: i64 = path - .iter() - .filter_map(|&node_index| graph.node_weight(node_index)) - .map(|x| x.size) - .sum(); - - let path_k: Vec = path - .iter() - .filter_map(|&node_index| graph.node_weight(node_index)) - .map(|x| x.kfeat.as_ref()) - .fold( - vec![0f32; 4_usize.pow(params.kmer.into())], - |acc: Vec, other: &Vec| { - acc.iter().zip(other).map(|(x, y)| x + y).collect() - }, - ); + let mut path_k: Option> = None; // Return the partials in order from all to least for hap_parts in targets { @@ -94,7 +80,26 @@ impl PathScore { continue; } - let seqsim = metrics::seqsim(&path_k, &hap_parts.kfeat, params.minkfreq as f32); + if !path_k.is_some() { + // only make if it is ever needed + path_k = Some( + path.iter() + .filter_map(|&node_index| graph.node_weight(node_index)) + .map(|x| x.kfeat.as_ref()) + .fold( + vec![0f32; 4_usize.pow(params.kmer.into())], + |acc: Vec, other: &Vec| { + 
acc.iter().zip(other).map(|(x, y)| x + y).collect() + }, + ), + ); + } + + let seqsim = metrics::seqsim( + path_k.as_ref().unwrap(), + &hap_parts.kfeat, + params.minkfreq as f32, + ); debug!("sqsim: {}", seqsim); if seqsim < params.seqsim { continue; diff --git a/src/kplib/traverse.rs b/src/kplib/traverse.rs index fa183a6..e8da502 100644 --- a/src/kplib/traverse.rs +++ b/src/kplib/traverse.rs @@ -45,20 +45,18 @@ pub fn brute_force_find_path( params: &KDParams, skip_edges: &[EdgeIndex], ) -> PathScore { - let start_path = PathNodeState { + let mut npaths = 0; + let mut best_path = PathScore::default(); + let snk_node = NodeIndex::new(graph.node_count() - 1); + let partial_haps = target.partial_haplotypes(params.kmer); + + let mut stack: BinaryHeap = BinaryHeap::new(); + stack.push(PathNodeState { dist: target.size.unsigned_abs(), // this is for sorting size: 0, node: NodeIndex::new(0), path: vec![], - }; - let mut stack: BinaryHeap = BinaryHeap::new(); - stack.push(start_path); - - let mut best_path = PathScore::default(); - let mut npaths = 0; - - let partial_haps = target.partial_haplotypes(params.kmer); - let snk_node = NodeIndex::new(graph.node_count() - 1); + }); while let Some(cur_path) = stack.pop() { // Throw all of cur_node's neighbors on the stack @@ -75,6 +73,7 @@ pub fn brute_force_find_path( best_path = best_path.max(PathScore::new( graph, cur_path.path.clone(), + cur_path.size, &partial_haps, target.size, params, @@ -114,6 +113,7 @@ pub fn get_one_to_one( let candidate = PathScore::new( graph, vec![target_node], + graph.node_weight(target_node).unwrap().size, vec![target.clone()].as_ref(), target.size, params, From 6b7b632918250de9885fa4036038aa9a35ba02a4 Mon Sep 17 00:00:00 2001 From: Adam English Date: Mon, 27 May 2024 10:22:24 -0400 Subject: [PATCH 37/47] slight optimization traverse is the performance critical section of the code, so any speed we can get from that helps --- src/kplib/metrics.rs | 20 ++++++++++---------- src/kplib/pathscore.rs | 4 
++-- src/kplib/traverse.rs | 2 +- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/kplib/metrics.rs b/src/kplib/metrics.rs index 7614223..d346d2e 100644 --- a/src/kplib/metrics.rs +++ b/src/kplib/metrics.rs @@ -15,14 +15,14 @@ use ordered_float::OrderedFloat; pub fn seqsim(a: &[f32], b: &[f32], mink: f32) -> f32 { let mut deno: f32 = 0.0; let mut neum: f32 = 0.0; + let mut total_d: f32; + for (&x, &y) in a.iter().zip(b.iter()) { - let d = x.abs() + y.abs(); - if d <= mink { - continue; + total_d = x.abs() + y.abs(); + if total_d > mink { + deno += total_d; + neum += (x - y).abs(); } - deno += d; - - neum += (x - y).abs(); } if deno == 0.0 { @@ -49,12 +49,12 @@ pub fn seqsim(a: &[f32], b: &[f32], mink: f32) -> f32 { /// - If both sizes are zero, the function returns 1.0. /// - Otherwise, the similarity is calculated as the ratio of the smaller size to the larger size. pub fn sizesim(size_a: u64, size_b: u64) -> f32 { - debug!("sz: {} <-> {}", size_a, size_b); - if ((size_a == 0) || (size_b == 0)) && size_a == size_b { + if size_a == 0 && size_b == 0 { return 1.0; } - std::cmp::max(std::cmp::min(size_a, size_b), 1) as f32 - / std::cmp::max(std::cmp::max(size_a, size_b), 1) as f32 + let min_size = size_a.min(size_b).max(1) as f32; + let max_size = size_a.max(size_b).max(1) as f32; + min_size / max_size } /// Determines if two intervals overlap. 
diff --git a/src/kplib/pathscore.rs b/src/kplib/pathscore.rs index 2fbebfb..98a4f79 100644 --- a/src/kplib/pathscore.rs +++ b/src/kplib/pathscore.rs @@ -75,7 +75,7 @@ impl PathScore { } let sizesim = metrics::sizesim(path_size.unsigned_abs(), hap_parts.size.unsigned_abs()); - debug!("szsim: {}", sizesim); + //debug!("szsim: {}", sizesim); if sizesim < params.sizesim { continue; } @@ -100,7 +100,7 @@ impl PathScore { &hap_parts.kfeat, params.minkfreq as f32, ); - debug!("sqsim: {}", seqsim); + //debug!("sqsim: {}", seqsim); if seqsim < params.seqsim { continue; } diff --git a/src/kplib/traverse.rs b/src/kplib/traverse.rs index e8da502..642f165 100644 --- a/src/kplib/traverse.rs +++ b/src/kplib/traverse.rs @@ -78,7 +78,6 @@ pub fn brute_force_find_path( target.size, params, )); - debug!("best path {:?}", best_path); npaths += 1; } else { let nsize = cur_path.size + graph.node_weight(next_node).unwrap().size; @@ -98,6 +97,7 @@ pub fn brute_force_find_path( } } + debug!("best path {:?}", best_path); best_path } From 3dce53cf232b0ebbea09b788cf17dd37ab038b79 Mon Sep 17 00:00:00 2001 From: Adam English Date: Mon, 27 May 2024 10:43:51 -0400 Subject: [PATCH 38/47] sizesim fix --- src/kplib/metrics.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kplib/metrics.rs b/src/kplib/metrics.rs index d346d2e..8a2e505 100644 --- a/src/kplib/metrics.rs +++ b/src/kplib/metrics.rs @@ -49,7 +49,7 @@ pub fn seqsim(a: &[f32], b: &[f32], mink: f32) -> f32 { /// - If both sizes are zero, the function returns 1.0. /// - Otherwise, the similarity is calculated as the ratio of the smaller size to the larger size. 
pub fn sizesim(size_a: u64, size_b: u64) -> f32 { - if size_a == 0 && size_b == 0 { + if size_a == size_b { return 1.0; } let min_size = size_a.min(size_b).max(1) as f32; From 862f409cb85994e2e2a11b388578cac9e8168578 Mon Sep 17 00:00:00 2001 From: Adam English Date: Mon, 27 May 2024 12:46:39 -0400 Subject: [PATCH 39/47] default buffer on stdout --- src/kplib/vcfwriter.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/kplib/vcfwriter.rs b/src/kplib/vcfwriter.rs index 99443b1..b4ff4e6 100644 --- a/src/kplib/vcfwriter.rs +++ b/src/kplib/vcfwriter.rs @@ -134,13 +134,13 @@ impl VcfWriter { all_formats.insert(ssid.to_string(), ssfmt); // Ready to make files - let m_page = page_size::get() * 1000; let out_buf: Box = match out_path { Some(ref path) => { + let m_page = page_size::get() * 1000; let file = File::create(path).expect("Error Creating Output File"); Box::new(BufWriter::with_capacity(m_page, file)) } - None => Box::new(BufWriter::with_capacity(m_page, std::io::stdout())), + None => Box::new(BufWriter::new(std::io::stdout())), }; let mut writer = vcf::io::Writer::new(out_buf); let _ = writer.write_header(&header); From f4db6c69b7e3ba8d232ff36ce8c57b965bb577c2 Mon Sep 17 00:00:00 2001 From: Adam English Date: Mon, 27 May 2024 14:19:32 -0400 Subject: [PATCH 40/47] clippy --- src/kplib/pathscore.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kplib/pathscore.rs b/src/kplib/pathscore.rs index 98a4f79..cbdaf1d 100644 --- a/src/kplib/pathscore.rs +++ b/src/kplib/pathscore.rs @@ -80,7 +80,7 @@ impl PathScore { continue; } - if !path_k.is_some() { + if path_k.is_none() { // only make if it is ever needed path_k = Some( path.iter() From 7d7b23b12db9453321aed162f89890bb75b041e1 Mon Sep 17 00:00:00 2001 From: Adam English Date: Tue, 28 May 2024 00:13:43 -0400 Subject: [PATCH 41/47] argument validation, fewer happarts errors for invalid arguments. 
haplotype parts capped to 100 --- src/kplib/cli.rs | 30 ++++++++++++++++++++++++++++++ src/kplib/haplotype.rs | 2 +- src/kplib/vcftraits.rs | 6 +++--- 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/src/kplib/cli.rs b/src/kplib/cli.rs index bf0daeb..d3c8256 100644 --- a/src/kplib/cli.rs +++ b/src/kplib/cli.rs @@ -165,6 +165,36 @@ impl ArgParser { warn!("--kmer above 8 becomes memory intensive"); } + if self.kd.kmer < 1 { + error!("--kmer must be at least 1"); + is_ok = false; + } + + if self.kd.sizesim < 0.0 || self.kd.sizesim > 1.0 { + error!("--sizesim must be between 0.0 and 1.0"); + is_ok = false; + } + + if self.kd.seqsim < 0.0 || self.kd.seqsim > 1.0 { + error!("--seqsim must be between 0.0 and 1.0"); + is_ok = false; + } + + if self.kd.hapsim < 0.0 || self.kd.hapsim >= 1.0 { + error!("--hapsim must be between 0.0 and 1.0"); + is_ok = false; + } + + if self.kd.maxpaths < 1 { + error!("--maxpaths must be at least 1"); + is_ok = false; + } + + if self.io.threads < 1 { + error!("--threads must be at least 1"); + is_ok = false; + } + is_ok } } diff --git a/src/kplib/haplotype.rs b/src/kplib/haplotype.rs index 116b7dd..982c78f 100644 --- a/src/kplib/haplotype.rs +++ b/src/kplib/haplotype.rs @@ -52,7 +52,7 @@ impl Haplotype { pub fn partial_haplotypes(&self, kmer: u8) -> Vec { // Consider putting these as parameters. But really we just need smarter clustering let max_fns = 3; // Most number of false-negatives we'll attempt to apply to the graph - let max_parts = 500; // Most number of pileups we'll even attempt to split + let max_parts = 100; // Most number of pileups we'll even attempt to split // If its more than this, we can't evaluate the region reasonably, // anyway. 
let mut ret = vec![]; diff --git a/src/kplib/vcftraits.rs b/src/kplib/vcftraits.rs index 79cc9df..2b1d23b 100644 --- a/src/kplib/vcftraits.rs +++ b/src/kplib/vcftraits.rs @@ -41,7 +41,7 @@ pub trait KdpVcf { impl KdpVcf for RecordBuf { /// Convert variant sequence to Kfeat fn to_kfeat(&self, kmer: u8, maxhom: usize) -> (Vec, i64) { - let ref_seq = self.reference_bases().to_string(); + let ref_seq = self.reference_bases(); let alt_seq = self.get_alt(); let size = alt_seq.len() as i64 - ref_seq.len() as i64; @@ -101,8 +101,8 @@ impl KdpVcf for RecordBuf { let alts = self.alternate_bases(); match alts.len() { 0 => ".", - _ => alts.iter().next().expect("I just checked").unwrap(), - //.to_string(), // I don't like all this String when str should be simplier + 1 => alts.iter().next().expect("I just checked").unwrap(), + _ => panic!("multi-allelic records not supported"), } } } From 48bd6beef3626a93dba34e58f89b0fb9a5840338 Mon Sep 17 00:00:00 2001 From: Adam English Date: Wed, 29 May 2024 02:14:45 -0400 Subject: [PATCH 42/47] compressing notebooks --- experiments/Development.ipynb | 99 +++++------------------------------ 1 file changed, 13 insertions(+), 86 deletions(-) diff --git a/experiments/Development.ipynb b/experiments/Development.ipynb index d3c4e11..b393e46 100644 --- a/experiments/Development.ipynb +++ b/experiments/Development.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 179, + "execution_count": null, "id": "e63d221c-abfa-4d5d-b5df-94d8b33398fb", "metadata": {}, "outputs": [], @@ -14,7 +14,7 @@ }, { "cell_type": "code", - "execution_count": 180, + "execution_count": null, "id": "5a3008ef-fee9-45b7-8e86-aad8cd022e13", "metadata": {}, "outputs": [], @@ -25,7 +25,7 @@ }, { "cell_type": "code", - "execution_count": 181, + "execution_count": null, "id": "70f8aaef-09a9-4aa2-b6a8-3e2e0da5fa4d", "metadata": {}, "outputs": [], @@ -35,72 +35,30 @@ }, { "cell_type": "code", - "execution_count": 182, + "execution_count": null, "id": 
"46f87f7c-833e-484d-8815-811f50ccb94f", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "904" - ] - }, - "execution_count": 182, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "len(data)" ] }, { "cell_type": "code", - "execution_count": 176, + "execution_count": null, "id": "dc315ca1-8e34-4754-aec2-21de1614cf31", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "PearsonRResult(statistic=0.953444540233332, pvalue=0.0)\n" - ] - } - ], + "outputs": [], "source": [ "print(pearsonr(data['seqsim'], data['pj']))" ] }, { "cell_type": "code", - "execution_count": 183, + "execution_count": null, "id": "3a1650ae-4f28-4f0b-b433-8da399f35c2b", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[Text(0.5, 1.0, 'Kmer estimate of sequence similarity'),\n", - " Text(0.5, 0, 'Sequence Similarity'),\n", - " Text(0, 0.5, 'Kmer Count Similarity')]" - ] - }, - "execution_count": 183, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import seaborn as sb\n", "p = sb.scatterplot(data=data, x='seqsim', y='pj', hue='bin_len')\n", @@ -109,51 +67,20 @@ }, { "cell_type": "code", - "execution_count": 166, + "execution_count": null, "id": "768670af-cc99-4264-b966-6874bee29473", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "PearsonRResult(statistic=0.9361582055174525, pvalue=0.0)\n" - ] - } - ], + "outputs": [], "source": [ "print(pearsonr(data['unroll'], data['pj']))" ] }, { "cell_type": "code", - "execution_count": 178, + "execution_count": null, "id": "b0373a3f-55ce-47ec-9867-07bfff9c99e0", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[Text(0.5, 1.0, 'Kmer estimate of unroll sequence similarity'),\n", - " Text(0.5, 0, 'Unroll Sequence Similarity'),\n", - " Text(0, 0.5, 'Kmer Count Similarity')]" - ] - }, - "execution_count": 178, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import seaborn as sb\n", "p = sb.scatterplot(data=data, x='unroll', y='pj', hue='bin_len')\n", From 79bacb677c731bc3adf6e04128af1f91802dbedb Mon Sep 17 00:00:00 2001 From: Adam English Date: Mon, 10 Jun 2024 14:13:12 -0400 Subject: [PATCH 43/47] hapsim correction 1.0 is allowed --- src/kplib/cli.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kplib/cli.rs b/src/kplib/cli.rs index d3c8256..4885467 100644 --- a/src/kplib/cli.rs +++ b/src/kplib/cli.rs @@ -180,7 +180,7 @@ impl ArgParser { is_ok = false; } - if self.kd.hapsim < 0.0 || self.kd.hapsim >= 1.0 { + if self.kd.hapsim < 0.0 || self.kd.hapsim > 1.0 { error!("--hapsim must be between 0.0 and 1.0"); is_ok = false; } From d8b53c39365c4ba818e1cbfe60c20af289de1664 Mon Sep 17 00:00:00 2001 From: Adam English Date: Tue, 11 Jun 2024 11:27:47 -0400 Subject: [PATCH 44/47] annotation fixing partial is now just a binary. FT flags are fixed. 
SQ is now informative --- experiments/bedtest.sh | 14 ++++++------- src/kplib/annotator.rs | 46 ++++++++++++++++++++++++++++-------------- src/kplib/metrics.rs | 10 ++------- src/kplib/pathscore.rs | 19 ++++++++--------- src/kplib/traverse.rs | 2 -- 5 files changed, 48 insertions(+), 43 deletions(-) diff --git a/experiments/bedtest.sh b/experiments/bedtest.sh index 5b8abaa..579293f 100644 --- a/experiments/bedtest.sh +++ b/experiments/bedtest.sh @@ -2,18 +2,18 @@ set -e bed=test_rs/test.chr20.bed create() { + #time ../target/release/kanpig \ #time kanpig-v0.2.0-x86_64-apple-darwin/kanpig \ - #time cargo run --release -- \ - time ../target/release/kanpig \ + time cargo run --release -- \ --input test_rs/test2.vcf.gz \ --bam /Users/english/code/kanpig/experiments/test_rs/NA24385.chr20.bam \ --reference /Users/english/code/references/grch38/GRCh38_1kg_mainchrs.fa \ --sizemin 50 \ --sizesim 0.95 --seqsim 0.90 --threads 5 \ - --maxpaths 1000 --mapq 5 --hapsim 0.98 \ + --maxpaths 1000 --mapq 20 --hapsim 0.98 \ --chunksize 100 --maxhom 0 \ --sample doesthiswork \ - --bed $bed -o test_rs/hc.vcf.gz + --bed $bed -o test_rs/hc.vcf #| bcftools sort -O z -o test_rs/hc.vcf.gz # --bed /Users/english/code/kanpig/test/GRCh38_HG002-T2TQ100-V1.0_stvar.benchmark.bed \ # --bam /Users/english/code/kanpig/experiments/test_rs/GIABHG002.bam \ @@ -41,8 +41,8 @@ bench_full() { } create -#bcftools sort -O z -o test_rs/hc.vcf.gz test_rs/hc.vcf -#tabix test_rs/hc.vcf.gz +bcftools sort -O z -o test_rs/hc.vcf.gz test_rs/hc.vcf +tabix test_rs/hc.vcf.gz #bench_lite -#bench_medium +bench_medium #bench_full diff --git a/src/kplib/annotator.rs b/src/kplib/annotator.rs index 31132e0..5a40528 100644 --- a/src/kplib/annotator.rs +++ b/src/kplib/annotator.rs @@ -10,13 +10,13 @@ use noodles_vcf::{ bitflags! 
{ pub struct FiltFlags: u32 { - const PASS = 0x0; // passing - const GTMISMATCH = 0x1; // genotype from AD doesn't match path genotype - const LOWGQ = 0x2; // genotype quality below 5 - const LOWCOV = 0x4; // coverage below 5 - const LOWSQ = 0x8; // sample quality below below 5 (only on non-ref genotypes) - const LOWALT = 0x16; // alt coverage below 5 (only on non-ref genotypes) - const PARTIAL = 0x32; // best scoring path only used part of the haplotype + const PASS = 0b00000000; // passing + const GTMISMATCH = 0b00000001; // genotype from AD doesn't match path genotype + const LOWGQ = 0b00000010; // genotype quality below 5 + const LOWCOV = 0b00000100; // coverage below 5 + const LOWSQ = 0b00001000; // sample quality below below 5 (only on non-ref genotypes) + const LOWALT = 0b00010000; // alt coverage below 5 (only on non-ref genotypes) + const PARTIAL = 0b00100000; // best scoring path only used part of the haplotype } } @@ -96,19 +96,35 @@ fn diploid( let path1 = &paths[0]; let path2 = &paths[1]; - let (gt_str, gt_path, alt_cov) = + let (gt_str, gt_path, alt_cov, full_target) = match (path1.path.contains(var_idx), path2.path.contains(var_idx)) { (true, true) if path1 != path2 => ( "1|1", metrics::GTstate::Hom, (path1.coverage.unwrap() + path2.coverage.unwrap()) as f64, + path1.full_target && path2.full_target, ), // sometimes I used the same path twice - (true, true) => ("1|1", metrics::GTstate::Hom, path1.coverage.unwrap() as f64), - (true, false) => ("1|0", metrics::GTstate::Het, path1.coverage.unwrap() as f64), - (false, true) => ("0|1", metrics::GTstate::Het, path2.coverage.unwrap() as f64), - (false, false) if coverage != 0 => ("0|0", metrics::GTstate::Ref, 0.0), - (false, false) => ("./.", metrics::GTstate::Non, 0.0), + (true, true) => ( + "1|1", + metrics::GTstate::Hom, + path1.coverage.unwrap() as f64, + path1.full_target, + ), + (true, false) => ( + "1|0", + metrics::GTstate::Het, + path1.coverage.unwrap() as f64, + path1.full_target, + ), + (false, 
true) => ( + "0|1", + metrics::GTstate::Het, + path2.coverage.unwrap() as f64, + path2.full_target, + ), + (false, false) if coverage != 0 => ("0|0", metrics::GTstate::Ref, 0.0, true), + (false, false) => ("./.", metrics::GTstate::Non, 0.0, true), }; let ref_cov = (coverage as f64) - alt_cov; @@ -147,7 +163,7 @@ fn diploid( filt |= FiltFlags::LOWALT; } } - if (path1.align_pct != 1.0) || (path2.align_pct != 1.0) { + if !full_target { filt |= FiltFlags::PARTIAL; } @@ -221,7 +237,7 @@ fn haploid( filt |= FiltFlags::LOWALT; } } - if path1.align_pct != 1.0 { + if !path1.full_target { filt |= FiltFlags::PARTIAL; } diff --git a/src/kplib/metrics.rs b/src/kplib/metrics.rs index 8a2e505..21ec09f 100644 --- a/src/kplib/metrics.rs +++ b/src/kplib/metrics.rs @@ -157,19 +157,13 @@ fn genotype_scores(alt1_cov: f64, alt2_cov: f64) -> [f64; 3] { /// - The second value is the sample quality (SQ). pub fn genotype_quals(ref_cov: f64, alt_cov: f64) -> (f64, f64) { let mut gt_lplist = genotype_scores(ref_cov, alt_cov); - gt_lplist.sort_by(|a, b| b.partial_cmp(a).unwrap()); + let sq = f64::min((-10.0 * (gt_lplist[0] - (gt_lplist[1] + gt_lplist[2]))).abs(), 100.0); + gt_lplist.sort_by(|a, b| b.partial_cmp(a).unwrap()); let best = gt_lplist[0]; let second_best = gt_lplist[1]; let gq = f64::min(-10.0 * (second_best - best), 100.0); - let mut gt_sum = 0.0; - for gt in &gt_lplist { - gt_sum += 10.0_f64.powf(*gt); - } - - let gt_sum_log = gt_sum.log10(); - let sq = f64::min((-10.0 * (gt_lplist[0] - gt_sum_log)).abs(), 100.0); (gq, sq) } diff --git a/src/kplib/pathscore.rs b/src/kplib/pathscore.rs index cbdaf1d..d905e02 100644 --- a/src/kplib/pathscore.rs +++ b/src/kplib/pathscore.rs @@ -8,14 +8,14 @@ pub struct PathScore { pub seqsim: f32, pub coverage: Option, pub path: Vec, - pub align_pct: f32, // percent of the haplotype used + pub full_target: bool, // is this path against the full target } impl Eq for PathScore {} impl PartialEq for PathScore { fn eq(&self, other: &Self) -> bool { - 
self.align_pct == other.align_pct + self.full_target == other.full_target && self.sizesim == other.sizesim && self.seqsim == other.seqsim } @@ -25,8 +25,8 @@ impl Ord for PathScore { // Sort by mean of size and sequence fn cmp(&self, other: &Self) -> Ordering { match self - .align_pct - .partial_cmp(&other.align_pct) + .full_target + .partial_cmp(&other.full_target) .unwrap_or(Ordering::Equal) { Ordering::Equal => { @@ -52,7 +52,7 @@ impl Default for PathScore { sizesim: 0.0, seqsim: 0.0, coverage: None, - align_pct: 0.0, + full_target: false, } } } @@ -62,14 +62,13 @@ impl PathScore { graph: &DiGraph, path: Vec, path_size: i64, - targets: &Vec, - target_size: i64, + targets: &[Haplotype], params: &KDParams, ) -> Self { let mut path_k: Option> = None; // Return the partials in order from all to least - for hap_parts in targets { + for (i, hap_parts) in targets.iter().enumerate() { if path_size.signum() != hap_parts.size.signum() { continue; } @@ -111,9 +110,7 @@ impl PathScore { sizesim, seqsim, coverage: None, - align_pct: (hap_parts.size.unsigned_abs() as f32 - / target_size.unsigned_abs() as f32) - .abs(), + full_target: i == 0, }; } PathScore::default() diff --git a/src/kplib/traverse.rs b/src/kplib/traverse.rs index 642f165..9220408 100644 --- a/src/kplib/traverse.rs +++ b/src/kplib/traverse.rs @@ -75,7 +75,6 @@ pub fn brute_force_find_path( cur_path.path.clone(), cur_path.size, &partial_haps, - target.size, params, )); npaths += 1; @@ -115,7 +114,6 @@ pub fn get_one_to_one( vec![target_node], graph.node_weight(target_node).unwrap().size, vec![target.clone()].as_ref(), - target.size, params, ); if candidate.seqsim > 0.0 { From b389ab4669376cc9918376284ab258ce6dbcdac7 Mon Sep 17 00:00:00 2001 From: Adam English Date: Tue, 11 Jun 2024 12:02:50 -0400 Subject: [PATCH 45/47] SQ fix Better represents whether an alt is present --- src/kplib/metrics.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/kplib/metrics.rs 
b/src/kplib/metrics.rs index 21ec09f..9b69688 100644 --- a/src/kplib/metrics.rs +++ b/src/kplib/metrics.rs @@ -157,7 +157,13 @@ fn genotype_scores(alt1_cov: f64, alt2_cov: f64) -> [f64; 3] { /// - The second value is the sample quality (SQ). pub fn genotype_quals(ref_cov: f64, alt_cov: f64) -> (f64, f64) { let mut gt_lplist = genotype_scores(ref_cov, alt_cov); - let sq = f64::min((-10.0 * (gt_lplist[0] - (gt_lplist[1] + gt_lplist[2]))).abs(), 100.0); + + let mut gt_sum = 0.0; + for gt in &gt_lplist { + gt_sum += 10.0_f64.powf(*gt); + } + let gt_sum_log = gt_sum.log10(); + let sq = f64::min((-10.0 * (gt_lplist[0] - gt_sum_log)).abs(), 100.0); gt_lplist.sort_by(|a, b| b.partial_cmp(a).unwrap()); let best = gt_lplist[0]; From b4b572b11073bc4a676a8f86d445c0f47e0f1340 Mon Sep 17 00:00:00 2001 From: Adam English Date: Tue, 11 Jun 2024 14:14:27 -0500 Subject: [PATCH 46/47] experimental ignore errors When the hap1 is non-ref, has low coverage, and isn't applied to any paths, use the ref/alt coverage to genotype. 
--- src/kplib/annotator.rs | 16 ++++++++++++++-- src/kplib/pathscore.rs | 9 ++++++++- src/kplib/vargraph.rs | 1 + 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/src/kplib/annotator.rs b/src/kplib/annotator.rs index 5a40528..cd9f6c5 100644 --- a/src/kplib/annotator.rs +++ b/src/kplib/annotator.rs @@ -96,7 +96,7 @@ fn diploid( let path1 = &paths[0]; let path2 = &paths[1]; - let (gt_str, gt_path, alt_cov, full_target) = + let (mut gt_str, gt_path, alt_cov, full_target) = match (path1.path.contains(var_idx), path2.path.contains(var_idx)) { (true, true) if path1 != path2 => ( "1|1", @@ -126,9 +126,21 @@ fn diploid( (false, false) if coverage != 0 => ("0|0", metrics::GTstate::Ref, 0.0, true), (false, false) => ("./.", metrics::GTstate::Non, 0.0, true), }; - let ref_cov = (coverage as f64) - alt_cov; let gt_obs = metrics::genotyper(ref_cov, alt_cov); + + // Alt haplotypes without a path can be filtered if we think it is + // more likely to be an error + let bcov = path1.coverage.unwrap() as f64; + if !path1.is_ref && *path1 == PathScore::default() && bcov < ref_cov && gt_path != gt_obs { + gt_str = match gt_obs { + metrics::GTstate::Ref => "0|0", + metrics::GTstate::Het => "0|1", + metrics::GTstate::Hom => "1|1", + _ => "./.", + }; + } + // we're now assuming that ref/alt are the coverages used for these genotypes. 
no bueno let (gq, sq) = metrics::genotype_quals(ref_cov, alt_cov); diff --git a/src/kplib/pathscore.rs b/src/kplib/pathscore.rs index d905e02..19be4ce 100644 --- a/src/kplib/pathscore.rs +++ b/src/kplib/pathscore.rs @@ -9,6 +9,7 @@ pub struct PathScore { pub coverage: Option, pub path: Vec, pub full_target: bool, // is this path against the full target + pub is_ref: bool, // This path tried to use a reference allele } impl Eq for PathScore {} @@ -53,6 +54,7 @@ impl Default for PathScore { seqsim: 0.0, coverage: None, full_target: false, + is_ref: true, } } } @@ -111,8 +113,13 @@ impl PathScore { seqsim, coverage: None, full_target: i == 0, + is_ref: false, }; } - PathScore::default() + PathScore { + is_ref: false, + ..Default::default() + } + //PathScore::default() } } diff --git a/src/kplib/vargraph.rs b/src/kplib/vargraph.rs index 4d8d6fd..bb07a86 100644 --- a/src/kplib/vargraph.rs +++ b/src/kplib/vargraph.rs @@ -131,6 +131,7 @@ impl Variants { if hap.n == 0 { return PathScore { coverage: Some(hap.coverage), + is_ref: true, ..Default::default() }; } From 3b10b52a82e96d3d769a5085686dd1b0c00d17c8 Mon Sep 17 00:00:00 2001 From: Adam English Date: Tue, 11 Jun 2024 15:55:40 -0400 Subject: [PATCH 47/47] version bump --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index a274f41..81759fc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "kanpig" -version = "0.2.1-dev" +version = "0.3.0" edition = "2021" [dependencies]