diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2ArgumentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2ArgumentCollection.java
index 1e5d86162b4..cda2de8c986 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2ArgumentCollection.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2ArgumentCollection.java
@@ -79,6 +79,7 @@ public class M2ArgumentCollection extends AssemblyBasedCallerArgumentCollection
     public static final String MUTECT3_REF_DOWNSAMPLE_LONG_NAME = "mutect3-ref-downsample";
     public static final String MUTECT3_ALT_DOWNSAMPLE_LONG_NAME = "mutect3-alt-downsample";
     public static final String MUTECT3_DATASET_LONG_NAME = "mutect3-dataset";
+    public static final String MUTECT3_TRAINING_TRUTH_LONG_NAME = "mutect3-training-truth";
 
     public static final int DEFAULT_MUTECT3_REF_DOWNSAMPLE = 10;
     public static final int DEFAULT_MUTECT3_ALT_DOWNSAMPLE = 20;
@@ -203,6 +204,14 @@ public double getDefaultAlleleFrequency() {
     @Argument(fullName = MUTECT3_DATASET_LONG_NAME, optional = true, doc="Destination for Mutect3 data collection")
     public File mutect3Dataset;
 
+    /**
+     * VCF of known calls for a sample used for generating a Mutect3 training dataset.  Unfiltered variants contained
+     * in this VCF are considered good; other variants are considered artifacts.  If this VCF is not given, the dataset
+     * is generated with a weak-labelling strategy based on allele fractions.
+     */
+    @Argument(fullName = MUTECT3_TRAINING_TRUTH_LONG_NAME, doc="VCF file of known variants for labeling Mutect3 training data", optional = true)
+    public FeatureInput<VariantContext> mutect3TrainingTruth;
+
     /**
      * Only variants with tumor LODs exceeding this threshold will be written to the VCF, regardless of filter status.
      * Set to less than or equal to tumor_lod. Increase argument value to reduce false positives in the callset.
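For context, a minimal sketch of the labeling rule the new --mutect3-training-truth argument enables (implemented in Mutect3DatasetEngine below): alt alleles carried by unfiltered records in the truth VCF are labeled as real variants, and any other candidate at a training site is treated as a probable artifact. The helper name is hypothetical; the stream logic simply mirrors the patch.

    import htsjdk.variant.variantcontext.Allele;
    import htsjdk.variant.variantcontext.VariantContext;
    import java.util.List;
    import java.util.Optional;

    // hypothetical helper, not part of the patch: does the truth VCF vouch for this alt allele?
    static boolean isTruthAllele(final Optional<List<VariantContext>> truthVCs, final Allele altAllele) {
        return truthVCs.isPresent() && truthVCs.get().stream()
                .filter(vc -> !vc.isFiltered())                       // only unfiltered truth calls count as real variants
                .flatMap(vc -> vc.getAlternateAlleles().stream())
                .anyMatch(altAllele::equals);
    }
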
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect3DatasetEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect3DatasetEngine.java
index 0c4c31d232e..0656b9442eb 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect3DatasetEngine.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect3DatasetEngine.java
@@ -4,7 +4,9 @@
 import htsjdk.variant.variantcontext.Genotype;
 import htsjdk.variant.variantcontext.VariantContext;
 import org.apache.commons.lang3.tuple.Triple;
+import org.apache.commons.math3.util.CombinatoricsUtils;
 import org.apache.commons.math3.util.FastMath;
+import org.broadinstitute.hellbender.engine.FeatureContext;
 import org.broadinstitute.hellbender.engine.ReferenceContext;
 import org.broadinstitute.hellbender.exceptions.UserException;
 import org.broadinstitute.hellbender.tools.walkers.annotator.AssemblyComplexity;
@@ -13,8 +15,10 @@
 import org.broadinstitute.hellbender.tools.walkers.mutect.filtering.Mutect2FilteringEngine;
 import org.broadinstitute.hellbender.utils.IndexRange;
 import org.broadinstitute.hellbender.utils.MathUtils;
+import org.broadinstitute.hellbender.utils.NaturalLogUtils;
 import org.broadinstitute.hellbender.utils.Utils;
 import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
+import org.broadinstitute.hellbender.utils.genotyper.LikelihoodMatrix;
 import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
 import org.broadinstitute.hellbender.utils.read.Fragment;
 import org.broadinstitute.hellbender.utils.read.GATKRead;
@@ -101,8 +105,10 @@ public Mutect3DatasetEngine(final File datasetFile, final boolean trainingMode,
     }
 
     // add one datum per alt allele
-    public void addData(final ReferenceContext ref, final VariantContext vc, final AlleleLikelihoods<GATKRead, Allele> likelihoods,
-                        final AlleleLikelihoods<Fragment, Haplotype> logFragmentLikelihoods) {
+    public void addData(final ReferenceContext ref, final VariantContext vc, Optional<List<VariantContext>> truthVCs,
+                        final AlleleLikelihoods<GATKRead, Allele> likelihoods,
+                        final AlleleLikelihoods<Fragment, Haplotype> logFragmentLikelihoods,
+                        final AlleleLikelihoods<Fragment, Allele> logFragmentAlleleLikelihoods) {
         final String refBases = ReferenceBases.annotate(ref, vc);
         final String refAllele = vc.getReference().getBaseString();
         final String contig = vc.getContig();
@@ -115,6 +121,12 @@ public void addData(final ReferenceContext ref, final VariantContext vc, final A
         final double[] popafs = VariantContextGetters.getAttributeAsDoubleArray(vc, GATKVCFConstants.POPULATION_AF_KEY);
         //final double[] altPopulationAFs = MathUtils.applyToArray(popafs, x -> Math.pow(10, -x ));
         final double[] tumorLods = Mutect2FilteringEngine.getTumorLogOdds(vc);
+
+        // These ADs, which later make up the pre-downsampling depths, come from the genotype AD field applied by Mutect2.
+        // This means that uninformative reads are not discarded; rather, the expected non-integral ADs are rounded to
+        // the nearest integer.  Also, these ADs are *fragment* ADs from the log fragment-allele likelihoods in the
+        // SomaticGenotypingEngine.  For consistency, the seq error likelihood must also be calculated from every fragment
+        // in the fragment-allele log likelihoods matrix.
         final int[] tumorADs = sumADsOverSamples(vc, tumorSamples);
         final int[] normalADs = sumADsOverSamples(vc, normalSamples);
         final int tumorDepth = (int) MathUtils.sum(tumorADs);
@@ -127,32 +139,41 @@ public void addData(final ReferenceContext ref, final VariantContext vc, final A
         for (int n = 0; n < numAlt; n++) {
             final double tumorAF = tumorADs[n+1] / ((double) tumorDepth);
             final double normalAF = hasNormal ? normalADs[n+1] / ((double) normalDepth) : 0.0;
-            final String altAllele = vc.getAlternateAllele(n).getBaseString();
-            final int diff = altAllele.length() - refAllele.length();
+            Allele altAllele = vc.getAlternateAllele(n);
+            final String altAlleleString = altAllele.getBaseString();
+            final int diff = altAlleleString.length() - refAllele.length();
             final VariantType type = diff == 0 ? VariantType.SNV : ( diff > 0 ? VariantType.INSERTION : VariantType.DELETION);
 
+            // TODO: what about alternative representations?
+            final Set<Allele> truthAlleles = !truthVCs.isPresent() ? Collections.emptySet() : truthVCs.get().stream()
+                    .filter(var -> ! var.isFiltered())
+                    .flatMap(var -> var.getAlternateAlleles().stream())
+                    .collect(Collectors.toSet());
+
             if (trainingMode) {
                 final ArrayBlockingQueue<Integer> unmatchedQueue = unmatchedArtifactAltCounts.get(type);
                 final boolean likelySeqError = tumorLods[n] < TLOD_THRESHOLD;
-                final boolean likelyGermline = hasNormal && normalAF > 0.2;
 
                 // extremely strict criteria because there are so many germline variants we can afford to waste a lot
                 final boolean definiteGermline = !likelySeqError && popafs[n] < COMMON_POPAF_THRESHOLD && tumorAF > 0.35 && (!hasNormal || normalAF > 0.35);
+                final boolean trueVariant = truthVCs.isPresent() ? truthAlleles.contains(altAllele) : definiteGermline;
 
                 // low AF in tumor and normal, rare in population implies artifact
-                if (!(likelySeqError || likelyGermline) && tumorAF < 0.2 && popafs[n] > RARE_POPAF_THRESHOLD) {
+                boolean probableArtifact = !likelySeqError && (truthVCs.isPresent() ? !truthAlleles.contains(altAllele) :
+                        (tumorAF < 0.2 && popafs[n] > RARE_POPAF_THRESHOLD));
 
-                    if (unmatchedQueue.size() > 0.9*CAPACITY) { // this should rarely come up
+                if (probableArtifact) {
+                    if (unmatchedQueue.size() > 0.9 * CAPACITY) { // this should rarely come up
                         labels.add(Label.IGNORE);
                     } else {
                         labels.add(Label.ARTIFACT);
                         unmatchedQueue.addAll(Collections.nCopies(nonArtifactPerArtifact, tumorADs[n + 1]));
                     }
-                } else if (definiteGermline && !unmatchedQueue.isEmpty()) {
+                } else if (trueVariant && !unmatchedQueue.isEmpty()) {
                     // high AF in tumor and normal, common in population implies germline, which we downsample
                     labels.add(Label.VARIANT);
-                    altDownsampleMap.put(vc.getAlternateAllele(n), unmatchedQueue.poll());
+                    altDownsampleMap.put(altAllele, unmatchedQueue.poll());
                 } else if (tumorLods[n] > 4.0 && tumorAF < 0.3) {
                     labels.add(Label.UNLABELED);
                 } else {
@@ -173,6 +194,7 @@ public void addData(final ReferenceContext ref, final VariantContext vc, final A
         final Triple assemblyComplexity = AssemblyComplexity.annotate(vc, logFragmentLikelihoods, false);
 
         // TODO: for now we don't really need normal reads
+        // note that the following use the VC's allele order, not necessarily the likelihoods' allele order
         final List<List<List<Integer>>> normalReadVectorsByAllele = FeaturizedReadSets.getReadVectors(vc, normalSamples,
                 likelihoods, logFragmentLikelihoods, maxRefCount, maxAltCount);
         final List<List<List<Integer>>> tumorReadVectorsByAllele = FeaturizedReadSets.getReadVectors(vc, tumorSamples,
                 likelihoods, logFragmentLikelihoods, maxRefCount, maxAltCount, altDownsampleMap);
@@ -180,6 +202,10 @@ public void addData(final ReferenceContext ref, final VariantContext vc, final A
         final List<List<Integer>> tumorRefReads = tumorReadVectorsByAllele.get(0);
         final List<List<Integer>> normalRefReads = normalReadVectorsByAllele.get(0);
 
+        final List<LikelihoodMatrix<Fragment, Allele>> tumorMatrices = tumorSamples.stream()
+                .map(s -> logFragmentAlleleLikelihoods.sampleMatrix(logFragmentAlleleLikelihoods.indexOfSample(s)))
+                .collect(Collectors.toList());
+
         for (int n = 0; n < numAlt; n++) {
             if (labels.get(n) == Label.IGNORE) {
                 continue;
            }
@@ -203,7 +229,13 @@ public void addData(final ReferenceContext ref, final VariantContext vc, final A
             //normalRefReads.forEach(r -> printWriter.print(numberString(r)));
             //normalAltReads.forEach(r -> printWriter.print(numberString(r)));
             printWriter.printf("%d %d %d %d%n", tumorDepth, tumorADs[n+1], normalDepth, normalADs[n+1]); // pre-downsampling counts for normal artifact model
-        }
+            // this is approximately the likelihood that these particular reads are alt given sequencing error, excluding
+            // the (depth choose N_alt) combinatorial factor that is common to all likelihoods in M3
+            // basically, it's the TLOD with a correction for the marginalized flat prior from M2
+            final double tlod = vc.getAttributeAsDoubleList("TLOD", 0).get(n);
+            final double seqErrorLogLikelihood = -MathUtils.log10ToLog(tlod) - Math.log(tumorDepth + 1);
+            printWriter.printf("%.3f%n", seqErrorLogLikelihood); // sequencing error log likelihood
+        }
     }
 
     private String integerString(final List<Integer> numbers) {
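The sequencing error log likelihood written per datum deserves a worked example. Mutect2's TLOD is a log-10 odds in which the allele fraction has been marginalized over a flat prior, which contributes a factor of 1/(depth + 1); negating the TLOD in natural-log space and stripping that factor approximates log P(alt reads | sequencing error), up to the (depth choose N_alt) combinatorial term shared by every hypothesis in Mutect3. The numbers below are hypothetical, and MathUtils.log10ToLog(x) is just x * ln(10).

    // hypothetical values; mirrors the seqErrorLogLikelihood line in Mutect3DatasetEngine.addData above
    final double tlod = 7.3;           // TLOD (log-10 odds) for one alt allele
    final int tumorDepth = 60;         // pre-downsampling fragment depth
    final double seqErrorLogLikelihood = -tlod * Math.log(10) - Math.log(tumorDepth + 1);
    // -7.3 * 2.3026 - ln(61) ≈ -16.8 - 4.1 ≈ -20.9
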
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/SomaticGenotypingEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/SomaticGenotypingEngine.java
index 196298e84ea..4f218ea4904 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/SomaticGenotypingEngine.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/SomaticGenotypingEngine.java
@@ -205,7 +205,10 @@ public CalledHaplotypes callMutations(
                 AssemblyBasedCallerUtils.annotateReadLikelihoodsWithSupportedAlleles(trimmedCall, trimmedLikelihoods, Fragment::getReads);
             }
 
-            mutect3DatasetEngine.ifPresent(engine -> engine.addData(referenceContext, annotatedCall, trimmedLikelihoodsForAnnotation, logFragmentLikelihoods));
+            final Optional<List<VariantContext>> truthVCs = MTAC.mutect3TrainingTruth == null ? Optional.empty() :
+                    Optional.of(featureContext.getValues(MTAC.mutect3TrainingTruth, mergedVC.getStart()));
+            mutect3DatasetEngine.ifPresent(engine -> engine.addData(referenceContext, annotatedCall, truthVCs,
+                    trimmedLikelihoodsForAnnotation, logFragmentLikelihoods, logLikelihoods));
 
             call.getAlleles().stream().map(alleleMapper::get).filter(Objects::nonNull).forEach(calledHaplotypes::addAll);
             returnCalls.add( annotatedCall );
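One behavioral detail of this call site: when --mutect3-training-truth is absent the dataset engine receives Optional.empty() and falls back to the allele-fraction-based weak labels, and when it is present the lookup is keyed on the variant's start position, so truth records whose representation differs from the Mutect2 call (see the TODO about alternative representations above) will not match. A hedged sketch of the two cases, assuming a FeatureContext and merged VariantContext like those available in callMutations:

    // assumed context objects (featureContext, mergedVC, MTAC); mirrors the call-site change above
    final Optional<List<VariantContext>> truthVCs = MTAC.mutect3TrainingTruth == null
            ? Optional.empty()                                                                        // no truth VCF: weak labels from allele fractions
            : Optional.of(featureContext.getValues(MTAC.mutect3TrainingTruth, mergedVC.getStart()));  // truth records starting at this site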