Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mutect3 dataset enhancements: optional truth VCF for labels, seq error likelihood annotation #7975

Merged
merged 2 commits into from
Aug 5, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ public class M2ArgumentCollection extends AssemblyBasedCallerArgumentCollection
public static final String MUTECT3_REF_DOWNSAMPLE_LONG_NAME = "mutect3-ref-downsample";
public static final String MUTECT3_ALT_DOWNSAMPLE_LONG_NAME = "mutect3-alt-downsample";
public static final String MUTECT3_DATASET_LONG_NAME = "mutect3-dataset";
public static final String MUTECT3_TRAINING_TRUTH_LONG_NAME = "mutect3-training-truth";

public static final int DEFAULT_MUTECT3_REF_DOWNSAMPLE = 10;
public static final int DEFAULT_MUTECT3_ALT_DOWNSAMPLE = 20;
Expand Down Expand Up @@ -203,6 +204,14 @@ public double getDefaultAlleleFrequency() {
@Argument(fullName = MUTECT3_DATASET_LONG_NAME, optional = true, doc="Destination for Mutect3 data collection")
public File mutect3Dataset;

/**
* VCF of known calls for a sample used for generating a Mutect3 training dataset. Unfiltered variants contained
davidbenjamin marked this conversation as resolved.
Show resolved Hide resolved
 * in this VCF are considered good; other variants are considered artifacts. If this VCF is not given, the dataset
 * is generated with a weak-labelling strategy based on allele fractions.
*/
@Argument(fullName= MUTECT3_TRAINING_TRUTH_LONG_NAME, doc="VCF file of known variants for labeling Mutect3 training data", optional = true)
public FeatureInput<VariantContext> mutect3TrainingTruth;

/**
* Only variants with tumor LODs exceeding this threshold will be written to the VCF, regardless of filter status.
* Set to less than or equal to tumor_lod. Increase argument value to reduce false positives in the callset.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.VariantContext;
import org.apache.commons.lang3.tuple.Triple;
import org.apache.commons.math3.util.CombinatoricsUtils;
import org.apache.commons.math3.util.FastMath;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.walkers.annotator.AssemblyComplexity;
Expand All @@ -13,8 +15,10 @@
import org.broadinstitute.hellbender.tools.walkers.mutect.filtering.Mutect2FilteringEngine;
import org.broadinstitute.hellbender.utils.IndexRange;
import org.broadinstitute.hellbender.utils.MathUtils;
import org.broadinstitute.hellbender.utils.NaturalLogUtils;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
import org.broadinstitute.hellbender.utils.genotyper.LikelihoodMatrix;
import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
import org.broadinstitute.hellbender.utils.read.Fragment;
import org.broadinstitute.hellbender.utils.read.GATKRead;
Expand Down Expand Up @@ -101,8 +105,10 @@ public Mutect3DatasetEngine(final File datasetFile, final boolean trainingMode,
}

// add one datum per alt allele
public void addData(final ReferenceContext ref, final VariantContext vc, final AlleleLikelihoods<GATKRead, Allele> likelihoods,
final AlleleLikelihoods<Fragment, Haplotype> logFragmentLikelihoods) {
public void addData(final ReferenceContext ref, final VariantContext vc, Optional<List<VariantContext>> truthVCs,
final AlleleLikelihoods<GATKRead, Allele> likelihoods,
final AlleleLikelihoods<Fragment, Haplotype> logFragmentLikelihoods,
final AlleleLikelihoods<Fragment, Allele> logFragmentAlleleLikelihoods) {
final String refBases = ReferenceBases.annotate(ref, vc);
final String refAllele = vc.getReference().getBaseString();
final String contig = vc.getContig();
Expand All @@ -115,6 +121,11 @@ public void addData(final ReferenceContext ref, final VariantContext vc, final A
final double[] popafs = VariantContextGetters.getAttributeAsDoubleArray(vc, GATKVCFConstants.POPULATION_AF_KEY);
//final double[] altPopulationAFs = MathUtils.applyToArray(popafs, x -> Math.pow(10, -x ));
final double[] tumorLods = Mutect2FilteringEngine.getTumorLogOdds(vc);

// These ADs, which later make up the pre-downsampling depths, come from the genotype AD field applied by Mutect2.
// This means that uninformative reads are not discarded; rather the expected non-integral ADs are rounded to
// the nearest integer. Also, these ADs are *fragment* ADs from the log fragment-allele likelihoods in the
// SomaticGenotypingEngine.
final int[] tumorADs = sumADsOverSamples(vc, tumorSamples);
final int[] normalADs = sumADsOverSamples(vc, normalSamples);
final int tumorDepth = (int) MathUtils.sum(tumorADs);
Expand All @@ -127,32 +138,41 @@ public void addData(final ReferenceContext ref, final VariantContext vc, final A
for (int n = 0; n < numAlt; n++) {
final double tumorAF = tumorADs[n+1] / ((double) tumorDepth);
final double normalAF = hasNormal ? normalADs[n+1] / ((double) normalDepth) : 0.0;
final String altAllele = vc.getAlternateAllele(n).getBaseString();
final int diff = altAllele.length() - refAllele.length();
Allele altAllele = vc.getAlternateAllele(n);
final String altAlleleString = altAllele.getBaseString();
final int diff = altAlleleString.length() - refAllele.length();
final VariantType type = diff == 0 ? VariantType.SNV : ( diff > 0 ? VariantType.INSERTION : VariantType.DELETION);

// TODO: what about alternative representations?
final Set<Allele> truthAlleles = !truthVCs.isPresent() ? Collections.emptySet() : truthVCs.get().stream()
.filter(var -> ! var.isFiltered())
.flatMap(var -> var.getAlternateAlleles().stream())
.collect(Collectors.toSet());

if (trainingMode) {
final ArrayBlockingQueue<Integer> unmatchedQueue = unmatchedArtifactAltCounts.get(type);
final boolean likelySeqError = tumorLods[n] < TLOD_THRESHOLD;
final boolean likelyGermline = hasNormal && normalAF > 0.2;

// extremely strict criteria because there are so many germline variants we can afford to waste a lot
final boolean definiteGermline = !likelySeqError && popafs[n] < COMMON_POPAF_THRESHOLD &&
tumorAF > 0.35 && (!hasNormal || normalAF > 0.35);
final boolean trueVariant = truthVCs.isPresent() ? truthAlleles.contains(altAllele) : definiteGermline;

// low AF in tumor and normal, rare in population implies artifact
if (!(likelySeqError || likelyGermline) && tumorAF < 0.2 && popafs[n] > RARE_POPAF_THRESHOLD) {
boolean probableArtifact = !likelySeqError && (truthVCs.isPresent() ? !truthAlleles.contains(altAllele) :
(tumorAF < 0.2 && popafs[n] > RARE_POPAF_THRESHOLD));

if (unmatchedQueue.size() > 0.9*CAPACITY) { // this should rarely come up
if (probableArtifact) {
if (unmatchedQueue.size() > 0.9 * CAPACITY) { // this should rarely come up
labels.add(Label.IGNORE);
} else {
labels.add(Label.ARTIFACT);
unmatchedQueue.addAll(Collections.nCopies(nonArtifactPerArtifact, tumorADs[n + 1]));
}
} else if (definiteGermline && !unmatchedQueue.isEmpty()) {
} else if (trueVariant && !unmatchedQueue.isEmpty()) {
// a confident true variant — a truth-VCF match or, absent a truth VCF, high AF in tumor and normal
// plus common in population (i.e. definite germline) — which we downsample
labels.add(Label.VARIANT);
altDownsampleMap.put(vc.getAlternateAllele(n), unmatchedQueue.poll());
altDownsampleMap.put(altAllele, unmatchedQueue.poll());
} else if (tumorLods[n] > 4.0 && tumorAF < 0.3) {
labels.add(Label.UNLABELED);
} else {
Expand All @@ -173,13 +193,18 @@ public void addData(final ReferenceContext ref, final VariantContext vc, final A
final Triple<int[], int[], double[]> assemblyComplexity = AssemblyComplexity.annotate(vc, logFragmentLikelihoods, false);

// TODO: for now we don't really need normal reads
// note that the following use the VC's allele order, not necessarily the likelihoods' allele order
final List<List<List<Integer>>> normalReadVectorsByAllele = FeaturizedReadSets.getReadVectors(vc, normalSamples, likelihoods, logFragmentLikelihoods, maxRefCount, maxAltCount);
final List<List<List<Integer>>> tumorReadVectorsByAllele = FeaturizedReadSets.getReadVectors(vc, tumorSamples, likelihoods, logFragmentLikelihoods, maxRefCount, maxAltCount, altDownsampleMap);

// ref and alt reads have already been downsampled by the read featurizer
final List<List<Integer>> tumorRefReads = tumorReadVectorsByAllele.get(0);
final List<List<Integer>> normalRefReads = normalReadVectorsByAllele.get(0);

final List<LikelihoodMatrix<Fragment,Allele>> tumorMatrices = tumorSamples.stream()
.map(s -> logFragmentAlleleLikelihoods.sampleMatrix(logFragmentAlleleLikelihoods.indexOfSample(s)))
.collect(Collectors.toList());

for (int n = 0; n < numAlt; n++) {
if (labels.get(n) == Label.IGNORE) {
continue;
Expand All @@ -203,7 +228,13 @@ public void addData(final ReferenceContext ref, final VariantContext vc, final A
//normalRefReads.forEach(r -> printWriter.print(numberString(r)));
//normalAltReads.forEach(r -> printWriter.print(numberString(r)));
printWriter.printf("%d %d %d %d%n", tumorDepth, tumorADs[n+1], normalDepth, normalADs[n+1]); // pre-downsampling counts for normal artifact model
}
// this is approximately the likelihood that these particular reads are alt given sequencing error, excluding
// the (depth choose N_alt) combinatorial factor that is common to all likelihoods in M3
// basically, it's the TLOD with a correction for the marginalized flat prior from M2
final double tlod = vc.getAttributeAsDoubleList("TLOD", 0).get(n);
final double seqErrorLogLikelihood = -MathUtils.log10ToLog(tlod) - Math.log(tumorDepth + 1);
printWriter.printf("%.3f%n", seqErrorLogLikelihood); // log likelihood of the alt reads under sequencing error
}
}

private String integerString(final List<Integer> numbers) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,10 @@ public CalledHaplotypes callMutations(
AssemblyBasedCallerUtils.annotateReadLikelihoodsWithSupportedAlleles(trimmedCall, trimmedLikelihoods, Fragment::getReads);
}

mutect3DatasetEngine.ifPresent(engine -> engine.addData(referenceContext, annotatedCall, trimmedLikelihoodsForAnnotation, logFragmentLikelihoods));
final Optional<List<VariantContext>> truthVCs = MTAC.mutect3TrainingTruth == null ? Optional.empty() :
Optional.of(featureContext.getValues(MTAC.mutect3TrainingTruth, mergedVC.getStart()));
mutect3DatasetEngine.ifPresent(engine -> engine.addData(referenceContext, annotatedCall, truthVCs,
trimmedLikelihoodsForAnnotation, logFragmentLikelihoods, logLikelihoods));

call.getAlleles().stream().map(alleleMapper::get).filter(Objects::nonNull).forEach(calledHaplotypes::addAll);
returnCalls.add( annotatedCall );
Expand Down