broadinstitute · droazen · Apr 23, 2020 · Mar 16, 2020 · lbergelson · Apr 23, 2020
diff --git a/src/main/java/org/broadinstitute/hellbender/engine/DuplicateSetWalker.java b/src/main/java/org/broadinstitute/hellbender/engine/DuplicateSetWalker.java
@@ -0,0 +1,153 @@
+package org.broadinstitute.hellbender.engine;
+
+import htsjdk.samtools.SAMTag;
+import org.apache.commons.lang3.tuple.Pair;
+import org.broadinstitute.barclay.argparser.Argument;
+import org.broadinstitute.hellbender.engine.filters.ReadFilter;
+import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
+import org.broadinstitute.hellbender.engine.filters.WellformedReadFilter;
+import org.broadinstitute.hellbender.exceptions.UserException;
+import org.broadinstitute.hellbender.tools.walkers.consensus.MoleculeID;
+import org.broadinstitute.hellbender.tools.walkers.consensus.ReadsWithSameUMI;
+import org.broadinstitute.hellbender.utils.read.GATKRead;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A walker that processes duplicate reads that share the same Unique molecule Identifier (UMI) as a single unit.
+ *
+ * This tool assumes that the input bam has been sorted by UMI (the {@link SAMTag.MI} tag to be specific) with FGBio GroupReadsByUmi:
+ * http://fulcrumgenomics.github.io/fgbio/tools/latest/GroupReadsByUmi.html
+ */
+public abstract class DuplicateSetWalker extends ReadWalker {
+    public static final String MIN_REQUIRED_READS_NAME = "min-reads";
+    public static final String MIN_REQUIRED_READS_PER_STRAND_NAME = "min-per-strand-reads";
+
+    private static final int DEFAULT_MINIMUM_READS_PER_SET = 1;
+    private static final int DEFAULT_MINIMUM_READS_PER_STRAND = 0;
+
+    @Argument(fullName = MIN_REQUIRED_READS_NAME, doc = "The mininum total number of reads required in the set", optional = true, minValue = 0)
+    private int minimumRequiredReadsPerUMI = DEFAULT_MINIMUM_READS_PER_SET;
+
+    // The user may choose to only keep read sets containing both strands (i.d. duplex evidence) by setting this argument to a positive number
+    @Argument(fullName = MIN_REQUIRED_READS_PER_STRAND_NAME, doc = "The mininum total number of reads in each strand", optional = true, minValue = 0)
+    private int minimumRequiredReadsPerStrand = DEFAULT_MINIMUM_READS_PER_STRAND;
+
+    protected ReadsWithSameUMI currentReadsWithSameUMI = null;
+
+    @Override
+    public final void traverse(){
+        super.traverse();
+        processLastReadSet();
+    }
+
+    /***
+     * FGBio GroupByUMI returns reads sorted by molecule ID: For example, the input bam may look like
+     * read1: ... MI:Z:0/A ...
+     * read2: ... MI:Z:0/A ...
+     * read3: ... MI:Z:0/B ...
+     * read4: ... MI:Z:0/B ...
+     * read5: ... MI:Z:1/A ...
+     * read6: ... MI:Z:1/B ...
+     * read7: ... MI:Z:1/B ...
+     *
+     * Thus it's sufficient to go through the reads in order and collect them in a list until
+     * we encounter the next molecule ID, at which point we pass the list to the {@code apply} method,
+     * process the set based on the child class's implementation of the method, and clear the {@code currentDuplicateSet} variable and start collecting reads again.
+     *
+     * Notice there are two apply() methods in this class:
+     * This apply() inherited from ReadWalker is marked final to discourage subclassing.
+     * A subclass must override the other apply() method that takes in the DuplicateSet.
+     */
+    @Override
+    public final void apply(GATKRead read, ReferenceContext referenceContext, FeatureContext featureContext) {
+        if (currentReadsWithSameUMI == null){ // evaluates to true for the very first read
+            currentReadsWithSameUMI = new ReadsWithSameUMI(read);
+            return;
+        }
+
+        final int readMoleculeNumber = MoleculeID.getMoleculeNumberOfRead(read);
+        final int duplicateSetMoleculeNumber = currentReadsWithSameUMI.getMoleculeNumber();
+
+        // If the incoming read has the molecule id less than that of the currentDuplicateSet,
+        // the input bam is not sorted properly by the MI tag
+        if (duplicateSetMoleculeNumber > readMoleculeNumber){
+            throw new UserException(String.format("The input bam must be sorted by the molecule ID (%s) tag.", SAMTag.MI.name()));
+        }
+
+        if (duplicateSetMoleculeNumber < readMoleculeNumber) {
+            // The incoming read's molecule ID is greater than that of the current set, meaning we've reached the end of the current set.
+            if (rejectSet(currentReadsWithSameUMI)){
+                currentReadsWithSameUMI = new ReadsWithSameUMI(read);
+                return;
+            }
+
+            // Call the apply() method to process the current set and start a new set.
+            apply(currentReadsWithSameUMI,
+                    new ReferenceContext(reference, currentReadsWithSameUMI.getInterval()), // Will create an empty ReferenceContext if reference or readInterval == null
+                    new FeatureContext(features, currentReadsWithSameUMI.getInterval()));
+            currentReadsWithSameUMI = new ReadsWithSameUMI(read);
+            return;
+        }
+
+        // The incoming read has the same UMI as the current set; simply add to the current set
+        currentReadsWithSameUMI.addRead(read);
+    }
+
+    /**
+     * A subclass must specify how to process the duplicate sets by overriding this method.
+     *
+     * @param readsWithSameUMI A set of reads with the matching UMIs with the same fragment start and end
+     * @param referenceContext A reference context object over the intervals determined by the duplicate set.
+     * @param featureContext Entries from a secondary feature file (e.g. vcf) if provided
+     *
+     */
+    public abstract void apply(ReadsWithSameUMI readsWithSameUMI, ReferenceContext referenceContext, FeatureContext featureContext );
+
+    private void processLastReadSet(){
+        if (currentReadsWithSameUMI.getReads().size() > 0){
+            apply(currentReadsWithSameUMI,
+                    new ReferenceContext(reference, currentReadsWithSameUMI.getInterval()),
+                    new FeatureContext(features, currentReadsWithSameUMI.getInterval()));
+        }
+    }
+
+    /**
+     * Returns true for duplicate sets that does not meet required criteria for further processing.
+     * We encourage the user to override this method to meet their needs.
+     */
+    protected boolean rejectSet(final ReadsWithSameUMI readsWithSameUMI){
+        // Check that the set contains the minimum required number of reads in each strand
+        final Pair<Integer, Integer> strandCounts = MoleculeID.countStrands(readsWithSameUMI.getReads());
+        if (Math.min(strandCounts.getLeft(), strandCounts.getRight()) < minimumRequiredReadsPerStrand){
+            return true;
+        }
+
+        // Check that the read set is paired (this check may not reject some sets that contain unpaired reads)
+        if (readsWithSameUMI.getReads().size() % 2 == 1){
+            return true;
+        }
+
+        // Check that the total number of reads (from both strands) exceeds a specified threshold
+        return readsWithSameUMI.getReads().size() < minimumRequiredReadsPerUMI;
+    }
+
+    @Override
+    public List<ReadFilter> getDefaultReadFilters() {
+        final List<ReadFilter> readFilters = new ArrayList<>(6);
+        readFilters.add(new WellformedReadFilter());
+        readFilters.addAll(getDuplicateSetWalkerSpecificReadFilterList());
+        return readFilters;
+    }
+
+    private static List<ReadFilter> getDuplicateSetWalkerSpecificReadFilterList() {
+        List<ReadFilter> filters = new ArrayList<>(5);
+        filters.add(ReadFilterLibrary.MAPPING_QUALITY_AVAILABLE);
+        filters.add(ReadFilterLibrary.MAPPED);
+        filters.add(ReadFilterLibrary.NOT_SECONDARY_ALIGNMENT);
+        filters.add(ReadFilterLibrary.PASSES_VENDOR_QUALITY_CHECK);
+        filters.add(ReadFilterLibrary.NON_ZERO_REFERENCE_LENGTH_ALIGNMENT);
+        return filters;
+    }
+}
diff --git a/.../java/org/broadinstitute/hellbender/tools/walkers/consensus/DownsampleByDuplicateSet.java b/.../java/org/broadinstitute/hellbender/tools/walkers/consensus/DownsampleByDuplicateSet.java
@@ -0,0 +1,103 @@
+package org.broadinstitute.hellbender.tools.walkers.consensus;
+
+import org.broadinstitute.barclay.argparser.Argument;
+import org.broadinstitute.barclay.argparser.BetaFeature;
+import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
+import org.broadinstitute.barclay.help.DocumentedFeature;
+import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
+import org.broadinstitute.hellbender.engine.DuplicateSetWalker;
+import org.broadinstitute.hellbender.engine.FeatureContext;
+import org.broadinstitute.hellbender.engine.GATKPathSpecifier;
+import org.broadinstitute.hellbender.engine.ReferenceContext;
+import org.broadinstitute.hellbender.utils.read.SAMFileGATKReadWriter;
+import org.glassfish.jersey.Beta;
+import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;
+
+import java.util.Random;
+
+@CommandLineProgramProperties(
+        summary = "Discard a set fraction of duplicate sets from a UMI-grouped bam",
+        oneLineSummary = "Discard a set fraction of duplicate sets from a UMI-grouped bam",
+        programGroup = ReadDataManipulationProgramGroup.class
+)
+/**
+ * Given a bam grouped by the same unique molecular identifier (UMI), this tool drops a specified fraction of duplicate sets and returns a new bam.
+ * A duplicate set refers to a group of reads whose fragments start and end at the same genomic coordinate _and_ share the same UMI.
+ *
+ * The input bam must first be sorted by UMI using FGBio GroupReadsByUmi (http://fulcrumgenomics.github.io/fgbio/tools/latest/GroupReadsByUmi.html).
+ *
+ * Use this tool to create, for instance, an insilico mixture of duplex-sequenced samples to simulate tumor subclones.
+ *
+ * Suppose you wish to simulate a tumor sample in which 5% cells share a common set of somatic mutations
+ * in addition to ones common to the entire cell population.
+ *
+ * If you randomly drop 5% of reads in sample A and 95% of reads in sample B and merge the reduced bams,
+ * the resulting mixture skews the family-size distribution to the left. Here the family size refers to the
+ * number of sequenced duplicate reads that share the same UMI.
+ *
+ * To see this, take a cancer sample, in which 5% of cells share a unique set of somatic mutations,
+ * that was processed with duplex-UMIs (i.e. UMIs on both adapters) and high rounds of PCR. Suppose we have the sequence-ready
+ * libraries of this sample attached to and amplified on the flowcell. Now, sort the flowcell lawn such that the reads from the above
+ * 5% of the cell population moves near the top of the flowcell. This population must have the same family-size distribution as
+ * the rest of the flowcell, at about 5% of the library complexity compared to the entire flowcell.
+ *
+ * Now imagine replacing this population with 5% ramdonly chosen from the *entire* flowcell of another sample that was prepared and sequenced similarly.
+ * The library complexity of these "graft" reads is higher than that of the original, and, consequently, with other parameters
+ * such as the number of PCR cycles and sequencing depth fixed, its family distribution would be skewed left---that is, the family size
+ * would be smaller than it should be.
+ *
+ * This tool will help address the above problem by dropping a set fraction of _molecules_, or duplicate sets, rather than reads, at random.
+ *
+ * Example Usage:
+ *
+ * Keep 95 percent of the molecules.
+ *
+ * gatk DownsampleByDuplicateSet \ \
+ * -I umiGrouped.bam \
+ * --fraction-to-keep 0.95 \
+ * -O umiGrouped_0.95.bam
+ **/
+@BetaFeature
+public class DownsampleByDuplicateSet extends DuplicateSetWalker {
+    @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, doc = "")
+    public GATKPathSpecifier outputBam;
+
+    public static final String FRACTION_TO_KEEP_NAME = "fraction-to-keep";
+    @Argument(fullName = FRACTION_TO_KEEP_NAME, doc = "This fraction of molecules in the input bam will be retained", minValue = 0.0, maxValue = 1.0)
+    public double fractionToKeep;
+
+    private static final int RANDOM_SEED = 142;
+    private final Random rng = new Random(RANDOM_SEED);
+    private int numDuplicateReadSets;
+    private int numReads;
+    private SAMFileGATKReadWriter outputWriter;
+
+    @Override
+    public void onTraversalStart() {
+        outputWriter = createSAMWriter(outputBam.toPath(), false);
+    }
+
+    @Override
+    public void apply(ReadsWithSameUMI readsWithSameUMI, ReferenceContext referenceContext, FeatureContext featureContext) {
+        if (rng.nextDouble() < fractionToKeep){
+            readsWithSameUMI.getReads().forEach(r -> outputWriter.addRead(r));
+            numReads += readsWithSameUMI.getReads().size();
+            numDuplicateReadSets += 1;
+        }
+    }
+
+    @Override
+    public Object onTraversalSuccess(){
+        outputWriter.close();
+        logger.info(String.format("Wrote %d reads", numReads));
+        logger.info(String.format("Wrote %d duplicate read sets", numDuplicateReadSets));
+        return "SUCCESS";
+    }
+
+    @Override
+    public void closeTool(){
+        if (outputWriter != null) {
+            outputWriter.close();
+        }
+    }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/consensus/MoleculeID.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/consensus/MoleculeID.java
@@ -0,0 +1,67 @@
+package org.broadinstitute.hellbender.tools.walkers.consensus;
+
+import htsjdk.samtools.SAMTag;
+import org.apache.commons.lang3.tuple.ImmutablePair;
+import org.apache.commons.lang3.tuple.Pair;
+import org.broadinstitute.hellbender.utils.read.GATKRead;
+
+import java.util.List;
+
+/**
+ * A container class for the molecule ID, which consists of an integer ID and a binary strand.
+ * For example, Reads with the tags 12/A and 12/B originated from the same DNA fragment before PCR,
+ * (i.e. from the same library) but they originated from different strands in that library.
+ * In other words, one read is F1R2 and the other F2R1.
+ *
+ * The word "molecule" here refers to the original DNA fragment with barcode before undergoing
+ * PCR and sequencing. We amplify this molecule through PCR and end up with many duplicate _fragments_.
+ */
+public class MoleculeID {
+    private int moleculeNumber;
+    private String strand;
+
+    public MoleculeID(final GATKRead read){
+        this.moleculeNumber = getMoleculeNumberOfRead(read);
+        this.strand = getStrandOfRead(read);
+    }
+
+    public MoleculeID(final int moleculeNumber, final String strand){
+        this.moleculeNumber = moleculeNumber;
+        this.strand = strand;
+    }
+
+    public int getMoleculeNumber() {
+        return moleculeNumber;
+    }
+
+    public String getStrand() {
+        return strand;
+    }
+
+    /** Format the molecule ID as stored in the sam/bam/cram file under the {@link SAMTag.MI} tag **/
+    public String getSAMField(){
+        return moleculeNumber + ReadsWithSameUMI.FGBIO_MI_TAG_DELIMITER + strand;
+    }
+
+    /** Extracts the molecule number portion of the {@link SAMTag.MI} field of the read **/
+    public static int getMoleculeNumberOfRead(final GATKRead read){
+        final String MITag = read.getAttributeAsString(SAMTag.MI.name());
+        return Integer.parseInt(MITag.split(ReadsWithSameUMI.FGBIO_MI_TAG_DELIMITER)[0]);
+    }
+
+    /** Extracts the strand portion of the {@link SAMTag.MI} field of the read **/
+    public static String getStrandOfRead(final GATKRead read){
+        final String MITag = read.getAttributeAsString(SAMTag.MI.name());
+        return MITag.split(ReadsWithSameUMI.FGBIO_MI_TAG_DELIMITER)[1];
+    }
+
+    /**
+     * Assumes that the input reads have the same molecule number in the {@link SAMTag.MI} tag
+     * @returns Counts of reads from each strand, the first element is always larger than the second
+     **/
+    public static Pair<Integer, Integer> countStrands(final List<GATKRead> reads){
+        final int strandACount = (int) reads.stream().filter(r -> getStrandOfRead(r).equals("A")).count();
+        final int strandBCount = (int) reads.stream().filter(r -> getStrandOfRead(r).equals("B")).count();
+        return new ImmutablePair<>(strandACount, strandBCount);
+    }
+}