Skip to content

Commit

Permalink
Added the ability for Indels to be recovered from dangling heads and …
Browse files Browse the repository at this point in the history
…a new argument for filtering dangling ends (#6113)
  • Loading branch information
jamesemery authored Jan 26, 2021
1 parent 862a3f4 commit c305078
Show file tree
Hide file tree
Showing 9 changed files with 220 additions and 45 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@ public class HaplotypeCallerReadThreadingAssemblerArgumentCollection extends Rea
public ReadThreadingAssembler makeReadThreadingAssembler() {
final ReadThreadingAssembler assemblyEngine = new ReadThreadingAssembler(maxNumHaplotypesInPopulation, Collections.unmodifiableList(kmerSizes),
dontIncreaseKmerSizesForCycles, allowNonUniqueKmersInRef, numPruningSamples, useAdaptivePruning ? 0 : minPruneFactor,
useAdaptivePruning, initialErrorRateForPruning, pruningLogOddsThreshold, pruningSeedingLogOddsThreshold, maxUnprunedVariants, useLinkedDeBruijnGraph, enableLegacyGraphCycleDetection);
useAdaptivePruning, initialErrorRateForPruning, pruningLogOddsThreshold, pruningSeedingLogOddsThreshold, maxUnprunedVariants, useLinkedDeBruijnGraph,
enableLegacyGraphCycleDetection, minMatchingBasesToDanglingEndRecovery);
assemblyEngine.setDebugGraphTransformations(debugGraphTransformations);
assemblyEngine.setRecoverDanglingBranches(!doNotRecoverDanglingBranches);
assemblyEngine.setRecoverAllDanglingBranches(recoverAllDanglingBranches);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ public class MutectReadThreadingAssemblerArgumentCollection extends ReadThreadin
public ReadThreadingAssembler makeReadThreadingAssembler() {
final ReadThreadingAssembler assemblyEngine = new ReadThreadingAssembler(maxNumHaplotypesInPopulation, Collections.unmodifiableList(kmerSizes),
dontIncreaseKmerSizesForCycles, allowNonUniqueKmersInRef, numPruningSamples, disableAdaptivePruning ? minPruneFactor : 0,
!disableAdaptivePruning, initialErrorRateForPruning, pruningLogOddsThreshold, pruningSeedingLogOddsThreshold, maxUnprunedVariants, useLinkedDeBruijnGraph, enableLegacyGraphCycleDetection);
!disableAdaptivePruning, initialErrorRateForPruning, pruningLogOddsThreshold, pruningSeedingLogOddsThreshold, maxUnprunedVariants, useLinkedDeBruijnGraph,
enableLegacyGraphCycleDetection, minMatchingBasesToDanglingEndRecovery);
assemblyEngine.setDebugGraphTransformations(debugGraphTransformations);
assemblyEngine.setRecoverDanglingBranches(true);
assemblyEngine.setRecoverAllDanglingBranches(recoverAllDanglingBranches);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,10 @@ public abstract class ReadThreadingAssemblerArgumentCollection implements Serial
@Argument(fullName = CAPTURE_ASSEMBLY_FAILURE_BAM_LONG_NAME, doc = "Write a BAM called assemblyFailure.bam capturing all of the reads that were in the active region when the assembler failed for any reason", optional = true)
public boolean captureAssemblyFailureBAM = false;

@Hidden
@Argument(fullName = "num-matching-bases-in-dangling-end-to-recover", doc = "Sets the number of exactly matching bases in the suffix of a dangling tail and the prefix for a dangling head necessary in order to recover the path. (-1 indicates legacy behavior)", optional = true)
public int minMatchingBasesToDanglingEndRecovery = -1;

//---------------------------------------------------------------------------------------------------------------
//
// Read Error Corrector Related Parameters
Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
package org.broadinstitute.hellbender.tools.walkers.haplotypecaller.readthreading;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Lists;
import htsjdk.samtools.Cigar;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.util.Locatable;
import org.apache.commons.lang3.tuple.Pair;
import org.broadinstitute.gatk.nativebindings.smithwaterman.SWOverhangStrategy;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.Kmer;
import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.graphs.BaseGraph;
import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.graphs.KmerSearchableGraph;
Expand Down Expand Up @@ -36,6 +39,8 @@ public abstract class AbstractReadThreadingGraph extends BaseGraph<MultiDeBruijn
private static final boolean DEBUG_NON_UNIQUE_CALC = false;
private static final int MAX_CIGAR_COMPLEXITY = 3;
private static final boolean INCREASE_COUNTS_BACKWARDS = true;
private int minMatchingBasesToDangingEndRecovery = -1;

/**
* for debugging info printing
*/
Expand All @@ -57,7 +62,7 @@ public abstract class AbstractReadThreadingGraph extends BaseGraph<MultiDeBruijn
// --------------------------------------------------------------------------------
private Kmer refSource = null;
private boolean startThreadingOnlyAtExistingVertex = false;
private int maxMismatchesInDanglingHead = -1;
private int maxMismatchesInDanglingHead = -1; // this argument exists purely for testing purposes in constructing helpful tests and is currently not hooked up
private boolean increaseCountsThroughBranches = false; // this may increase the branches without bounds

protected enum TraversalDirection {
Expand All @@ -76,13 +81,14 @@ protected enum TraversalDirection {
*
* @param kmerSize must be >= 1
*/
public AbstractReadThreadingGraph(final int kmerSize, final boolean debugGraphTransformations, final byte minBaseQualityToUseInAssembly, final int numPruningSamples) {
public AbstractReadThreadingGraph(final int kmerSize, final boolean debugGraphTransformations, final byte minBaseQualityToUseInAssembly, final int numPruningSamples, final int numDanglingMatchingPrefixBases) {
super(kmerSize, new MyEdgeFactory(numPruningSamples));

Utils.validateArg(kmerSize > 0, () -> "bad minkKmerSize " + kmerSize);

this.debugGraphTransformations = debugGraphTransformations;
this.minBaseQualityToUseInAssembly = minBaseQualityToUseInAssembly;
this.minMatchingBasesToDangingEndRecovery = numDanglingMatchingPrefixBases;
}

/**
Expand Down Expand Up @@ -175,6 +181,11 @@ protected void setAlreadyBuilt() {
alreadyBuilt = true;
}

@VisibleForTesting
void setMinMatchingBasesToDangingEndRecovery(final int minMatchingBasesToDangingEndRecovery) {
this.minMatchingBasesToDangingEndRecovery = minMatchingBasesToDangingEndRecovery;
}

@VisibleForTesting
void setMaxMismatchesInDanglingHead(final int maxMismatchesInDanglingHead) {
this.maxMismatchesInDanglingHead = maxMismatchesInDanglingHead;
Expand Down Expand Up @@ -465,6 +476,45 @@ private int recoverDanglingTail(final MultiDeBruijnVertex vertex, final int prun
return mergeDanglingTail(danglingTailMergeResult);
}

/**
* Finds the index of the best extent of the prefix match between the provided paths, for dangling head merging.
* Requires that at a minimum there are at least #getMinMatchingBases() matches between the reference and the read
* at the end in order to emit an alignment offset.
*
* @param cigarElements cigar elements corresponding to the alignment between path1 and path2
* @param path1 the first path
* @param path2 the second path
* @return an integer pair object where the key is the offset into path1 and the value is offset into path2 (both -1 if no path is found)
*/
private Pair<Integer, Integer> bestPrefixMatch(final List<CigarElement> cigarElements, final byte[] path1, final byte[] path2) {
final int minMismatchingBases = getMinMatchingBases();

int refIdx = cigarElements.stream().mapToInt(ce -> ce.getOperator().consumesReferenceBases()? ce.getLength() : 0).sum() - 1;
int readIdx = path2.length - 1;

// NOTE: this only works when the last cigar element has a sufficient number of M bases, so no indels within min-mismatches of the edge.
for (final CigarElement ce : Lists.reverse(cigarElements)) {
if (!(ce.getOperator().consumesReadBases() && ce.getOperator().consumesReferenceBases())) {
break;
}
for (int j = 0; j < ce.getLength(); j++, refIdx--, readIdx--) {
if (path1[refIdx] != path2[readIdx]) {
break;
}
}
}

final int matches = path2.length - 1 - readIdx;
return matches < minMismatchingBases ? Pair.of(-1,-1) : Pair.of(readIdx, refIdx);
}

/**
* The minimum number of matches to be considered allowable for recovering dangling ends
*/
private int getMinMatchingBases() {
return minMatchingBasesToDangingEndRecovery;
}

/**
* Attempt to attach vertex with in-degree == 0, or a vertex on its path, to the graph
*
Expand All @@ -489,7 +539,7 @@ private int recoverDanglingHead(final MultiDeBruijnVertex vertex, final int prun
}

// merge
return mergeDanglingHead(danglingHeadMergeResult);
return minMatchingBasesToDangingEndRecovery >= 0 ? mergeDanglingHead(danglingHeadMergeResult) : mergeDanglingHeadLegacy(danglingHeadMergeResult);
}

/**
Expand All @@ -507,7 +557,7 @@ final int mergeDanglingTail(final DanglingChainMergeHelper danglingTailMergeResu

final int lastRefIndex = danglingTailMergeResult.cigar.getReferenceLength() - 1;
final int matchingSuffix = Math.min(longestSuffixMatch(danglingTailMergeResult.referencePathString, danglingTailMergeResult.danglingPathString, lastRefIndex), lastElement.getLength());
if (matchingSuffix == 0) {
if (minMatchingBasesToDangingEndRecovery >= 0 ? matchingSuffix < minMatchingBasesToDangingEndRecovery : matchingSuffix == 0 ) {
return 0;
}

Expand All @@ -534,19 +584,19 @@ final int mergeDanglingTail(final DanglingChainMergeHelper danglingTailMergeResu
}

/**
* Actually merge the dangling head if possible
* Actually merge the dangling head if possible, this is the old codepath that does not handle indels
*
* @param danglingHeadMergeResult the result from generating a Cigar for the dangling head against the reference
* @param danglingHeadMergeResult the result from generating a Cigar for the dangling head against the reference
* @return 1 if merge was successful, 0 otherwise
*/
@VisibleForTesting
int mergeDanglingHead(final DanglingChainMergeHelper danglingHeadMergeResult) {
int mergeDanglingHeadLegacy(final DanglingChainMergeHelper danglingHeadMergeResult) {

final List<CigarElement> elements = danglingHeadMergeResult.cigar.getCigarElements();
final CigarElement firstElement = elements.get(0);
Utils.validateArg(firstElement.getOperator() == CigarOperator.M, "The first Cigar element must be an M");

final int indexesToMerge = bestPrefixMatch(danglingHeadMergeResult.referencePathString, danglingHeadMergeResult.danglingPathString, firstElement.getLength());
final int indexesToMerge = bestPrefixMatchLegacy(danglingHeadMergeResult.referencePathString, danglingHeadMergeResult.danglingPathString, firstElement.getLength());
if (indexesToMerge <= 0) {
return 0;
}
Expand All @@ -567,6 +617,41 @@ int mergeDanglingHead(final DanglingChainMergeHelper danglingHeadMergeResult) {
return 1;
}


/**
* Actually merge the dangling head if possible
*
* @param danglingHeadMergeResult the result from generating a Cigar for the dangling head against the reference
* @return 1 if merge was successful, 0 otherwise
*/
@VisibleForTesting
int mergeDanglingHead(final DanglingChainMergeHelper danglingHeadMergeResult) {

final List<CigarElement> elements = danglingHeadMergeResult.cigar.getCigarElements();
final CigarElement firstElement = elements.get(0);
Utils.validateArg( firstElement.getOperator() == CigarOperator.M, "The first Cigar element must be an M");

final Pair<Integer, Integer> indexesToMerge = bestPrefixMatch(elements, danglingHeadMergeResult.referencePathString, danglingHeadMergeResult.danglingPathString);
if ( indexesToMerge.getKey() <= 0 || indexesToMerge.getValue() <= 0 ) {
return 0;
}

// we can't push back the reference path
if ( indexesToMerge.getKey() >= danglingHeadMergeResult.referencePath.size() - 1 ) {
return 0;
}

// but we can manipulate the dangling path if we need to
if ( indexesToMerge.getValue() >= danglingHeadMergeResult.danglingPath.size() &&
! extendDanglingPathAgainstReference(danglingHeadMergeResult, indexesToMerge.getValue() - danglingHeadMergeResult.danglingPath.size() + 2) ) {
return 0;
}

addEdge(danglingHeadMergeResult.referencePath.get(indexesToMerge.getKey()), danglingHeadMergeResult.danglingPath.get(indexesToMerge.getValue()), ((MyEdgeFactory)getEdgeFactory()).createEdge(false, 1));

return 1;
}

/**
* Generates the CIGAR string from the Smith-Waterman alignment of the dangling path (where the
* provided vertex is the sink) and the reference path.
Expand Down Expand Up @@ -782,8 +867,8 @@ byte[] getBasesForPath(final List<MultiDeBruijnVertex> path, final boolean expan
* @param maxIndex the maximum index to traverse (not inclusive)
* @return the index of the ideal prefix match or -1 if it cannot find one, must be less than maxIndex
*/
private int bestPrefixMatch(final byte[] path1, final byte[] path2, final int maxIndex) {
final int maxMismatches = getMaxMismatches(maxIndex);
private int bestPrefixMatchLegacy(final byte[] path1, final byte[] path2, final int maxIndex) {
final int maxMismatches = getMaxMismatchesLegacy(maxIndex);
int mismatches = 0;
int index = 0;
int lastGoodIndex = -1;
Expand All @@ -801,13 +886,15 @@ private int bestPrefixMatch(final byte[] path1, final byte[] path2, final int ma
}

/**
* NOTE: this method is only used for dangling heads and not tails.
*
* Determine the maximum number of mismatches permitted on the branch.
* Unless it's preset (e.g. by unit tests) it should be the length of the branch divided by the kmer size.
*
* @param lengthOfDanglingBranch the length of the branch itself
* @return positive integer
*/
private int getMaxMismatches(final int lengthOfDanglingBranch) {
private int getMaxMismatchesLegacy(final int lengthOfDanglingBranch) {
return maxMismatchesInDanglingHead > 0 ? maxMismatchesInDanglingHead : Math.max(1, (lengthOfDanglingBranch / kmerSize));
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,16 +39,17 @@ public class JunctionTreeLinkedDeBruijnGraph extends AbstractReadThreadingGraph
// TODO should this be constructed here or elsewhere
private final Set<Kmer> kmers = new HashSet<>();

@VisibleForTesting
public JunctionTreeLinkedDeBruijnGraph(int kmerSize) {
this(kmerSize, false, (byte)6, 1);
this(kmerSize, false, (byte)6, 1, -1);
}

/**
* Create a new ReadThreadingAssembler using kmerSize for matching
* @param kmerSize must be >= 1
*/
JunctionTreeLinkedDeBruijnGraph(final int kmerSize, final boolean debugGraphTransformations, final byte minBaseQualityToUseInAssembly, final int numPruningSamples) {
super(kmerSize, debugGraphTransformations, minBaseQualityToUseInAssembly, numPruningSamples);
JunctionTreeLinkedDeBruijnGraph(final int kmerSize, final boolean debugGraphTransformations, final byte minBaseQualityToUseInAssembly, final int numPruningSamples, final int numDanglingMatchingPrefixBases) {
super(kmerSize, debugGraphTransformations, minBaseQualityToUseInAssembly, numPruningSamples, numDanglingMatchingPrefixBases);
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ public final class ReadThreadingAssembler {
protected byte minBaseQualityToUseInAssembly = DEFAULT_MIN_BASE_QUALITY_TO_USE;
private int pruneFactor;
private final ChainPruner<MultiDeBruijnVertex, MultiSampleEdge> chainPruner;
private int minMatchingBasesToDanglingEndRecovery;

private File debugGraphOutputPath = null; //Where to write debug graphs, if unset it defaults to the current working dir
private File graphOutputPath = null;
Expand All @@ -77,7 +78,8 @@ public ReadThreadingAssembler(final int maxAllowedPathsForReadThreadingAssembler
final boolean dontIncreaseKmerSizesForCycles, final boolean allowNonUniqueKmersInRef,
final int numPruningSamples, final int pruneFactor, final boolean useAdaptivePruning,
final double initialErrorRateForPruning, final double pruningLogOddsThreshold,
final double pruningSeedingLogOddsThreshold, final int maxUnprunedVariants, final boolean useLinkedDebruijnGraphs, final boolean enableLegacyGraphCycleDetection) {
final double pruningSeedingLogOddsThreshold, final int maxUnprunedVariants, final boolean useLinkedDebruijnGraphs,
final boolean enableLegacyGraphCycleDetection, final int minMachingBasesToDanglngEndRecovery) {
Utils.validateArg( maxAllowedPathsForReadThreadingAssembler >= 1, "numBestHaplotypesPerGraph should be >= 1 but got " + maxAllowedPathsForReadThreadingAssembler);
this.kmerSizes = kmerSizes.stream().sorted(Integer::compareTo).collect(Collectors.toList());
this.dontIncreaseKmerSizesForCycles = dontIncreaseKmerSizesForCycles;
Expand All @@ -93,18 +95,25 @@ public ReadThreadingAssembler(final int maxAllowedPathsForReadThreadingAssembler
chainPruner = useAdaptivePruning ? new AdaptiveChainPruner<>(initialErrorRateForPruning, pruningLogOddsThreshold, pruningSeedingLogOddsThreshold, maxUnprunedVariants) :
new LowWeightChainPruner<>(pruneFactor);
numBestHaplotypesPerGraph = maxAllowedPathsForReadThreadingAssembler;
this.minMatchingBasesToDanglingEndRecovery = minMachingBasesToDanglngEndRecovery;
}

@VisibleForTesting
ReadThreadingAssembler(final int maxAllowedPathsForReadThreadingAssembler, final List<Integer> kmerSizes, final int pruneFactor) {
this(maxAllowedPathsForReadThreadingAssembler, kmerSizes, true, true, 1, pruneFactor, false, 0.001, 2, 2, Integer.MAX_VALUE, false, false);
this(maxAllowedPathsForReadThreadingAssembler, kmerSizes, true, true, 1, pruneFactor, false, 0.001, 2, 2, Integer.MAX_VALUE, false, false, 3);
}

@VisibleForTesting
ReadThreadingAssembler() {
this(DEFAULT_NUM_PATHS_PER_GRAPH, Arrays.asList(25), 2);
}

// this method should not be used, only exposed for testing purposes. This should be set in the constructor.
@VisibleForTesting
void setMinMatchingBasesToDanglingEndRecovery(final int mimMatchingBases) {
this.minMatchingBasesToDanglingEndRecovery = mimMatchingBases;
}

/**
* Main entry point into the assembly engine. Build a set of deBruijn graphs out of the provided reference sequence and list of reads
* @param assemblyRegion AssemblyRegion object holding the reads which are to be used during assembly
Expand Down Expand Up @@ -609,8 +618,8 @@ private AssemblyResult createGraph(final Iterable<GATKRead> reads,
}

// TODO figure out how you want to hook this in
final AbstractReadThreadingGraph rtgraph = generateSeqGraph ? new ReadThreadingGraph(kmerSize, debugGraphTransformations, minBaseQualityToUseInAssembly, numPruningSamples) :
new JunctionTreeLinkedDeBruijnGraph(kmerSize, debugGraphTransformations, minBaseQualityToUseInAssembly, numPruningSamples);
final AbstractReadThreadingGraph rtgraph = generateSeqGraph ? new ReadThreadingGraph(kmerSize, debugGraphTransformations, minBaseQualityToUseInAssembly, numPruningSamples, minMatchingBasesToDanglingEndRecovery) :
new JunctionTreeLinkedDeBruijnGraph(kmerSize, debugGraphTransformations, minBaseQualityToUseInAssembly, numPruningSamples, minMatchingBasesToDanglingEndRecovery);

rtgraph.setThreadingStartOnlyAtExistingVertex(!recoverDanglingBranches);

Expand Down
Loading

0 comments on commit c305078

Please sign in to comment.