
Fixed bugs and simplified AlleleLikelihoods evidence-to-index cache #6593

Merged: 3 commits, May 26, 2020
@@ -10,7 +10,6 @@
import org.broadinstitute.hellbender.utils.MathUtils;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.downsampling.AlleleBiasedDownsamplingUtils;
-import org.broadinstitute.hellbender.utils.pileup.PileupElement;

import java.util.*;
import java.util.function.Function;
@@ -77,6 +76,8 @@ private double getInformativeThreshold() {
     *
     * <p>In order to save CPU time the indices contained in this array (not the array itself) is
     * lazily initialized by invoking {@link #evidenceIndexBySampleIndex(int)}.</p>
+    *
+    * <p>Whenever a sample's evidence list is modified this cache must either be updated or invalidated by {@link #invalidateEvidenceToIndexCache(int)}.</p>
     */
    protected final List<Object2IntMap<EVIDENCE>> evidenceIndexBySampleIndex;
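To make the contract in that new javadoc concrete, here is a minimal, self-contained sketch (illustrative only, not the GATK class; it uses a plain HashMap instead of fastutil's Object2IntMap) of a lazily built element-to-index cache that is updated in place on append and invalidated on removal:

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Sketch of the cache discipline described above: build lazily, update in place when
// appending (indices of existing elements do not move), invalidate when removing.
class LazilyIndexedList<T> {
    private final List<T> elements = new ArrayList<>();
    private Map<T, Integer> indexCache = null;    // null means "not built yet" or "invalidated"

    int indexOf(final T element) {
        if (indexCache == null) {                 // lazy (re)build on first query
            indexCache = new HashMap<>();
            for (int i = 0; i < elements.size(); i++) {
                indexCache.put(elements.get(i), i);
            }
        }
        return indexCache.getOrDefault(element, -1);
    }

    void append(final T element) {
        if (indexOf(element) >= 0) {              // duplicates are not re-added
            return;
        }
        indexCache.put(element, elements.size()); // safe to update in place: appending shifts nothing
        elements.add(element);
    }

    void removeAt(final int index) {
        elements.remove(index);
        indexCache = null;                        // indices after 'index' have shifted, so invalidate
    }
}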

@@ -737,9 +738,10 @@ public void addEvidence(final Map<String,List<EVIDENCE>> evidenceBySample, final
                continue;
            }

-           final int initialNumberOfEvidences = evidenceBySampleIndex.get(sampleIndex).size();
-           final List<EVIDENCE> newlyAddedEvidence = appendEvidence(newSampleEvidence, sampleIndex);
-           extendsLikelihoodArrays(initialLikelihood, sampleIndex, initialNumberOfEvidences, initialNumberOfEvidences + newlyAddedEvidence.size());
+           final int oldEvidenceCount = evidenceBySampleIndex.get(sampleIndex).size();
+           appendEvidence(newSampleEvidence, sampleIndex);
+           final int newEvidenceCount = evidenceBySampleIndex.get(sampleIndex).size();
+           extendsLikelihoodArrays(initialLikelihood, sampleIndex, oldEvidenceCount, newEvidenceCount);
        }
    }
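A quick illustration of why the new bookkeeping is the safer choice (hypothetical names, not GATK code): because appendEvidence silently skips duplicates, the number of likelihood columns to grow should come from how much the sample's evidence list actually grew, not from the size of the input list:

import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;

public class AppendCountSketch {
    public static void main(String[] args) {
        final List<String> evidence = new ArrayList<>(List.of("read1", "read2"));
        final Set<String> seen = new LinkedHashSet<>(evidence);

        final List<String> incoming = List.of("read2", "read3"); // "read2" is a duplicate

        final int oldCount = evidence.size();
        for (final String e : incoming) {
            if (seen.add(e)) {        // only genuinely new evidence is appended
                evidence.add(e);
            }
        }
        final int newCount = evidence.size();

        // The likelihood arrays should grow by newCount - oldCount (1 here),
        // not by incoming.size() (2 here).
        System.out.println(newCount - oldCount);
    }
}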

@@ -758,46 +760,24 @@ private void extendsLikelihoodArrays(final double initialLikelihood, final int s
        }
    }

-   // Append the new evidence reference into the structure per-sample.
-   private List<EVIDENCE> appendEvidence(final List<EVIDENCE> newSampleEvidence, final int sampleIndex) {
+   // Append the new evidence reference into the structure per-sample, returning the count of evidence actually added (duplicates are not added)
+   // NOTE: the evidence-to-index cache is updated in place and not invalidated via {@link #invalidateEvidenceToIndexCache(int)} because adding new evidence
+   // to the cache, as opposed to removing evidence, is just a matter of appending entries
+   private void appendEvidence(final List<EVIDENCE> newSampleEvidence, final int sampleIndex) {

        final List<EVIDENCE> sampleEvidence = evidenceBySampleIndex.get(sampleIndex);
        final Object2IntMap<EVIDENCE> sampleEvidenceIndex = evidenceIndexBySampleIndex(sampleIndex);

-       // actually-added will have the list of evidence added at the end of this method.
-       // this won't include those that were already in the table.
-       // being optimistic we assume that there is no repeats in the input new evidence so we set it to
-       // the input list but if we found something we then start a new list.
-       List<EVIDENCE> actuallyAdded = newSampleEvidence;
-
-       int i, nextIndex = sampleEvidence.size();
-       final int stop = newSampleEvidence.size();
-       for (i = 0; i < stop; i++) {
-           final EVIDENCE newEvidence = newSampleEvidence.get(i);
-           final int previousValue = sampleEvidenceIndex.put(newEvidence, nextIndex);
+       for (final EVIDENCE newEvidence : newSampleEvidence) {
+           final int previousValue = sampleEvidenceIndex.put(newEvidence, sampleEvidence.size());
            if (previousValue == MISSING_INDEX) {
-               nextIndex++;
                sampleEvidence.add(newEvidence);
            } else {
-               actuallyAdded = new ArrayList<>(newSampleEvidence.subList(0, i));
-               i++; // skip the repeated element.
-               break;
-           }
-       }
-       // second for below only use if we encounter some evidence that is already in the table:
-       for (; i < stop; i++) {
-           final EVIDENCE newEvidence = newSampleEvidence.get(i);
-           final int previousValue = sampleEvidenceIndex.put(newEvidence, nextIndex);
-           if (previousValue == MISSING_INDEX) {
-               nextIndex++;
-               sampleEvidence.add(newEvidence);
-               actuallyAdded.add(newEvidence);
-           } else {
                sampleEvidenceIndex.put(newEvidence, previousValue); // revert
            }
        }

        numberOfEvidences[sampleIndex] = sampleEvidence.size();
-       return actuallyAdded;
    }

Contributor:
nextIndex contains the value you need here.
perhaps it would be better to call this variable currentSize

Contributor Author:
I agree, but now that you mention it I think it's even better to drop nextIndex entirely and replace it with sampleEvidence.size().
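The appendEvidence loop above leans on a fastutil idiom: Object2IntMap.put returns the previous value mapped to the key, or the configured defaultReturnValue when the key was absent, so the code can optimistically claim the next index and revert on a duplicate. A small self-contained sketch of that idiom (assumes the it.unimi.dsi.fastutil dependency; names are illustrative, not GATK code):

import it.unimi.dsi.fastutil.objects.Object2IntMap;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;

import java.util.ArrayList;
import java.util.List;

public class PutThenRevertSketch {
    private static final int MISSING_INDEX = -1;

    public static void main(String[] args) {
        final List<String> evidence = new ArrayList<>(List.of("readA"));
        final Object2IntMap<String> index = new Object2IntOpenHashMap<>();
        index.defaultReturnValue(MISSING_INDEX);
        index.put("readA", 0);

        for (final String incoming : List.of("readB", "readA")) {
            // Optimistically map the element to the next slot; put returns the old
            // mapping, or MISSING_INDEX if there was none.
            final int previous = index.put(incoming, evidence.size());
            if (previous == MISSING_INDEX) {
                evidence.add(incoming);           // genuinely new: keep the new mapping
            } else {
                index.put(incoming, previous);    // duplicate: revert to its original index
            }
        }

        System.out.println(evidence);  // [readA, readB]
        System.out.println(index);     // readA -> 0, readB -> 1
    }
}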

/**
@@ -1129,60 +1109,41 @@ private void removeEvidence(final int sampleIndex, final Collection<EVIDENCE> ev
        removeEvidenceByIndex(sampleIndex, indexesToRemove);
    }

+   // remove evidence and unset the {@code evidenceIndexBySampleIndex} cache for this sample
    // assumes that evidencesToRemove is sorted and without duplicates.
    private void removeEvidenceByIndex(final int sampleIndex, final int[] evidencesToRemove) {
-       if (evidencesToRemove.length == 0) {
-           return;
-       } else {
-           final Object2IntMap<EVIDENCE> evidenceIndex = evidenceIndexBySampleIndex(sampleIndex);
-           final int oldEvidenceCount = numberOfEvidences[sampleIndex];
-           final int newEvidenceCount = oldEvidenceCount - evidencesToRemove.length;
-           if (newEvidenceCount < 0) {
-               throw new IllegalStateException("attempt to remove non-existent evidence or repeated evidence");
-           } else if (newEvidenceCount == 0) { // taking in consideration the assumption we simply remove
-               // all evidence.
-               evidenceIndex.clear();
-               numberOfEvidences[sampleIndex] = 0;
-           } else {
-               final List<EVIDENCE> evidences = evidenceBySampleIndex.get(sampleIndex);
-               final double[][] values = valuesBySampleIndex[sampleIndex];
-
-               int nextIndexToRemove = evidencesToRemove[0];
-               if (nextIndexToRemove < 0) {
-                   throw new IllegalStateException("invalid input index array as it contains negatives");
-               }
-               evidenceIndex.remove(evidences.get(nextIndexToRemove));
-               for (int etrIndex = 1, to = nextIndexToRemove, from = to + 1; to < newEvidenceCount; etrIndex++, from++) {
-                   if (etrIndex < evidencesToRemove.length) {
-                       nextIndexToRemove = evidencesToRemove[etrIndex];
-                       if (nextIndexToRemove < from) {
-                           throw new IllegalStateException("invalid input index array contains indexes out of order");
-                       } else if (nextIndexToRemove >= oldEvidenceCount) {
-                           throw new IllegalStateException("invalid input index array contains indexes out of order");
-                       }
-                       evidenceIndex.remove(evidences.get(nextIndexToRemove));
-                   } else {
-                       nextIndexToRemove = oldEvidenceCount;
-                   }
-                   for (; from < nextIndexToRemove; from++) {
-                       final EVIDENCE evidence = evidences.get(from);
-                       evidences.set(to, evidence);
-                       evidenceIndex.put(evidence, to++);
-                   }
-               }
-               Utils.truncate(evidences, newEvidenceCount);
-               // now we do the likelihood values, we save ourselves all the checks:
-               for (final double[] alleleValues : values) {
-                   for (int etrIndex = 1, to = evidencesToRemove[0], from = to + 1; to < newEvidenceCount; from++, etrIndex++) {
-                       nextIndexToRemove = etrIndex < evidencesToRemove.length ? evidencesToRemove[etrIndex] : oldEvidenceCount;
-                       for (; from < nextIndexToRemove; from++) {
-                           alleleValues[to++] = alleleValues[from];
-                       }
-                   }
-               }
-               numberOfEvidences[sampleIndex] = newEvidenceCount;
-           }
-       }
+       final int numToRemove = evidencesToRemove.length;
+       if (numToRemove == 0) {
+           return;
+       }
+
+       final int oldEvidenceCount = numberOfEvidences[sampleIndex];
+       final int newEvidenceCount = oldEvidenceCount - evidencesToRemove.length;
+
+       // update the list of evidence and evidence count
+       final List<EVIDENCE> oldEvidence = evidenceBySampleIndex.get(sampleIndex);
+       final List<EVIDENCE> newEvidence = new ArrayList<>(newEvidenceCount);
+       for (int n = 0, numRemoved = 0; n < oldEvidenceCount; n++) {
+           if (numRemoved < numToRemove && n == evidencesToRemove[numRemoved]) {
+               numRemoved++;
+           } else {
+               newEvidence.add(oldEvidence.get(n));
+
+               // update the likelihoods arrays in place
+               for (final double[] alleleValues : valuesBySampleIndex[sampleIndex]) {
+                   alleleValues[n - numRemoved] = alleleValues[n];
+               }
+           }
+       }
+       evidenceBySampleIndex.set(sampleIndex, newEvidence);
+       numberOfEvidences[sampleIndex] = newEvidenceCount;
+
+       invalidateEvidenceToIndexCache(sampleIndex);
    }

Contributor:
You can be a bit more efficient by putting the next to skip "on-deck":

for (int n = 0, numRemoved = 0, nextToRemove = evidencesToRemove[0]; n < oldEvidenceCount; n++) {
    if (n == nextToRemove) {
        nextToRemove = ++numRemoved < evidencesToRemove.length ? evidencesToRemove[numRemoved] : -1;
    } else {
        newEvidence.add(oldEvidence.get(n));
    }
}

Contributor Author:
That's pretty neat but I think I'll keep the loop simpler as-is.

+   // The evidenceToIndex map becomes invalid when the evidence list is modified, for example by deleting evidence
+   // When adding evidence it is simple enough to add new entries to the map, but we must be careful to do so.
+   private void invalidateEvidenceToIndexCache(final int sampleIndex) {
+       evidenceIndexBySampleIndex.set(sampleIndex, null);
+   }
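The new removal loop in this hunk makes a single pass over the old evidence, skipping the sorted, duplicate-free indices to drop and compacting the surviving likelihood columns to the left as it goes. A stripped-down sketch of that pattern on a plain list and a single row of values (illustrative only, not the GATK method):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class RemoveByIndexSketch {
    // Remove the (sorted, duplicate-free) positions in indicesToRemove from both the
    // element list and the parallel row of values, compacting the values in place.
    static <T> List<T> removeByIndex(final List<T> elements, final double[] values, final int[] indicesToRemove) {
        final int newCount = elements.size() - indicesToRemove.length;
        final List<T> kept = new ArrayList<>(newCount);
        for (int n = 0, numRemoved = 0; n < elements.size(); n++) {
            if (numRemoved < indicesToRemove.length && n == indicesToRemove[numRemoved]) {
                numRemoved++;                        // skip this element entirely
            } else {
                kept.add(elements.get(n));
                values[n - numRemoved] = values[n];  // shift the surviving column left
            }
        }
        return kept;
    }

    public static void main(String[] args) {
        final List<String> reads = new ArrayList<>(List.of("r0", "r1", "r2", "r3"));
        final double[] likelihoods = {-0.1, -0.2, -0.3, -0.4};
        final List<String> kept = removeByIndex(reads, likelihoods, new int[]{1, 3});
        System.out.println(kept);                                                      // [r0, r2]
        System.out.println(Arrays.toString(Arrays.copyOf(likelihoods, kept.size())));  // [-0.1, -0.3]
    }
}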

/**
@@ -1215,21 +1176,21 @@ public void filterPoorlyModeledEvidence(final ToDoubleFunction<EVIDENCE> log10Mi


    private Object2IntMap<EVIDENCE> evidenceIndexBySampleIndex(final int sampleIndex) {
-       if (evidenceIndexBySampleIndex.get(sampleIndex) == null) {
-           final List<EVIDENCE> sampleEvidence = evidenceBySampleIndex.get(sampleIndex);
-           final int sampleEvidenceCount = sampleEvidence.size();
-           final Object2IntMap<EVIDENCE> index = new Object2IntOpenHashMap<>(sampleEvidenceCount);
-           index.defaultReturnValue(MISSING_INDEX);
-           evidenceIndexBySampleIndex.set(sampleIndex, index);
-           for (int r = 0; r < sampleEvidenceCount; r++) {
-               index.put(sampleEvidence.get(r), r);
-           }
-           return index;
-       } else {
-           return evidenceIndexBySampleIndex.get(sampleIndex);
-       }
+       final Object2IntMap<EVIDENCE> cached = evidenceIndexBySampleIndex.get(sampleIndex);
+       return cached == null ? fillEvidenceToIndexCache(sampleIndex) : cached;
    }

+   private Object2IntMap<EVIDENCE> fillEvidenceToIndexCache(int sampleIndex) {
+       final List<EVIDENCE> sampleEvidence = evidenceBySampleIndex.get(sampleIndex);
+       final int sampleEvidenceCount = sampleEvidence.size();
+       final Object2IntMap<EVIDENCE> index = new Object2IntOpenHashMap<>(sampleEvidenceCount);
+       index.defaultReturnValue(MISSING_INDEX);
+       for (int r = 0; r < sampleEvidenceCount; r++) {
+           index.put(sampleEvidence.get(r), r);
+       }
+       evidenceIndexBySampleIndex.set(sampleIndex, index);
+       return index;
+   }

Contributor:
what about using a foreach:

int nextIdx = 0;
for( final EVIDENCE evi : sampleEvidence) {
    index.put(evi, nextIdx++);
}

Contributor Author:
I like the foreach but my distaste for introducing a variable outside the scope of the for loop is stronger.

/**
* Implements a likelihood matrix per sample given its index.
@@ -2,8 +2,6 @@

import htsjdk.samtools.SAMFileHeader;
import htsjdk.variant.variantcontext.Allele;
-import org.apache.commons.lang3.tuple.ImmutablePair;
-import org.apache.commons.lang3.tuple.Pair;
import org.apache.commons.math3.stat.descriptive.rank.Median;
import org.broadinstitute.hellbender.utils.*;
import org.broadinstitute.hellbender.utils.read.ArtificialReadUtils;
@@ -14,6 +12,8 @@
import org.testng.annotations.Test;

import java.util.*;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;

/**
* Test code for {@link AlleleLikelihoods}
@@ -174,8 +174,13 @@ public void testFilterPoorlyModeledReads(final String[] samples, final Allele[]
        final AlleleLikelihoods<GATKRead, Allele> original = makeGoodAndBadLikelihoods(samples, alleles, reads);

        final AlleleLikelihoods<GATKRead, Allele> result = makeGoodAndBadLikelihoods(samples, alleles, reads);
+
+       checkEvidenceToIndexMapIsCorrect(result);
+
        result.filterPoorlyModeledEvidence(read -> -100);
+
+       checkEvidenceToIndexMapIsCorrect(result);

        for (int s = 0; s < samples.length; s++) {
            final int oldSampleReadCount = original.sampleEvidenceCount(s);
            final int newSampleReadCount = result.sampleEvidenceCount(s);
@@ -198,9 +203,13 @@ public void testFilterReadsToOverlap(final String[] samples, final Allele[] alle
        fillWithRandomLikelihoods(samples, alleles, original, result);

        final SimpleInterval evenReadOverlap = new SimpleInterval(SAM_HEADER.getSequenceDictionary().getSequences().get(0).getSequenceName(), EVEN_READ_START, EVEN_READ_START);
+
+       checkEvidenceToIndexMapIsCorrect(result);
        result.retainEvidence(evenReadOverlap::overlaps);
+       checkEvidenceToIndexMapIsCorrect(result);
+
        final double[][][] newLikelihoods = new double[samples.length][alleles.length][];
-       for (int s = 0; s < samples.length ; s++)
+       for (int s = 0; s < samples.length ; s++) {
            for (int a = 0; a < alleles.length; a++) {
                newLikelihoods[s][a] = new double[(original.sampleEvidenceCount(s) + 1) / 2];
                final LikelihoodMatrix<GATKRead, Allele> sampleMatrix = original.sampleMatrix(s);
@@ -209,9 +218,23 @@ public void testFilterReadsToOverlap(final String[] samples, final Allele[] alle
                    newLikelihoods[s][a][r] = sampleMatrix.get(a, r << 1);
                }
            }
+       }
        testLikelihoodMatrixQueries(samples,result,newLikelihoods);
    }

+   @Test(dataProvider = "dataSets")
+   public void testContaminationDownsamplingDoesntDoBadThingsToEvidenceToIndexCache(final String[] samples, final Allele[] alleles, final Map<String,List<GATKRead>> reads) {
+       final AlleleLikelihoods<GATKRead, Allele> original = new AlleleLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads);
+       final AlleleLikelihoods<GATKRead, Allele> result = new AlleleLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads);
+       fillWithRandomLikelihoods(samples, alleles, original, result);
+
+       checkEvidenceToIndexMapIsCorrect(result);
+       final double downsamplingFraction = 0.5;
+       final Map<String, Double> perSampleDownsamplingFractions = Arrays.stream(samples).collect(Collectors.toMap(s -> s, s -> downsamplingFraction));
+       result.contaminationDownsampling(perSampleDownsamplingFractions);
+       checkEvidenceToIndexMapIsCorrect(result);
+   }

    @Test(dataProvider = "marginalizationDataSets")
    public void testMarginalizationWithOverlap(final String[] samples, final Allele[] alleles, final Map<String,List<GATKRead>> reads, final Map<Allele,List<Allele>> newToOldAlleleMapping) {
        final AlleleLikelihoods<GATKRead, Allele> original = new AlleleLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads);
@@ -570,7 +593,9 @@ public void testInstantiationAndBasicQueries1(final int[] readCounts, final int

        final Map<String,List<GATKRead>> sampleToReads = ReadLikelihoodsUnitTester.sampleToReads(sampleList, readCounts);
        final double expectedLik = -0.2;
+       checkEvidenceToIndexMapIsCorrect(subject);
        subject.addEvidence(sampleToReads, expectedLik);
+       checkEvidenceToIndexMapIsCorrect(subject);

        AlleleListUnitTester.assertAlleleList(subject, alleleList.asListOfAlleles());
        SampleListUnitTester.assertSampleList(subject,sampleList.asListOfSamples());
@@ -588,6 +613,18 @@ public void testInstantiationAndBasicQueries1(final int[] readCounts, final int
        testSampleQueries(sampleList, sampleToReads, subject);
    }

+   // Make sure that operations that add or remove evidence result in a correct evidence-to-index map.
+   // Some such operations update the map; others invalidate it and leave it to be regenerated lazily the next time it is queried.
+   // Either way, we want the queries after the mutating operations to be correct.
+   private void checkEvidenceToIndexMapIsCorrect(AlleleLikelihoods<GATKRead, Allele> subject) {
+       // test that the evidence-to-index cache is valid after adding reads
+       for (int s = 0; s < subject.numberOfSamples(); s++) {
+           for (int r = 0; r < subject.sampleEvidenceCount(s); r++) {
+               Assert.assertEquals(subject.evidenceIndex(s, subject.sampleEvidence(s).get(r)), r);
+           }
+       }
+   }


@Test(dataProvider="readCountsAndnumberOfAllelesDataSkippingNoLikelihoodsOrNoAlleleAndWithReference")
public void testLikelihoodWriting(final int[] readCounts, final int numberOfAlleles, final boolean hasReference) {