Skip to content

Commit

Permalink
Index output VCFs for GCNV postprocessing (#6330)
Browse files Browse the repository at this point in the history
* Index output VCFs for GCNV postprocessing
  • Loading branch information
ldgauthier authored Jan 7, 2020
1 parent 6765e0e commit 0c376e9
Show file tree
Hide file tree
Showing 6 changed files with 101 additions and 19 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,16 @@ public final class PostprocessGermlineCNVCalls extends GATKTool {
*/
private Set<String> allosomalContigSet;

/**
* Intervals extracted from call shards, unsorted
*/
private List<SimpleIntervalCollection> unsortedIntervalCollectionsFromCalls;

/**
* Intervals extracted from model shards, unsorted
*/
private List<SimpleIntervalCollection> unsortedIntervalCollectionsFromModels;

/**
* Call shard directories put in correct order
*/
Expand All @@ -240,16 +250,15 @@ public void onStartup() {
}

/**
* Performs various validations on input arguments. Since many of these validations requires loading and parsing
* reusable data, we store them as global variables (shard interval lists, sample name, etc).
* Inputs to this tool are directories, not files, so we need a custom method to find an appropriate file inside the
* directories to pull a dictionary out of
* @return a SAM sequence dictionary that is consistent for all shards
*/
@Override
public void onTraversalStart() {
validateArguments();

numShards = inputUnsortedCallsShardPaths.size();

/* get intervals from each call and model shard in the provided (potentially arbitrary) order */
public SAMSequenceDictionary getBestAvailableSequenceDictionary() {
if (sequenceDictionary != null) {
return sequenceDictionary;
}
final List<SimpleIntervalCollection> unsortedIntervalCollectionsFromCalls =
getIntervalCollectionsFromPaths(inputUnsortedCallsShardPaths);
final List<SimpleIntervalCollection> unsortedIntervalCollectionsFromModels =
Expand All @@ -272,22 +281,37 @@ public void onTraversalStart() {
"The SAM sequence dictionary is either not the same for all of the model shards, " +
"or is different from the SAM sequence dictionary of calls shards.");

return sequenceDictionary;
}

/**
* Performs various validations on input arguments. Since many of these validations requires loading and parsing
* reusable data, we store them as global variables (shard interval lists, sample name, etc).
*/
@Override
public void onTraversalStart() {
validateArguments();

numShards = inputUnsortedCallsShardPaths.size();

sequenceDictionary = getBestAvailableSequenceDictionary();

/* get the correct shard sort order and sort all collections */
final List<Integer> sortedCallShardsOrder = AbstractLocatableCollection.getShardedCollectionSortOrder(
unsortedIntervalCollectionsFromCalls);
getUnsortedIntervalCollectionsFromCalls());
final List<Integer> sortedModelShardsOrder = AbstractLocatableCollection.getShardedCollectionSortOrder(
unsortedIntervalCollectionsFromModels);
getUnsortedIntervalCollectionsFromModels());
sortedCallsShardPaths = sortedCallShardsOrder.stream()
.map(inputUnsortedCallsShardPaths::get)
.collect(Collectors.toList());
sortedModelShardPaths = sortedModelShardsOrder.stream()
.map(inputUnsortedModelShardPaths::get)
.collect(Collectors.toList());
final List<SimpleIntervalCollection> sortedIntervalCollectionsFromCalls = sortedCallShardsOrder.stream()
.map(unsortedIntervalCollectionsFromCalls::get)
.map(getUnsortedIntervalCollectionsFromCalls()::get)
.collect(Collectors.toList());
final List<SimpleIntervalCollection> sortedIntervalCollectionsFromModels = sortedModelShardsOrder.stream()
.map(unsortedIntervalCollectionsFromModels::get)
.map(getUnsortedIntervalCollectionsFromModels()::get)
.collect(Collectors.toList());
Utils.validateArg(sortedIntervalCollectionsFromCalls.equals(sortedIntervalCollectionsFromModels),
"The interval lists found in model and call shards do not match. Make sure that the calls and model " +
Expand Down Expand Up @@ -353,7 +377,7 @@ private void generateIntervalsVCFFileFromAllShards() {
final GermlineCNVIntervalVariantComposer germlineCNVIntervalVariantComposer =
new GermlineCNVIntervalVariantComposer(intervalsVCFWriter, sampleName,
refAutosomalIntegerCopyNumberState, allosomalContigSet);
germlineCNVIntervalVariantComposer.composeVariantContextHeader(getDefaultToolVCFHeaderLines());
germlineCNVIntervalVariantComposer.composeVariantContextHeader(sequenceDictionary, getDefaultToolVCFHeaderLines());

logger.info(String.format("Writing intervals VCF file to %s...", outputIntervalsVCFFile.getAbsolutePath()));
for (int shardIndex = 0; shardIndex < numShards; shardIndex++) {
Expand Down Expand Up @@ -392,7 +416,7 @@ private void generateSegmentsVCFFileFromAllShards() {
final GermlineCNVSegmentVariantComposer germlineCNVSegmentVariantComposer =
new GermlineCNVSegmentVariantComposer(segmentsVCFWriter, sampleName,
refAutosomalIntegerCopyNumberState, allosomalContigSet);
germlineCNVSegmentVariantComposer.composeVariantContextHeader(getDefaultToolVCFHeaderLines());
germlineCNVSegmentVariantComposer.composeVariantContextHeader(sequenceDictionary, getDefaultToolVCFHeaderLines());
germlineCNVSegmentVariantComposer.writeAll(integerCopyNumberSegmentCollection.getRecords());
segmentsVCFWriter.close();
}
Expand Down Expand Up @@ -608,4 +632,26 @@ private static File getCopyNumberSegmentsFile(final File pythonSegmenterOutputPa
GermlineCNVNamingConstants.SAMPLE_PREFIX + sampleIndex,
GermlineCNVNamingConstants.COPY_NUMBER_SEGMENTS_FILE_NAME).toFile();
}

/**
* Get intervals from each call shard in the provided (potentially arbitrary) order
* @return unsorted intervals
*/
private List<SimpleIntervalCollection> getUnsortedIntervalCollectionsFromCalls() {
if (unsortedIntervalCollectionsFromCalls == null) {
unsortedIntervalCollectionsFromCalls = getIntervalCollectionsFromPaths(inputUnsortedCallsShardPaths);
}
return unsortedIntervalCollectionsFromCalls;
}

/**
* Get intervals from each model shard in the provided (potentially arbitrary) order
* @return unsorted intervals
*/
private List<SimpleIntervalCollection> getUnsortedIntervalCollectionsFromModels() {
if (unsortedIntervalCollectionsFromModels == null) {
unsortedIntervalCollectionsFromModels = getIntervalCollectionsFromPaths(inputUnsortedModelShardPaths);
}
return unsortedIntervalCollectionsFromModels;
}
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package org.broadinstitute.hellbender.tools.copynumber.gcnv;

import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.variant.variantcontext.*;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.vcf.*;
Expand Down Expand Up @@ -63,13 +64,16 @@ public GermlineCNVIntervalVariantComposer(final VariantContextWriter outputWrite
}

@Override
public void composeVariantContextHeader(final Set<VCFHeaderLine> vcfDefaultToolHeaderLines) {
public void composeVariantContextHeader(final SAMSequenceDictionary sequenceDictionary,
final Set<VCFHeaderLine> vcfDefaultToolHeaderLines) {
final VCFHeader result = new VCFHeader(Collections.emptySet(), Collections.singletonList(sampleName));

/* add VCF version */
result.addMetaDataLine(new VCFHeaderLine(VCFHeaderVersion.VCF4_2.getFormatString(),
VCFHeaderVersion.VCF4_2.getVersionString()));

result.setSequenceDictionary(sequenceDictionary);

/* add default tool header lines */
vcfDefaultToolHeaderLines.forEach(result::addMetaDataLine);

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package org.broadinstitute.hellbender.tools.copynumber.gcnv;

import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.variant.variantcontext.*;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.vcf.*;
Expand All @@ -10,8 +11,7 @@
import org.broadinstitute.hellbender.tools.copynumber.formats.records.IntegerCopyNumberSegment;
import org.broadinstitute.hellbender.utils.Utils;

import java.util.Collections;
import java.util.Set;
import java.util.*;

/**
* Helper class for {@link PostprocessGermlineCNVCalls} for single-sample postprocessing of segmented
Expand Down Expand Up @@ -75,7 +75,8 @@ public GermlineCNVSegmentVariantComposer(final VariantContextWriter outputWriter
}

@Override
public void composeVariantContextHeader(final Set<VCFHeaderLine> vcfDefaultToolHeaderLines) {
public void composeVariantContextHeader(final SAMSequenceDictionary sequenceDictionary,
final Set<VCFHeaderLine> vcfDefaultToolHeaderLines) {
final VCFHeader result = new VCFHeader(Collections.emptySet(), Collections.singletonList(sampleName));

/* add VCF version */
Expand All @@ -85,6 +86,8 @@ public void composeVariantContextHeader(final Set<VCFHeaderLine> vcfDefaultToolH
/* add default tool header lines */
vcfDefaultToolHeaderLines.forEach(result::addMetaDataLine);

result.setSequenceDictionary(sequenceDictionary);

/* header lines related to genotype formatting */
result.addMetaDataLine(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1,
VCFHeaderLineType.Integer, "Segment genotype"));
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.broadinstitute.hellbender.tools.copynumber.gcnv;

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.Locatable;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
Expand Down Expand Up @@ -35,9 +36,10 @@ public abstract class GermlineCNVVariantComposer<DATA extends Locatable> {
/**
* Compose the header of the variant context.
*
* @param sequenceDictionary sequence dictionary to use for contig header lines
* @param vcfDefaultToolHeaderLines default header lines of the VCF generation tool
*/
abstract void composeVariantContextHeader(final Set<VCFHeaderLine> vcfDefaultToolHeaderLines);
abstract void composeVariantContextHeader(final SAMSequenceDictionary sequenceDictionary, final Set<VCFHeaderLine> vcfDefaultToolHeaderLines);

abstract VariantContext composeVariantContext(final DATA data);

Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
package org.broadinstitute.hellbender.tools.copynumber;

import htsjdk.samtools.util.FileExtensions;
import htsjdk.variant.vcf.VCFHeader;
import org.broadinstitute.hellbender.CommandLineProgramTest;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.testutils.ArgumentsBuilder;
import org.broadinstitute.hellbender.testutils.IntegrationTestSpec;
import org.broadinstitute.hellbender.testutils.VariantContextTestUtils;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

Expand Down Expand Up @@ -105,15 +109,26 @@ public void testDifferentValidInput(final int sampleIndex,
final List<String> modelShards) throws IOException {
final File actualIntervalsOutputVCF = createTempFile("intervals-output-vcf-" + sampleIndex, ".vcf");
final File actualSegmentsOutputVCF = createTempFile("segments-output-vcf-" + sampleIndex, ".vcf");
final File intervalsIndex = new File(actualIntervalsOutputVCF.getAbsolutePath() + FileExtensions.TRIBBLE_INDEX);
Assert.assertFalse(intervalsIndex.exists());
final File segmentsIndex = new File(actualIntervalsOutputVCF.getAbsolutePath() + FileExtensions.TRIBBLE_INDEX);
Assert.assertFalse(segmentsIndex.exists());
final File actualDenoisedCopyRatiosOutput = createTempFile("denoised-copy-ratios-output-" + sampleIndex, ".tsv");
final File expectedIntervalsOutputVCF = INTERVALS_VCF_CORRECT_OUTPUTS.get(sampleIndex);
final File expectedSegmentsOutputVCF = SEGMENTS_VCF_CORRECT_OUTPUTS.get(sampleIndex);
final File expectedDenoisedCopyRatiosOutput = DENOISED_COPY_RATIOS_OUTPUTS.get(sampleIndex);
runToolForSingleSample(callShards, modelShards, sampleIndex,
actualIntervalsOutputVCF, actualSegmentsOutputVCF, actualDenoisedCopyRatiosOutput,
ALLOSOMAL_CONTIGS, AUTOSOMAL_REF_COPY_NUMBER);

Assert.assertTrue(intervalsIndex.exists());
Assert.assertTrue(segmentsIndex.exists());
IntegrationTestSpec.assertEqualTextFiles(actualIntervalsOutputVCF, expectedIntervalsOutputVCF, "##");
final VCFHeader intervalsHeader = VariantContextTestUtils.getVCFHeader(actualIntervalsOutputVCF.getAbsolutePath());
Assert.assertTrue(intervalsHeader.getContigLines().size() > 0);
IntegrationTestSpec.assertEqualTextFiles(actualSegmentsOutputVCF, expectedSegmentsOutputVCF, "##");
final VCFHeader segmentsHeader = VariantContextTestUtils.getVCFHeader(actualIntervalsOutputVCF.getAbsolutePath());
Assert.assertTrue(segmentsHeader.getContigLines().size() > 0);
IntegrationTestSpec.assertEqualTextFiles(actualDenoisedCopyRatiosOutput, expectedDenoisedCopyRatiosOutput, "##");
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,18 @@ public static Pair<VCFHeader, List<VariantContext>> readEntireVCFIntoMemory(fina
}
}

public static VCFHeader getVCFHeader(final String vcfPath) {
Utils.nonNull(vcfPath);

try ( final FeatureDataSource<VariantContext> vcfReader = new FeatureDataSource<>(vcfPath) ) {
final Object header = vcfReader.getHeader();
if (!(header instanceof VCFHeader)) {
throw new IllegalArgumentException(vcfPath + " does not have a valid VCF header");
}
return (VCFHeader)header;
}
}

private static void assertAttributeEquals(final String key, final Object actual, final Object expected) {
final Object notationCorrectedActual = normalizeScientificNotation(actual);
final Object notationCorrectedExpected = normalizeScientificNotation(expected);
Expand Down

0 comments on commit 0c376e9

Please sign in to comment.