diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/PostprocessGermlineCNVCalls.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/PostprocessGermlineCNVCalls.java index ce02228d090..1af5f8b139e 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/PostprocessGermlineCNVCalls.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/PostprocessGermlineCNVCalls.java @@ -222,6 +222,16 @@ public final class PostprocessGermlineCNVCalls extends GATKTool { */ private Set allosomalContigSet; + /** + * Intervals extracted from call shards, unsorted + */ + private List unsortedIntervalCollectionsFromCalls; + + /** + * Intervals extracted from model shards, unsorted + */ + private List unsortedIntervalCollectionsFromModels; + /** * Call shard directories put in correct order */ @@ -240,16 +250,15 @@ public void onStartup() { } /** - * Performs various validations on input arguments. Since many of these validations requires loading and parsing - * reusable data, we store them as global variables (shard interval lists, sample name, etc). + * Inputs to this tool are directories, not files, so we need a custom method to find an appropriate file inside the + * directories to pull a dictionary out of + * @return a SAM sequence dictionary that is consistent for all shards */ @Override - public void onTraversalStart() { - validateArguments(); - - numShards = inputUnsortedCallsShardPaths.size(); - - /* get intervals from each call and model shard in the provided (potentially arbitrary) order */ + public SAMSequenceDictionary getBestAvailableSequenceDictionary() { + if (sequenceDictionary != null) { + return sequenceDictionary; + } final List unsortedIntervalCollectionsFromCalls = getIntervalCollectionsFromPaths(inputUnsortedCallsShardPaths); final List unsortedIntervalCollectionsFromModels = @@ -272,11 +281,26 @@ public void onTraversalStart() { "The SAM sequence dictionary is either not the same for all of the model shards, " + "or is different from the SAM sequence dictionary of calls shards."); + return sequenceDictionary; + } + + /** + * Performs various validations on input arguments. Since many of these validations requires loading and parsing + * reusable data, we store them as global variables (shard interval lists, sample name, etc). + */ + @Override + public void onTraversalStart() { + validateArguments(); + + numShards = inputUnsortedCallsShardPaths.size(); + + sequenceDictionary = getBestAvailableSequenceDictionary(); + /* get the correct shard sort order and sort all collections */ final List sortedCallShardsOrder = AbstractLocatableCollection.getShardedCollectionSortOrder( - unsortedIntervalCollectionsFromCalls); + getUnsortedIntervalCollectionsFromCalls()); final List sortedModelShardsOrder = AbstractLocatableCollection.getShardedCollectionSortOrder( - unsortedIntervalCollectionsFromModels); + getUnsortedIntervalCollectionsFromModels()); sortedCallsShardPaths = sortedCallShardsOrder.stream() .map(inputUnsortedCallsShardPaths::get) .collect(Collectors.toList()); @@ -284,10 +308,10 @@ public void onTraversalStart() { .map(inputUnsortedModelShardPaths::get) .collect(Collectors.toList()); final List sortedIntervalCollectionsFromCalls = sortedCallShardsOrder.stream() - .map(unsortedIntervalCollectionsFromCalls::get) + .map(getUnsortedIntervalCollectionsFromCalls()::get) .collect(Collectors.toList()); final List sortedIntervalCollectionsFromModels = sortedModelShardsOrder.stream() - .map(unsortedIntervalCollectionsFromModels::get) + .map(getUnsortedIntervalCollectionsFromModels()::get) .collect(Collectors.toList()); Utils.validateArg(sortedIntervalCollectionsFromCalls.equals(sortedIntervalCollectionsFromModels), "The interval lists found in model and call shards do not match. Make sure that the calls and model " + @@ -353,7 +377,7 @@ private void generateIntervalsVCFFileFromAllShards() { final GermlineCNVIntervalVariantComposer germlineCNVIntervalVariantComposer = new GermlineCNVIntervalVariantComposer(intervalsVCFWriter, sampleName, refAutosomalIntegerCopyNumberState, allosomalContigSet); - germlineCNVIntervalVariantComposer.composeVariantContextHeader(getDefaultToolVCFHeaderLines()); + germlineCNVIntervalVariantComposer.composeVariantContextHeader(sequenceDictionary, getDefaultToolVCFHeaderLines()); logger.info(String.format("Writing intervals VCF file to %s...", outputIntervalsVCFFile.getAbsolutePath())); for (int shardIndex = 0; shardIndex < numShards; shardIndex++) { @@ -392,7 +416,7 @@ private void generateSegmentsVCFFileFromAllShards() { final GermlineCNVSegmentVariantComposer germlineCNVSegmentVariantComposer = new GermlineCNVSegmentVariantComposer(segmentsVCFWriter, sampleName, refAutosomalIntegerCopyNumberState, allosomalContigSet); - germlineCNVSegmentVariantComposer.composeVariantContextHeader(getDefaultToolVCFHeaderLines()); + germlineCNVSegmentVariantComposer.composeVariantContextHeader(sequenceDictionary, getDefaultToolVCFHeaderLines()); germlineCNVSegmentVariantComposer.writeAll(integerCopyNumberSegmentCollection.getRecords()); segmentsVCFWriter.close(); } @@ -608,4 +632,26 @@ private static File getCopyNumberSegmentsFile(final File pythonSegmenterOutputPa GermlineCNVNamingConstants.SAMPLE_PREFIX + sampleIndex, GermlineCNVNamingConstants.COPY_NUMBER_SEGMENTS_FILE_NAME).toFile(); } + + /** + * Get intervals from each call shard in the provided (potentially arbitrary) order + * @return unsorted intervals + */ + private List getUnsortedIntervalCollectionsFromCalls() { + if (unsortedIntervalCollectionsFromCalls == null) { + unsortedIntervalCollectionsFromCalls = getIntervalCollectionsFromPaths(inputUnsortedCallsShardPaths); + } + return unsortedIntervalCollectionsFromCalls; + } + + /** + * Get intervals from each model shard in the provided (potentially arbitrary) order + * @return unsorted intervals + */ + private List getUnsortedIntervalCollectionsFromModels() { + if (unsortedIntervalCollectionsFromModels == null) { + unsortedIntervalCollectionsFromModels = getIntervalCollectionsFromPaths(inputUnsortedModelShardPaths); + } + return unsortedIntervalCollectionsFromModels; + } } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/gcnv/GermlineCNVIntervalVariantComposer.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/gcnv/GermlineCNVIntervalVariantComposer.java index c3bd536ac30..6c4eb69314b 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/gcnv/GermlineCNVIntervalVariantComposer.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/gcnv/GermlineCNVIntervalVariantComposer.java @@ -1,6 +1,7 @@ package org.broadinstitute.hellbender.tools.copynumber.gcnv; import com.google.common.annotations.VisibleForTesting; +import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.variant.variantcontext.*; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.vcf.*; @@ -63,13 +64,16 @@ public GermlineCNVIntervalVariantComposer(final VariantContextWriter outputWrite } @Override - public void composeVariantContextHeader(final Set vcfDefaultToolHeaderLines) { + public void composeVariantContextHeader(final SAMSequenceDictionary sequenceDictionary, + final Set vcfDefaultToolHeaderLines) { final VCFHeader result = new VCFHeader(Collections.emptySet(), Collections.singletonList(sampleName)); /* add VCF version */ result.addMetaDataLine(new VCFHeaderLine(VCFHeaderVersion.VCF4_2.getFormatString(), VCFHeaderVersion.VCF4_2.getVersionString())); + result.setSequenceDictionary(sequenceDictionary); + /* add default tool header lines */ vcfDefaultToolHeaderLines.forEach(result::addMetaDataLine); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/gcnv/GermlineCNVSegmentVariantComposer.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/gcnv/GermlineCNVSegmentVariantComposer.java index cbb0eb43a12..9dfa515214b 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/gcnv/GermlineCNVSegmentVariantComposer.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/gcnv/GermlineCNVSegmentVariantComposer.java @@ -1,6 +1,7 @@ package org.broadinstitute.hellbender.tools.copynumber.gcnv; import com.google.common.annotations.VisibleForTesting; +import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.variant.variantcontext.*; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.vcf.*; @@ -10,8 +11,7 @@ import org.broadinstitute.hellbender.tools.copynumber.formats.records.IntegerCopyNumberSegment; import org.broadinstitute.hellbender.utils.Utils; -import java.util.Collections; -import java.util.Set; +import java.util.*; /** * Helper class for {@link PostprocessGermlineCNVCalls} for single-sample postprocessing of segmented @@ -75,7 +75,8 @@ public GermlineCNVSegmentVariantComposer(final VariantContextWriter outputWriter } @Override - public void composeVariantContextHeader(final Set vcfDefaultToolHeaderLines) { + public void composeVariantContextHeader(final SAMSequenceDictionary sequenceDictionary, + final Set vcfDefaultToolHeaderLines) { final VCFHeader result = new VCFHeader(Collections.emptySet(), Collections.singletonList(sampleName)); /* add VCF version */ @@ -85,6 +86,8 @@ public void composeVariantContextHeader(final Set vcfDefaultToolH /* add default tool header lines */ vcfDefaultToolHeaderLines.forEach(result::addMetaDataLine); + result.setSequenceDictionary(sequenceDictionary); + /* header lines related to genotype formatting */ result.addMetaDataLine(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.Integer, "Segment genotype")); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/gcnv/GermlineCNVVariantComposer.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/gcnv/GermlineCNVVariantComposer.java index 3c2b1938de9..6e25e3a3665 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/gcnv/GermlineCNVVariantComposer.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/gcnv/GermlineCNVVariantComposer.java @@ -1,5 +1,6 @@ package org.broadinstitute.hellbender.tools.copynumber.gcnv; +import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.util.Locatable; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.VariantContext; @@ -35,9 +36,10 @@ public abstract class GermlineCNVVariantComposer { /** * Compose the header of the variant context. * + * @param sequenceDictionary sequence dictionary to use for contig header lines * @param vcfDefaultToolHeaderLines default header lines of the VCF generation tool */ - abstract void composeVariantContextHeader(final Set vcfDefaultToolHeaderLines); + abstract void composeVariantContextHeader(final SAMSequenceDictionary sequenceDictionary, final Set vcfDefaultToolHeaderLines); abstract VariantContext composeVariantContext(final DATA data); diff --git a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/PostprocessGermlineCNVCallsIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/PostprocessGermlineCNVCallsIntegrationTest.java index 0a9475db4d9..95189d41516 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/PostprocessGermlineCNVCallsIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/PostprocessGermlineCNVCallsIntegrationTest.java @@ -1,9 +1,13 @@ package org.broadinstitute.hellbender.tools.copynumber; +import htsjdk.samtools.util.FileExtensions; +import htsjdk.variant.vcf.VCFHeader; import org.broadinstitute.hellbender.CommandLineProgramTest; import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; import org.broadinstitute.hellbender.testutils.ArgumentsBuilder; import org.broadinstitute.hellbender.testutils.IntegrationTestSpec; +import org.broadinstitute.hellbender.testutils.VariantContextTestUtils; +import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -105,6 +109,10 @@ public void testDifferentValidInput(final int sampleIndex, final List modelShards) throws IOException { final File actualIntervalsOutputVCF = createTempFile("intervals-output-vcf-" + sampleIndex, ".vcf"); final File actualSegmentsOutputVCF = createTempFile("segments-output-vcf-" + sampleIndex, ".vcf"); + final File intervalsIndex = new File(actualIntervalsOutputVCF.getAbsolutePath() + FileExtensions.TRIBBLE_INDEX); + Assert.assertFalse(intervalsIndex.exists()); + final File segmentsIndex = new File(actualIntervalsOutputVCF.getAbsolutePath() + FileExtensions.TRIBBLE_INDEX); + Assert.assertFalse(segmentsIndex.exists()); final File actualDenoisedCopyRatiosOutput = createTempFile("denoised-copy-ratios-output-" + sampleIndex, ".tsv"); final File expectedIntervalsOutputVCF = INTERVALS_VCF_CORRECT_OUTPUTS.get(sampleIndex); final File expectedSegmentsOutputVCF = SEGMENTS_VCF_CORRECT_OUTPUTS.get(sampleIndex); @@ -112,8 +120,15 @@ public void testDifferentValidInput(final int sampleIndex, runToolForSingleSample(callShards, modelShards, sampleIndex, actualIntervalsOutputVCF, actualSegmentsOutputVCF, actualDenoisedCopyRatiosOutput, ALLOSOMAL_CONTIGS, AUTOSOMAL_REF_COPY_NUMBER); + + Assert.assertTrue(intervalsIndex.exists()); + Assert.assertTrue(segmentsIndex.exists()); IntegrationTestSpec.assertEqualTextFiles(actualIntervalsOutputVCF, expectedIntervalsOutputVCF, "##"); + final VCFHeader intervalsHeader = VariantContextTestUtils.getVCFHeader(actualIntervalsOutputVCF.getAbsolutePath()); + Assert.assertTrue(intervalsHeader.getContigLines().size() > 0); IntegrationTestSpec.assertEqualTextFiles(actualSegmentsOutputVCF, expectedSegmentsOutputVCF, "##"); + final VCFHeader segmentsHeader = VariantContextTestUtils.getVCFHeader(actualIntervalsOutputVCF.getAbsolutePath()); + Assert.assertTrue(segmentsHeader.getContigLines().size() > 0); IntegrationTestSpec.assertEqualTextFiles(actualDenoisedCopyRatiosOutput, expectedDenoisedCopyRatiosOutput, "##"); } diff --git a/src/testUtils/java/org/broadinstitute/hellbender/testutils/VariantContextTestUtils.java b/src/testUtils/java/org/broadinstitute/hellbender/testutils/VariantContextTestUtils.java index c0ce3028c59..b2b45238f86 100644 --- a/src/testUtils/java/org/broadinstitute/hellbender/testutils/VariantContextTestUtils.java +++ b/src/testUtils/java/org/broadinstitute/hellbender/testutils/VariantContextTestUtils.java @@ -79,6 +79,18 @@ public static Pair> readEntireVCFIntoMemory(fina } } + public static VCFHeader getVCFHeader(final String vcfPath) { + Utils.nonNull(vcfPath); + + try ( final FeatureDataSource vcfReader = new FeatureDataSource<>(vcfPath) ) { + final Object header = vcfReader.getHeader(); + if (!(header instanceof VCFHeader)) { + throw new IllegalArgumentException(vcfPath + " does not have a valid VCF header"); + } + return (VCFHeader)header; + } + } + private static void assertAttributeEquals(final String key, final Object actual, final Object expected) { final Object notationCorrectedActual = normalizeScientificNotation(actual); final Object notationCorrectedExpected = normalizeScientificNotation(expected);