Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

block compressed interval streams #7142

Merged
merged 6 commits into from
Jul 20, 2021
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -18,25 +18,26 @@
import org.broadinstitute.hellbender.utils.IndexUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.codecs.FeaturesHeader;
import org.broadinstitute.hellbender.utils.gcs.BucketUtils;
import org.broadinstitute.hellbender.utils.io.BlockCompressedIntervalStream.Reader;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.genomicsdb.model.GenomicsDBExportConfiguration;
import org.genomicsdb.reader.GenomicsDBFeatureReader;

import java.io.File;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.net.URI;
import java.nio.channels.SeekableByteChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;
import java.util.function.Function;

import static org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBUtils.createExportConfiguration;
import static org.broadinstitute.hellbender.utils.io.BlockCompressedIntervalStream.BCI_FILE_EXTENSION;

/**
* Enables traversals and queries over sources of Features, which are metadata associated with a location
Expand Down Expand Up @@ -305,7 +306,8 @@ public FeatureDataSource(final FeatureInput<T> featureInput, final int queryLook
* @param setNameOnCodec If true, and if this FeatureDataSource uses a NameAwareCodec, the name of the FeatureInput will be used to set the codec's name. This exists as a mechanism to store the FeatureInput name in the source field of VariantContexts
*/
public FeatureDataSource(final FeatureInput<T> featureInput, final int queryLookaheadBases, final Class<? extends Feature> targetFeatureType,
final int cloudPrefetchBuffer, final int cloudIndexPrefetchBuffer, final GenomicsDBOptions genomicsDBOptions, final boolean setNameOnCodec) {
final int cloudPrefetchBuffer, final int cloudIndexPrefetchBuffer, final GenomicsDBOptions genomicsDBOptions,
final boolean setNameOnCodec) {
Utils.validateArg(queryLookaheadBases >= 0, "Query lookahead bases must be >= 0");
this.featureInput = Utils.nonNull(featureInput, "featureInput must not be null");
if (IOUtils.isGenomicsDBPath(featureInput)) {
Expand All @@ -319,12 +321,14 @@ public FeatureDataSource(final FeatureInput<T> featureInput, final int queryLook
BucketUtils.getPrefetchingWrapper(cloudIndexPrefetchBuffer),
genomicsDBOptions, setNameOnCodec);

if (IOUtils.isGenomicsDBPath(featureInput)) {
if (IOUtils.isGenomicsDBPath(featureInput) ||
featureInput.getFeaturePath().toLowerCase().endsWith(BCI_FILE_EXTENSION)) {
//genomics db uri's have no associated index file to read from, but they do support random access
// likewise with block-compressed interval files
this.hasIndex = false;
this.supportsRandomAccess = true;
} else if (featureReader instanceof AbstractFeatureReader) {
this.hasIndex = ((AbstractFeatureReader<T, ?>) featureReader).hasIndex();
this.hasIndex = ((AbstractFeatureReader<T, ?>)featureReader).hasIndex();
this.supportsRandomAccess = hasIndex;
} else {
throw new GATKException("Found a feature input that was neither GenomicsDB or a Tribble AbstractFeatureReader. Input was " + featureInput.toString() + ".");
Expand All @@ -345,7 +349,7 @@ final void printCacheStats() {
queryCache.printCacheStatistics( getName() );
}

@SuppressWarnings("unchecked")
@SuppressWarnings({"unchecked", "rawtypes"})
private static <T extends Feature> FeatureReader<T> getFeatureReader(final FeatureInput<T> featureInput, final Class<? extends Feature> targetFeatureType,
final Function<SeekableByteChannel, SeekableByteChannel> cloudWrapper,
final Function<SeekableByteChannel, SeekableByteChannel> cloudIndexWrapper,
Expand All @@ -367,6 +371,9 @@ private static <T extends Feature> FeatureReader<T> getFeatureReader(final Featu
}
} else {
final FeatureCodec<T, ?> codec = getCodecForFeatureInput(featureInput, targetFeatureType, setNameOnCodec);
if ( featureInput.getFeaturePath().toLowerCase().endsWith(BCI_FILE_EXTENSION) ) {
return new Reader(featureInput, codec);
}
return getTribbleFeatureReader(featureInput, codec, cloudWrapper, cloudIndexWrapper);
}
}
Expand All @@ -380,7 +387,8 @@ private static <T extends Feature> FeatureReader<T> getFeatureReader(final Featu
*/
@SuppressWarnings("unchecked")
private static <T extends Feature> FeatureCodec<T, ?> getCodecForFeatureInput(final FeatureInput<T> featureInput,
final Class<? extends Feature> targetFeatureType, final boolean setNameOnCodec) {
final Class<? extends Feature> targetFeatureType,
final boolean setNameOnCodec) {
final FeatureCodec<T, ?> codec;
final Class<FeatureCodec<T, ?>> codecClass = featureInput.getFeatureCodecClass();
if (codecClass == null) {
Expand All @@ -391,6 +399,10 @@ private static <T extends Feature> FeatureReader<T> getFeatureReader(final Featu
} else {
try {
codec = codecClass.getDeclaredConstructor().newInstance();
if ( !codec.canDecode(featureInput.toPath().toAbsolutePath().toUri().toString()) ) {
throw new GATKException(codec.getClass().getSimpleName() + " cannot decode " +
featureInput.toPath().toString());
}
} catch (final InstantiationException | IllegalAccessException | NoSuchMethodException | InvocationTargetException e) {
throw new GATKException("Unable to automatically instantiate codec " + codecClass.getName());
}
Expand Down Expand Up @@ -461,7 +473,9 @@ protected static FeatureReader<VariantContext> getGenomicsDBFeatureReader(final
public SAMSequenceDictionary getSequenceDictionary() {
SAMSequenceDictionary dict = null;
final Object header = getHeader();
if (header instanceof VCFHeader) {
if ( header instanceof FeaturesHeader ) {
dict = ((FeaturesHeader)header).getDictionary();
} else if (header instanceof VCFHeader) {
dict = ((VCFHeader) header).getSequenceDictionary();
}
if (dict != null && !dict.isEmpty()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,11 @@ void addToFeatureSources(final int featureQueryLookahead, final FeatureInput<? e
featureSources.put(featureInput, new FeatureDataSource<>(featureInput, featureQueryLookahead, featureType, cloudPrefetchBuffer, cloudIndexPrefetchBuffer, genomicsDBOptions));
}

<F extends Feature> void addToFeatureSources (final FeatureInput<F> featureInput,
final FeatureDataSource<F> featureDataSource) {
featureSources.put(featureInput, featureDataSource);
}

/**
* Given a ArgumentDefinition for an argument known to be of type FeatureInput (or a Collection thereof), retrieves the type
* parameter for the FeatureInput (eg., for FeatureInput<VariantContext> or List<FeatureInput<VariantContext>>
Expand Down Expand Up @@ -375,6 +380,22 @@ public <T extends Feature> Iterator<T> getFeatureIterator(final FeatureInput<T>
return dataSource.iterator();
}

/**
* As above, but takes an optional list of intervals to examine.
* @param featureDescriptor FeatureInput to scan
* @param intervals The userIntervals to examine (may be null)
* @param <T> Feature type
* @return An iterator over the Features
*/
public <T extends Feature> Iterator<T> getFeatureIterator( final FeatureInput<T> featureDescriptor,
final List<SimpleInterval> intervals ) {
final FeatureDataSource<T> dataSource = lookupDataSource(featureDescriptor);
dataSource.setIntervalsForTraversal(intervals);
final Iterator<T> itr = dataSource.iterator();
dataSource.setIntervalsForTraversal(null);
return itr;
}

/**
* Get the header associated with a particular FeatureInput
*
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
package org.broadinstitute.hellbender.engine;

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.tribble.Feature;
import htsjdk.tribble.FeatureCodec;
import org.broadinstitute.hellbender.engine.filters.CountingReadFilter;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBOptions;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;

Expand Down Expand Up @@ -57,12 +59,18 @@ private void initializeDrivingFeatures() {
final GATKPath drivingPath = getDrivingFeaturePath();
final FeatureCodec<? extends Feature, ?> codec = FeatureManager.getCodecForFile(drivingPath.toPath());
if (isAcceptableFeatureType(codec.getFeatureType())) {
drivingFeatures = new FeatureDataSource<>(new FeatureInput<>(drivingPath), FeatureDataSource.DEFAULT_QUERY_LOOKAHEAD_BASES, null, cloudPrefetchBuffer, cloudIndexPrefetchBuffer, referenceArguments.getReferencePath());

final FeatureInput<F> drivingFeaturesInput = new FeatureInput<>(drivingPath, "drivingFeatureFile");
features.addToFeatureSources(0, drivingFeaturesInput, codec.getFeatureType(), cloudPrefetchBuffer, cloudIndexPrefetchBuffer,
referenceArguments.getReferencePath());
header = getHeaderForFeatures(drivingFeaturesInput);
final GenomicsDBOptions options = new GenomicsDBOptions(referenceArguments.getReferencePath());
final FeatureInput<F> drivingFeatureInput = new FeatureInput<>(drivingPath);
drivingFeatureInput.setFeatureCodecClass((Class<FeatureCodec<F, ?>>)codec.getClass());
drivingFeatures = new FeatureDataSource<>(drivingFeatureInput, FeatureDataSource.DEFAULT_QUERY_LOOKAHEAD_BASES, null,
cloudPrefetchBuffer, cloudIndexPrefetchBuffer, options, false);
header = drivingFeatures.getHeader();

final FeatureInput<F> featureInput = new FeatureInput<>(drivingPath, "drivingFeatureFile");
featureInput.setFeatureCodecClass((Class<FeatureCodec<F, ?>>)codec.getClass());
features.addToFeatureSources(featureInput,
new FeatureDataSource<>(featureInput, 0, null,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The third argument to this constructor should be codec.getFeatureType(), not null -- unless you changed it to null deliberately?

cloudPrefetchBuffer, cloudIndexPrefetchBuffer, options, false));
} else {
throw new UserException("File " + drivingPath.getRawInputString() + " contains features of the wrong type.");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -660,7 +660,7 @@ public SAMSequenceDictionary getBestAvailableSequenceDictionary() {
} else if (hasReads()){
return reads.getSequenceDictionary();
} else if (hasFeatures()){
final List<SAMSequenceDictionary> dictionaries = features.getVariantSequenceDictionaries();
final List<SAMSequenceDictionary> dictionaries = features.getAllSequenceDictionaries();
//If there is just one, it clearly is the best. Otherwise, none is best.
if (dictionaries.size() == 1){
return dictionaries.get(0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.SimpleNovelAdjacencyAndChimericAlignmentEvidence;
import org.broadinstitute.hellbender.tools.spark.sv.utils.CNVInputReader;
import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVIntervalTree;
import org.broadinstitute.hellbender.utils.SVIntervalTree;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVUtils;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVVCFWriter;
import org.broadinstitute.hellbender.utils.BaseUtils;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import org.broadinstitute.hellbender.tools.spark.sv.evidence.FindBreakpointEvidenceSpark;
import org.broadinstitute.hellbender.tools.spark.sv.evidence.ReadMetadata;
import org.broadinstitute.hellbender.tools.spark.sv.utils.*;
import org.broadinstitute.hellbender.utils.SVIntervalTree;
import org.broadinstitute.hellbender.utils.SequenceDictionaryUtils;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.bwa.BwaMemAlignment;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import org.broadinstitute.hellbender.tools.spark.sv.evidence.EvidenceTargetLink;
import org.broadinstitute.hellbender.tools.spark.sv.evidence.ReadMetadata;
import org.broadinstitute.hellbender.tools.spark.sv.utils.*;
import org.broadinstitute.hellbender.utils.SVInterval;
import org.broadinstitute.hellbender.utils.Utils;
import scala.Tuple2;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AlignedContig;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AssemblyContigAlignmentsRDDProcessor;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.ContigChimericAlignmentIterativeInterpreter;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVIntervalTree;
import org.broadinstitute.hellbender.utils.SVIntervalTree;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVUtils;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVVCFWriter;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import org.broadinstitute.hellbender.tools.spark.sv.evidence.EvidenceTargetLink;
import org.broadinstitute.hellbender.tools.spark.sv.evidence.ReadMetadata;
import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVInterval;
import org.broadinstitute.hellbender.utils.SVInterval;
import org.broadinstitute.hellbender.utils.SimpleInterval;

import java.util.Collections;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.CpxVariantInterpreter;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.SegmentedCpxVariantSimpleVariantExtractor;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.SimpleNovelAdjacencyInterpreter;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVIntervalTree;
import org.broadinstitute.hellbender.utils.SVIntervalTree;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVUtils;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVVCFWriter;
import org.broadinstitute.hellbender.utils.io.IOUtils;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
import org.broadinstitute.hellbender.tools.spark.sv.evidence.EvidenceTargetLink;
import org.broadinstitute.hellbender.tools.spark.sv.evidence.ReadMetadata;
import org.broadinstitute.hellbender.tools.spark.sv.utils.PairedStrandedIntervalTree;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVInterval;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVIntervalTree;
import org.broadinstitute.hellbender.utils.SVInterval;
import org.broadinstitute.hellbender.utils.SVIntervalTree;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVUtils;

import java.util.List;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
import org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigAlignmentsArgumentCollection;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.BreakpointComplications;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.NovelAdjacencyAndAltHaplotype;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVInterval;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVIntervalTree;
import org.broadinstitute.hellbender.utils.SVInterval;
import org.broadinstitute.hellbender.utils.SVIntervalTree;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVVCFReader;
import org.broadinstitute.hellbender.utils.SimpleInterval;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.*;
import htsjdk.samtools.util.SequenceUtil;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVInterval;
import org.broadinstitute.hellbender.utils.SVInterval;
import org.broadinstitute.hellbender.tools.spark.sv.utils.Strand;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Tail;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@
import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AlignmentInterval;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AssemblyContigWithFineTunedAlignments;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.StrandSwitch;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVInterval;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVIntervalTree;
import org.broadinstitute.hellbender.utils.SVInterval;
import org.broadinstitute.hellbender.utils.SVIntervalTree;
import scala.Tuple2;

import java.util.ArrayList;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import org.broadinstitute.hellbender.tools.spark.sv.discovery.SvDiscoveryInputMetaData;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AlignedContig;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AssemblyContigWithFineTunedAlignments;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVInterval;
import org.broadinstitute.hellbender.utils.SVInterval;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.read.GATKRead;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@
import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AlignmentInterval;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AssemblyContigWithFineTunedAlignments;
import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVInterval;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVIntervalTree;
import org.broadinstitute.hellbender.utils.SVInterval;
import org.broadinstitute.hellbender.utils.SVIntervalTree;
import org.broadinstitute.hellbender.utils.Utils;

import java.io.Serializable;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
import org.broadinstitute.hellbender.tools.spark.sv.discovery.SvType;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AlignmentInterval;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AssemblyContigWithFineTunedAlignments;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVInterval;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVIntervalTree;
import org.broadinstitute.hellbender.utils.SVInterval;
import org.broadinstitute.hellbender.utils.SVIntervalTree;
import scala.Tuple2;

import java.util.List;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import htsjdk.samtools.*;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVFileUtils;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVInterval;
import org.broadinstitute.hellbender.utils.SVInterval;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVUtils;
import org.broadinstitute.hellbender.utils.SequenceDictionaryUtils;
import org.broadinstitute.hellbender.utils.Utils;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

import com.google.common.annotations.VisibleForTesting;
import org.broadinstitute.hellbender.tools.spark.sv.utils.*;
import org.broadinstitute.hellbender.utils.SVInterval;
import org.broadinstitute.hellbender.utils.SVIntervalTree;
import org.broadinstitute.hellbender.utils.Utils;
import scala.Tuple2;

Expand Down
Loading