From 32734adc32376e0a11f90585dc2368536f8582fe Mon Sep 17 00:00:00 2001 From: Mark Walker Date: Mon, 4 May 2020 14:27:02 -0400 Subject: [PATCH] Ignore specified contigs for pre-aligned BAMs in PathSeq (#6537) --- .../pathseq/HostAlignmentReadFilter.java | 14 +++++- .../tools/spark/pathseq/PSFilter.java | 21 ++++++++- .../pathseq/PSFilterArgumentCollection.java | 13 +++++ .../pathseq/HostAlignmentReadFilterTest.java | 47 +++++++++++++++++-- 4 files changed, 89 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/pathseq/HostAlignmentReadFilter.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/pathseq/HostAlignmentReadFilter.java index c42dc71a402..ff745d282bf 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/pathseq/HostAlignmentReadFilter.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/pathseq/HostAlignmentReadFilter.java @@ -4,22 +4,34 @@ import org.broadinstitute.hellbender.engine.filters.ReadFilter; import org.broadinstitute.hellbender.utils.read.GATKRead; +import java.util.Collections; +import java.util.Set; + /** - * Filters out reads above a threshold identity (number of matches less deletions), given in bases. + * Filters out reads above a threshold identity (number of matches less deletions), given in bases. Reads mapped to + * any of the excluded contigs, if provided, will automatically pass the filter regardless of mapping identity. */ public final class HostAlignmentReadFilter extends ReadFilter { private static final long serialVersionUID = 1L; private final int minIdentity; + private final Set excludedContigs; + + public HostAlignmentReadFilter(final int minIdent, final Set excludedContigs) { + this.minIdentity = minIdent; + this.excludedContigs = excludedContigs; + } public HostAlignmentReadFilter(final int minIdent) { this.minIdentity = minIdent; + this.excludedContigs = Collections.emptySet(); } @Override public boolean test(final GATKRead read) { if ( read.isUnmapped() ) return true; + if ( excludedContigs.contains(read.getContig()) ) return true; final Integer nmVal = read.getAttributeAsInteger("NM"); if ( nmVal == null ) return true; return test(read.getCigar(), nmVal.intValue()); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/pathseq/PSFilter.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/pathseq/PSFilter.java index e5c0e86213a..ae7986b9be3 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/pathseq/PSFilter.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/pathseq/PSFilter.java @@ -3,6 +3,7 @@ import com.google.common.annotations.VisibleForTesting; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.SAMTag; import htsjdk.samtools.util.CollectionUtil; import htsjdk.samtools.util.SequenceUtil; @@ -11,6 +12,7 @@ import org.apache.spark.api.java.JavaSparkContext; import org.broadinstitute.hellbender.engine.filters.AmbiguousBaseReadFilter; import org.broadinstitute.hellbender.engine.filters.ReadLengthReadFilter; +import org.broadinstitute.hellbender.exceptions.UserException; import org.broadinstitute.hellbender.tools.spark.pathseq.loggers.PSFilterLogger; import org.broadinstitute.hellbender.tools.spark.sv.utils.SVUtils; import org.broadinstitute.hellbender.tools.spark.utils.ReadFilterSparkifier; @@ -57,6 +59,7 @@ public PSFilter(final JavaSparkContext ctx, final PSFilterArgumentCollection fil this.ctx = ctx; this.filterArgs = filterArgs; this.header = header; + validateFilterArguments(); } @VisibleForTesting @@ -170,6 +173,21 @@ static JavaRDD filterDuplicateSequences(final JavaRDD reads) }); } + /** + * Validate arguments + */ + private void validateFilterArguments() { + final SAMSequenceDictionary dictionary = header.getSequenceDictionary(); + if (filterArgs.alignedInput) { + final Set contigsToIgnoreSet = new HashSet<>(filterArgs.alignmentContigsToIgnore); + for (final String contig : contigsToIgnoreSet) { + if (dictionary.getSequence(contig) == null) { + throw new UserException.BadInput("Ignored sequence " + contig + " not found in input header."); + } + } + } + } + /** * Pairs reads with canonical Long ID using the lesser 64-bit hash of the sequence and its reverse complement */ @@ -210,7 +228,8 @@ public Tuple2, JavaRDD> doFilter(JavaRDD r filterLogger.logPrimaryReads(reads); if (filterArgs.alignedInput) { - reads = reads.filter(new ReadFilterSparkifier(new HostAlignmentReadFilter(filterArgs.minIdentity))); + final Set contigsToIgnoreSet = Collections.unmodifiableSet(new HashSet<>(filterArgs.alignmentContigsToIgnore)); + reads = reads.filter(new ReadFilterSparkifier(new HostAlignmentReadFilter(filterArgs.minIdentity, contigsToIgnoreSet))); } filterLogger.logReadsAfterPrealignedHostFilter(reads); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/pathseq/PSFilterArgumentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/pathseq/PSFilterArgumentCollection.java index ba427fbe17b..4e386aa11ab 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/pathseq/PSFilterArgumentCollection.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/pathseq/PSFilterArgumentCollection.java @@ -9,6 +9,7 @@ import org.broadinstitute.hellbender.engine.filters.ReadLengthReadFilter; import java.io.Serializable; +import java.util.ArrayList; import java.util.List; public final class PSFilterArgumentCollection implements Serializable { @@ -45,6 +46,8 @@ public final class PSFilterArgumentCollection implements Serializable { public static final String DUST_T_SCORE_SHORT_NAME = DUST_T_SCORE_LONG_NAME; public static final String HOST_MIN_IDENTITY_LONG_NAME = "host-min-identity"; public static final String HOST_MIN_IDENTITY_SHORT_NAME = HOST_MIN_IDENTITY_LONG_NAME; + public static final String IGNORE_ALIGNMENT_CONTIGS_LONG_NAME = "ignore-alignment-contigs"; + public static final String IGNORE_ALIGNMENT_CONTIGS_SHORT_NAME = IGNORE_ALIGNMENT_CONTIGS_LONG_NAME; public static final String HOST_KMER_COUNT_THRESHOLD_LONG_NAME = "host-kmer-thresh"; public static final String HOST_KMER_COUNT_THRESHOLD_SHORT_NAME = HOST_KMER_COUNT_THRESHOLD_LONG_NAME; public static final String FILTER_BWA_SEED_LENGTH_LONG_NAME = "filter-bwa-seed-length"; @@ -190,6 +193,16 @@ public final class PSFilterArgumentCollection implements Serializable { optional = true) public int minIdentity = 30; + /** + * If using --is-host-aligned, ignores alignments to these contigs (can be specified multiple times). This + * can be useful if the alignment is to a reference containing a microbe, such as chrEBV in hg38. + */ + @Argument(doc = "Host-aligned BAM contigs to ignore", + fullName = IGNORE_ALIGNMENT_CONTIGS_LONG_NAME, + shortName = IGNORE_ALIGNMENT_CONTIGS_SHORT_NAME, + optional = true) + public List alignmentContigsToIgnore = new ArrayList<>(); + /** * Controls the stringency of read filtering based on host k-mer matching. Reads with at least this many matching * k-mers in the host reference will be filtered. diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/pathseq/HostAlignmentReadFilterTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/pathseq/HostAlignmentReadFilterTest.java index 6784593aa3d..921a884a478 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/spark/pathseq/HostAlignmentReadFilterTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/pathseq/HostAlignmentReadFilterTest.java @@ -1,5 +1,6 @@ package org.broadinstitute.hellbender.tools.spark.pathseq; +import com.google.common.collect.Sets; import htsjdk.samtools.Cigar; import htsjdk.samtools.TextCigarCodec; import org.broadinstitute.hellbender.utils.read.ArtificialReadUtils; @@ -9,10 +10,12 @@ import org.testng.annotations.Test; import java.util.Arrays; +import java.util.Set; public final class HostAlignmentReadFilterTest { private static final int MIN_IDENTITY = 70; + private final String IGNORED_CONTIG = "chrIgnore"; @DataProvider(name = "alignmentData") public Object[][] getAlignmentData() { @@ -54,7 +57,6 @@ public void testMappedRead(final String cigarString, final int NM, final boolean final Cigar cigar = TextCigarCodec.decode(cigarString); final GATKRead read_in = ArtificialReadUtils.createArtificialRead(cigar); read_in.setAttribute("NM", NM); - read_in.setPosition("test_contig", 1); read_in.setPosition("pos", 1); final boolean test_i = filter.test(read_in); Assert.assertEquals(test_out, test_i); @@ -70,7 +72,45 @@ public void testUnmappedRead() { final GATKRead read_in = ArtificialReadUtils.createArtificialRead(bases, qual, "*"); read_in.setIsUnmapped(); final boolean test_i = filter.test(read_in); - Assert.assertEquals(true, test_i); + Assert.assertTrue(test_i); + } + + @Test + public void testIgnoredContig() { + final Set ignoredContigSet = Sets.newHashSet(IGNORED_CONTIG); + final HostAlignmentReadFilter filter = new HostAlignmentReadFilter(MIN_IDENTITY, ignoredContigSet); + final byte[] bases = new byte[100]; + final byte[] qual = new byte[100]; + Arrays.fill(bases, (byte) 'A'); + Arrays.fill(qual, (byte) 30); + + // Mapped fully to ignored contig + final GATKRead readA = ArtificialReadUtils.createArtificialRead(bases, qual, "100M"); + readA.setAttribute("NM", 0); + readA.setPosition(IGNORED_CONTIG, 1); + final boolean testA = filter.test(readA); + Assert.assertTrue(testA); + + // Mapped fully to non-ignored contig + final GATKRead readB = ArtificialReadUtils.createArtificialRead(bases, qual, "100M"); + readB.setAttribute("NM", 0); + readB.setPosition("chrNotIgnore", 1); + final boolean testB = filter.test(readB); + Assert.assertFalse(testB); + + // Mapped poorly to ignored contig + final GATKRead readC = ArtificialReadUtils.createArtificialRead(bases, qual, "10M90S"); + readC.setAttribute("NM", 90); + readC.setPosition(IGNORED_CONTIG, 1); + final boolean testC = filter.test(readC); + Assert.assertTrue(testC); + + // Mapped poorly to non-ignored contig + final GATKRead readD = ArtificialReadUtils.createArtificialRead(bases, qual, "10M90S"); + readD.setAttribute("NM", 90); + readD.setPosition("chrNotIgnore", 1); + final boolean testD = filter.test(readD); + Assert.assertTrue(testD); } @Test @@ -79,9 +119,8 @@ public void testMappedReadWithoutNMTag() { final HostAlignmentReadFilter filter = new HostAlignmentReadFilter(MIN_IDENTITY); final Cigar cigar = TextCigarCodec.decode(cigarString); final GATKRead read_in = ArtificialReadUtils.createArtificialRead(cigar); - read_in.setPosition("test_contig", 1); read_in.setPosition("pos", 1); final boolean test_i = filter.test(read_in); - Assert.assertEquals(true, test_i); //Don't filter out reads if there is no NM tag + Assert.assertTrue(test_i); //Don't filter out reads if there is no NM tag } }