Skip to content

Commit

Permalink
Ignore specified contigs for pre-aligned BAMs in PathSeq (#6537)
Browse files Browse the repository at this point in the history
  • Loading branch information
mwalker174 authored and jonn-smith committed Jul 14, 2020
1 parent f98d873 commit 32734ad
Show file tree
Hide file tree
Showing 4 changed files with 89 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -4,22 +4,34 @@
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.utils.read.GATKRead;

import java.util.Collections;
import java.util.Set;

/**
* Filters out reads above a threshold identity (number of matches less deletions), given in bases.
* Filters out reads above a threshold identity (number of matches less deletions), given in bases. Reads mapped to
* any of the excluded contigs, if provided, will automatically pass the filter regardless of mapping identity.
*/
public final class HostAlignmentReadFilter extends ReadFilter {

private static final long serialVersionUID = 1L;

private final int minIdentity;
private final Set<String> excludedContigs;

public HostAlignmentReadFilter(final int minIdent, final Set<String> excludedContigs) {
this.minIdentity = minIdent;
this.excludedContigs = excludedContigs;
}

public HostAlignmentReadFilter(final int minIdent) {
this.minIdentity = minIdent;
this.excludedContigs = Collections.emptySet();
}

@Override
public boolean test(final GATKRead read) {
if ( read.isUnmapped() ) return true;
if ( excludedContigs.contains(read.getContig()) ) return true;
final Integer nmVal = read.getAttributeAsInteger("NM");
if ( nmVal == null ) return true;
return test(read.getCigar(), nmVal.intValue());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMTag;
import htsjdk.samtools.util.CollectionUtil;
import htsjdk.samtools.util.SequenceUtil;
Expand All @@ -11,6 +12,7 @@
import org.apache.spark.api.java.JavaSparkContext;
import org.broadinstitute.hellbender.engine.filters.AmbiguousBaseReadFilter;
import org.broadinstitute.hellbender.engine.filters.ReadLengthReadFilter;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.spark.pathseq.loggers.PSFilterLogger;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVUtils;
import org.broadinstitute.hellbender.tools.spark.utils.ReadFilterSparkifier;
Expand Down Expand Up @@ -57,6 +59,7 @@ public PSFilter(final JavaSparkContext ctx, final PSFilterArgumentCollection fil
this.ctx = ctx;
this.filterArgs = filterArgs;
this.header = header;
validateFilterArguments();
}

@VisibleForTesting
Expand Down Expand Up @@ -170,6 +173,21 @@ static JavaRDD<GATKRead> filterDuplicateSequences(final JavaRDD<GATKRead> reads)
});
}

/**
* Validate arguments
*/
private void validateFilterArguments() {
final SAMSequenceDictionary dictionary = header.getSequenceDictionary();
if (filterArgs.alignedInput) {
final Set<String> contigsToIgnoreSet = new HashSet<>(filterArgs.alignmentContigsToIgnore);
for (final String contig : contigsToIgnoreSet) {
if (dictionary.getSequence(contig) == null) {
throw new UserException.BadInput("Ignored sequence " + contig + " not found in input header.");
}
}
}
}

/**
* Pairs reads with canonical Long ID using the lesser 64-bit hash of the sequence and its reverse complement
*/
Expand Down Expand Up @@ -210,7 +228,8 @@ public Tuple2<JavaRDD<GATKRead>, JavaRDD<GATKRead>> doFilter(JavaRDD<GATKRead> r
filterLogger.logPrimaryReads(reads);

if (filterArgs.alignedInput) {
reads = reads.filter(new ReadFilterSparkifier(new HostAlignmentReadFilter(filterArgs.minIdentity)));
final Set<String> contigsToIgnoreSet = Collections.unmodifiableSet(new HashSet<>(filterArgs.alignmentContigsToIgnore));
reads = reads.filter(new ReadFilterSparkifier(new HostAlignmentReadFilter(filterArgs.minIdentity, contigsToIgnoreSet)));
}
filterLogger.logReadsAfterPrealignedHostFilter(reads);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import org.broadinstitute.hellbender.engine.filters.ReadLengthReadFilter;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;

public final class PSFilterArgumentCollection implements Serializable {
Expand Down Expand Up @@ -45,6 +46,8 @@ public final class PSFilterArgumentCollection implements Serializable {
public static final String DUST_T_SCORE_SHORT_NAME = DUST_T_SCORE_LONG_NAME;
public static final String HOST_MIN_IDENTITY_LONG_NAME = "host-min-identity";
public static final String HOST_MIN_IDENTITY_SHORT_NAME = HOST_MIN_IDENTITY_LONG_NAME;
public static final String IGNORE_ALIGNMENT_CONTIGS_LONG_NAME = "ignore-alignment-contigs";
public static final String IGNORE_ALIGNMENT_CONTIGS_SHORT_NAME = IGNORE_ALIGNMENT_CONTIGS_LONG_NAME;
public static final String HOST_KMER_COUNT_THRESHOLD_LONG_NAME = "host-kmer-thresh";
public static final String HOST_KMER_COUNT_THRESHOLD_SHORT_NAME = HOST_KMER_COUNT_THRESHOLD_LONG_NAME;
public static final String FILTER_BWA_SEED_LENGTH_LONG_NAME = "filter-bwa-seed-length";
Expand Down Expand Up @@ -190,6 +193,16 @@ public final class PSFilterArgumentCollection implements Serializable {
optional = true)
public int minIdentity = 30;

/**
* If using --is-host-aligned, ignores alignments to these contigs (can be specified multiple times). This
* can be useful if the alignment is to a reference containing a microbe, such as chrEBV in hg38.
*/
@Argument(doc = "Host-aligned BAM contigs to ignore",
fullName = IGNORE_ALIGNMENT_CONTIGS_LONG_NAME,
shortName = IGNORE_ALIGNMENT_CONTIGS_SHORT_NAME,
optional = true)
public List<String> alignmentContigsToIgnore = new ArrayList<>();

/**
* Controls the stringency of read filtering based on host k-mer matching. Reads with at least this many matching
* k-mers in the host reference will be filtered.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.broadinstitute.hellbender.tools.spark.pathseq;

import com.google.common.collect.Sets;
import htsjdk.samtools.Cigar;
import htsjdk.samtools.TextCigarCodec;
import org.broadinstitute.hellbender.utils.read.ArtificialReadUtils;
Expand All @@ -9,10 +10,12 @@
import org.testng.annotations.Test;

import java.util.Arrays;
import java.util.Set;

public final class HostAlignmentReadFilterTest {

private static final int MIN_IDENTITY = 70;
private final String IGNORED_CONTIG = "chrIgnore";

@DataProvider(name = "alignmentData")
public Object[][] getAlignmentData() {
Expand Down Expand Up @@ -54,7 +57,6 @@ public void testMappedRead(final String cigarString, final int NM, final boolean
final Cigar cigar = TextCigarCodec.decode(cigarString);
final GATKRead read_in = ArtificialReadUtils.createArtificialRead(cigar);
read_in.setAttribute("NM", NM);
read_in.setPosition("test_contig", 1);
read_in.setPosition("pos", 1);
final boolean test_i = filter.test(read_in);
Assert.assertEquals(test_out, test_i);
Expand All @@ -70,7 +72,45 @@ public void testUnmappedRead() {
final GATKRead read_in = ArtificialReadUtils.createArtificialRead(bases, qual, "*");
read_in.setIsUnmapped();
final boolean test_i = filter.test(read_in);
Assert.assertEquals(true, test_i);
Assert.assertTrue(test_i);
}

@Test
public void testIgnoredContig() {
final Set<String> ignoredContigSet = Sets.newHashSet(IGNORED_CONTIG);
final HostAlignmentReadFilter filter = new HostAlignmentReadFilter(MIN_IDENTITY, ignoredContigSet);
final byte[] bases = new byte[100];
final byte[] qual = new byte[100];
Arrays.fill(bases, (byte) 'A');
Arrays.fill(qual, (byte) 30);

// Mapped fully to ignored contig
final GATKRead readA = ArtificialReadUtils.createArtificialRead(bases, qual, "100M");
readA.setAttribute("NM", 0);
readA.setPosition(IGNORED_CONTIG, 1);
final boolean testA = filter.test(readA);
Assert.assertTrue(testA);

// Mapped fully to non-ignored contig
final GATKRead readB = ArtificialReadUtils.createArtificialRead(bases, qual, "100M");
readB.setAttribute("NM", 0);
readB.setPosition("chrNotIgnore", 1);
final boolean testB = filter.test(readB);
Assert.assertFalse(testB);

// Mapped poorly to ignored contig
final GATKRead readC = ArtificialReadUtils.createArtificialRead(bases, qual, "10M90S");
readC.setAttribute("NM", 90);
readC.setPosition(IGNORED_CONTIG, 1);
final boolean testC = filter.test(readC);
Assert.assertTrue(testC);

// Mapped poorly to non-ignored contig
final GATKRead readD = ArtificialReadUtils.createArtificialRead(bases, qual, "10M90S");
readD.setAttribute("NM", 90);
readD.setPosition("chrNotIgnore", 1);
final boolean testD = filter.test(readD);
Assert.assertTrue(testD);
}

@Test
Expand All @@ -79,9 +119,8 @@ public void testMappedReadWithoutNMTag() {
final HostAlignmentReadFilter filter = new HostAlignmentReadFilter(MIN_IDENTITY);
final Cigar cigar = TextCigarCodec.decode(cigarString);
final GATKRead read_in = ArtificialReadUtils.createArtificialRead(cigar);
read_in.setPosition("test_contig", 1);
read_in.setPosition("pos", 1);
final boolean test_i = filter.test(read_in);
Assert.assertEquals(true, test_i); //Don't filter out reads if there is no NM tag
Assert.assertTrue(test_i); //Don't filter out reads if there is no NM tag
}
}

0 comments on commit 32734ad

Please sign in to comment.