Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Trim intervals loaded from interval files. #6375

Merged
merged 3 commits into from
Feb 10, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -292,12 +292,13 @@ private void vglHelper(final String msg) {
* Performs interval-style validation:
*
* contig is valid; start and stop less than the end; start <= stop, and start/stop are on the contig
* @param str the string to parse
* @param rawStr the string to parse
*
* @return a GenomeLoc representing the String
*
*/
public GenomeLoc parseGenomeLoc(final String str) {
public GenomeLoc parseGenomeLoc(final String rawStr) {
final String str = rawStr.trim();
try {
if ( isUnmappedGenomeLocString(str) ) {
return GenomeLoc.UNMAPPED;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -403,8 +403,9 @@ else if ( glParser.isValidGenomeLoc(interval.getContig(), interval.getStart(), i
} else {
try (PathLineIterator reader = new PathLineIterator(inputPath)) {
for (final String line : reader) {
if (!line.trim().isEmpty()) {
ret.add(glParser.parseGenomeLoc(line));
final String trimmedLine = line.trim();
if (!trimmedLine.isEmpty()) {
ret.add(glParser.parseGenomeLoc(trimmedLine));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is at least one other call to parseGenomeLoc that are still not trimmed. See line 332. I think it should also be trimmed or the trimming should just be pushed down into parseGenomeLoc as well to be sure that it always happens.

Copy link
Collaborator Author

@cmnbroad cmnbroad Jan 30, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added trim call to GenomeParserLoc, with a test.

}
}
}
Expand Down
18 changes: 18 additions & 0 deletions src/main/java/org/broadinstitute/hellbender/utils/io/IOUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import java.nio.file.*;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
Expand Down Expand Up @@ -151,6 +152,23 @@ public static File writeTempFile(String content, String prefix, String suffix, F
}
}

/**
* Writes multiple lines of content to a temp file and returns the temporary file.
* @param prefix prefix to use for the temp file name
* @param suffix extension to use for the temp file
* @param content List<String> Strings that will be written to the file as separate lines
* @return temporary File that will be deleted on exit
*/
public static File writeTempFile(final List<String> content, final String prefix, final String suffix) {
try {
final File tempFile = createTempFile(prefix, suffix);
FileUtils.writeLines(tempFile, content);
return tempFile;
} catch (IOException e) {
throw new UserException.BadTempDir(e.getMessage(), e);
}
}

/**
* Returns true if the file is a special file.
* @param file File path to check.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,13 @@ public void testParseGoodString() {
assertEquals(loc.getStart(), 1);
}

@Test
public void testStringIsTrimmed() {
final String locString = "1:1-10";
final GenomeLoc loc = genomeLocParser.parseGenomeLoc(" " + "1:1-10" + " ");
Assert.assertEquals(loc.toString(), locString);
}

@Test
public void testCreateGenomeLoc1() {
GenomeLoc loc = genomeLocParser.createGenomeLoc("1", 1, 100);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
import htsjdk.tribble.SimpleFeature;
import htsjdk.variant.utils.SAMSequenceDictionaryExtractor;
import htsjdk.variant.vcf.VCFFileReader;
import org.apache.commons.io.FileUtils;
import org.broadinstitute.barclay.argparser.CommandLineException;
import org.broadinstitute.hellbender.GATKBaseTest;
import org.broadinstitute.hellbender.exceptions.UserException;
Expand Down Expand Up @@ -1120,12 +1119,35 @@ public void testUnmergedIntervals(String unmergedIntervals) {
Assert.assertEquals(merged.size(), 1);
}

@Test
public void testIntervalFileIntervalsAreTrimmed() throws IOException {
final GenomeLoc expectedGenomeLoc = hg19GenomeLocParser.createGenomeLoc("1", 1, 10);
final File gatkIntervalFile = IOUtils.writeTempFile(
Arrays.asList(
String.format(
// add some spaces around the start/end to test that it gets trimmed down to something valid
" %s:%d-%d ",
expectedGenomeLoc.getContig(),
expectedGenomeLoc.getStart(),
expectedGenomeLoc.getEnd())),
"testIntervalFileIntervalsAreTrimmed", ".intervals.list");

final List<String> intervalArgs = new ArrayList<>(1);
intervalArgs.add(gatkIntervalFile.getAbsolutePath());

final GenomeLocSortedSet locs = IntervalUtils.loadIntervals(
intervalArgs, IntervalSetRule.UNION, IntervalMergingRule.ALL, 0, hg19GenomeLocParser);
Assert.assertEquals(locs.size(), 1);
Assert.assertEquals(locs.toList().get(0), expectedGenomeLoc);
}

@Test(expectedExceptions=UserException.MalformedGenomeLoc.class, dataProvider="invalidIntervalTestData")
public void testInvalidGATKFileIntervalHandling(GenomeLocParser genomeLocParser,
String contig, int intervalStart, int intervalEnd ) throws Exception {

File gatkIntervalFile = createTempFile("testInvalidGATKFileIntervalHandling", ".intervals",
String.format("%s:%d-%d", contig, intervalStart, intervalEnd));
File gatkIntervalFile = IOUtils.writeTempFile(
Arrays.asList(String.format("%s:%d-%d", contig, intervalStart, intervalEnd)),
"testInvalidGATKFileIntervalHandling", ".intervals");

List<String> intervalArgs = new ArrayList<>(1);
intervalArgs.add(gatkIntervalFile.getAbsolutePath());
Expand Down Expand Up @@ -1220,12 +1242,6 @@ public Object[][] negativeOneLengthIntervalDataProvider() throws Exception {
};
}

private File createTempFile( String tempFilePrefix, String tempFileExtension, String... lines ) throws Exception {
File tempFile = GATKBaseTest.createTempFile(tempFilePrefix, tempFileExtension);
FileUtils.writeLines(tempFile, Arrays.asList(lines));
return tempFile;
}

@DataProvider(name = "sortAndMergeIntervals")
public Object[][] getSortAndMergeIntervals() {
return new Object[][] {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import java.nio.file.attribute.BasicFileAttributes;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;

public final class IOUtilsUnitTest extends GATKBaseTest {

Expand Down Expand Up @@ -594,6 +595,26 @@ public void testCreateTempFileInDirectory() {
"file was not written to temp file: " + tempFile + " in dir: " + tempDir);
}

@Test
public void testCreateTempFileWithContent() throws IOException {
final String prefix = "prefix";
final String suffix = ".txt";
final List<String> inputStrings = Arrays.asList("first line", "second line");

final File tempFile = IOUtils.writeTempFile(inputStrings,prefix, suffix);

Assert.assertTrue(tempFile.exists(), "file was not written to temp file: " + tempFile);
Assert.assertTrue(tempFile.getName().startsWith("prefix"));
Assert.assertTrue(tempFile.getName().endsWith(".txt"));

final List<String> actualStrings = new ArrayList<>();
try (Stream<String> lines = Files.lines( tempFile.toPath())) {
lines.forEach(actualStrings::add);
}

Assert.assertEquals(actualStrings, inputStrings);
}

@DataProvider
public Object[][] tmpPathDirs() throws Exception {
return new Object[][] {
Expand Down