-
Notifications
You must be signed in to change notification settings - Fork 442
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Finer control of --regions vs --targets overlap
This is to address a long-standing design flaw in handling regions and targets, as described in these BCFtools issues: samtools/bcftools#1420 samtools/bcftools#1421 HTSlib (and BCFtools) recognize two sets of behaviors / options for restricting VCF/BCF files by region: one is for streaming (`-t/-T`) and one for index-jumping (`-r/-R`). They behave differently: the first includes only records with the POS coordinate within the regions, the other also includes records that overlap the regions. This commit allows the default behavior to be modified and provides three options: - Include only records with POS starting in the regions/targets - Include VCF records that overlap regions/targets, even if POS itself is outside the regions - Include only VCF records where the true variation overlaps regions/targets, e.g. consider the difference between `TC>T-` and `C>-` Most importantly, this makes it possible for regions and targets to behave the same way. Note that the default behavior remains unchanged.
- Loading branch information
Showing
2 changed files
with
107 additions
and
24 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
/// @file htslib/synced_bcf_reader.h | ||
/// Stream through multiple VCF files. | ||
/* | ||
Copyright (C) 2012-2017, 2019-2020 Genome Research Ltd. | ||
Copyright (C) 2012-2017, 2019-2021 Genome Research Ltd. | ||
Author: Petr Danecek <[email protected]> | ||
|
@@ -96,7 +96,9 @@ typedef enum | |
{ | ||
BCF_SR_REQUIRE_IDX, | ||
BCF_SR_PAIR_LOGIC, // combination of the PAIR_* values above | ||
BCF_SR_ALLOW_NO_IDX // allow to proceed even if required index is not present (at the user's risk) | ||
BCF_SR_ALLOW_NO_IDX, // allow to proceed even if required index is not present (at the user's risk) | ||
BCF_SR_REGIONS_OVERLAP, // include overlapping records with POS outside the regions: 0=no, 1=VCF line overlap, 2=true variant overlap [1] | ||
BCF_SR_TARGETS_OVERLAP // include overlapping records with POS outside the targets: 0=no, 1=VCF line overlap, 2=true variant overlap [0] | ||
} | ||
bcf_sr_opt_t; | ||
|
||
|
@@ -110,7 +112,8 @@ typedef struct bcf_sr_regions_t | |
kstring_t line; // holder of the current line, set only when reading from tabix-indexed files | ||
htsFile *file; | ||
char *fname; | ||
int is_bin; // is open in binary mode (tabix access) | ||
int is_bin:30, // is open in binary mode (tabix access) | ||
overlap:2; // see BCF_SR_REGIONS_OVERLAP/BCF_SR_TARGETS_OVERLAP | ||
char **als; // parsed alleles if targets_als set and _regions_match_alleles called | ||
kstring_t als_str; // block of parsed alleles | ||
int nals, mals; // number of set alleles and the size of allocated array | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters