Skip to content

Commit

Permalink
A more compact and useful representation for unphased data (#13)
Browse files Browse the repository at this point in the history
For unphased data, each variant stores the number of copies that it
represents (1...ploidy). E.g., for diploid data you'll have potentially
two copies of each variant, one with num_copies=1 and one with
num_copies=2 (homozygotes).

Included is an example that shows how to calculate ROH from this
format.
  • Loading branch information
dcdehaas authored Dec 10, 2024
1 parent f96cc92 commit 2342527
Show file tree
Hide file tree
Showing 3 changed files with 209 additions and 27 deletions.
41 changes: 41 additions & 0 deletions examples/igdroh.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Example demonstrating runs-of-homozygosity using unphased IGD data.
import pyigd
import sys

# 500kbp
THRESHOLD = 500_000

if len(sys.argv) < 2:
print("Pass in an IGD filename")
exit(1)

with open(sys.argv[1], "rb") as f:
igd_file = pyigd.IGDReader(f)
assert not igd_file.is_phased, "This example only works for unphased data"
print(f"Version: {igd_file.version}", file=sys.stderr)
print(f"Ploidy: {igd_file.ploidy}", file=sys.stderr)
print(f"Variants: {igd_file.num_variants}", file=sys.stderr)
print(f"Individuals: {igd_file.num_individuals}", file=sys.stderr)
print(f"Source: {igd_file.source}", file=sys.stderr)
print(f"Description: {igd_file.description}", file=sys.stderr)
assert igd_file.num_samples == igd_file.num_individuals

last_het_site_per_idv = [0 for _ in range(igd_file.num_individuals)]
print(f"INDIV\tSTART_BP\tEND_BP")
for variant_index in range(igd_file.num_variants):
position, flags, num_copies = igd_file.get_position_flags_copies(variant_index)
# Homozygous occurs when the number of copies of the alt allele are equal to the
# organism ploidy. We check all heterozygous cases as they "break" the ROH, so
# if we break an ROH that exceeds the threshold we emit it.
if num_copies < igd_file.ploidy:
_, is_missing, sample_list = igd_file.get_samples(variant_index)
assert not is_missing, "Missing data not handled in this example"
for indiv in sample_list:
hom_span = position - last_het_site_per_idv[indiv]
if hom_span >= THRESHOLD:
print(f"{indiv}\t{last_het_site_per_idv[indiv]+1}\t{position-1}")
last_het_site_per_idv[indiv] = position
for indiv in range(igd_file.num_individuals):
hom_span = position - last_het_site_per_idv[indiv]
if hom_span >= THRESHOLD:
print(f"{indiv}\t{last_het_site_per_idv[indiv]+1}\t{position-1}")
81 changes: 64 additions & 17 deletions pyigd/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from dataclasses import dataclass
from enum import Enum
from typing import BinaryIO, List, Tuple, Union, Optional

import struct

# Optional import. BitVector is only needed when using the APIs that return them.
Expand All @@ -27,6 +26,14 @@ class BpPosFlags(Enum):
IS_MISSING = 0x0200000000000000


# Left shift amount to get the numCopies value
BP_POS_COPY_SHIFT = 48
# Mask for getting only the numCopies value
BP_POS_COPY_MASK = 0xFF << BP_POS_COPY_SHIFT
# Mask for getting only the bp position value
BP_POS_ONLY_MASK = ~(BpPosFlags.MASK.value | BP_POS_COPY_MASK)


def _read_uint64(file_obj: BinaryIO) -> int:
return struct.unpack("Q", file_obj.read(8))[0]

Expand Down Expand Up @@ -178,9 +185,11 @@ def num_individuals(self):
@property
def num_samples(self):
"""
Number of haploid samples (same as num_individuals * ploidy).
Number of samples. For phased data, this is num_individuals * ploidy. For unphased
data this is just num_individuals. Every returned sample index (from get_samples())
will be less than num_samples.
"""
return self._num_idv * self._ploidy
return self._num_idv * self._ploidy if self.is_phased else self._num_idv

@property
def source(self):
Expand All @@ -199,6 +208,29 @@ def description(self):
def _get_var_idx_offset(self, variant_idx: int) -> int:
return self._fp_idx + (variant_idx * IGDConstants.INDEX_ENTRY_BYTES)

def get_position_flags_copies(self, variant_idx: int) -> Tuple[int, int, int]:
"""
Given a variant index between 0...(num_variants-1), return the tuple (position, flags, num_copies).
Much faster than `get_samples` or `get_samples_bv` because it only scans the variant index
and does not read the actual genotype data.
:param variant_idx: Variant index between 0...(num_variants-1). Variants are ordered from
smallest to largest base-pair position.
:return: The tuple (position, flags, num_copies) where position is the base-pair position
(integer), flags is an integer that can be bitwise ANDed with BpPosFlags values, and
num_copies is an integer indicating how many copies of the alternate allele this
variant represents (for unphased data only).
"""
self.file_obj.seek(self._get_var_idx_offset(variant_idx))
(bpp,) = struct.unpack(
"Q", self.file_obj.read(IGDConstants.INDEX_ENTRY_BYTES // 2)
)
flags = bpp & BpPosFlags.MASK.value
copies = (bpp & BP_POS_COPY_MASK) >> BP_POS_COPY_SHIFT
position = bpp & BP_POS_ONLY_MASK
return (position, flags, copies)

def get_position_and_flags(self, variant_idx: int) -> Tuple[int, int]:
"""
Given a variant index between 0...(num_variants-1), return the tuple (position, flags).
Expand All @@ -211,17 +243,21 @@ def get_position_and_flags(self, variant_idx: int) -> Tuple[int, int]:
:return: The tuple (position, flags) where position is the base-pair position (integer) and
flags is an integer that can be bitwise ANDed with BpPosFlags values.
"""
assert self.is_phased, "Use get_position_flags_copies() for unphased data"
self.file_obj.seek(self._get_var_idx_offset(variant_idx))
bpp, _ = struct.unpack("QQ", self.file_obj.read(IGDConstants.INDEX_ENTRY_BYTES))
(bpp,) = struct.unpack(
"Q", self.file_obj.read(IGDConstants.INDEX_ENTRY_BYTES // 2)
)
flags = bpp & BpPosFlags.MASK.value
position = bpp & ~BpPosFlags.MASK.value
position = bpp & BP_POS_ONLY_MASK
return (position, flags)

def get_samples(self, variant_idx: int) -> Tuple[int, bool, List[int]]:
"""
Given a variant index between 0...(num_variants-1), return the tuple (position, missing, samples)
where position is the base-pair position, missing is True when this represents a row of missing
data, and samples is a list of sample indexes that have the alt allele for the given variant.
data, samples is a list of sample indexes that have the alt allele for the given variant, and
copies is the number of copies of the alternate allele (for unphased data only).
When missing is True, the sample list contains samples that are missing the given variant.
Expand All @@ -236,7 +272,7 @@ def get_samples(self, variant_idx: int) -> Tuple[int, bool, List[int]]:
flags = bpp & BpPosFlags.MASK.value
is_missing = 0 != (flags & BpPosFlags.IS_MISSING.value)
is_sparse = 0 != (flags & BpPosFlags.SPARSE.value)
position = bpp & ~BpPosFlags.MASK.value
position = bpp & BP_POS_ONLY_MASK
self.file_obj.seek(fp_data)
if is_sparse:
sample_list = _read_u32_list(self.file_obj)
Expand Down Expand Up @@ -270,7 +306,7 @@ def get_samples_bv(self, variant_idx: int):
flags = bpp & BpPosFlags.MASK.value
is_missing = 0 != (flags & BpPosFlags.IS_MISSING.value)
is_sparse = 0 != (flags & BpPosFlags.SPARSE.value)
position = bpp & ~BpPosFlags.MASK.value
position = bpp & BP_POS_ONLY_MASK
self.file_obj.seek(fp_data)
byte_count = _div_round_up(self.num_samples, 8)
if is_sparse:
Expand Down Expand Up @@ -466,7 +502,7 @@ def __init__(
self.ref_alleles: List[str] = []
self.alt_alleles: List[str] = []
self.index: List[bytes] = []
self.num_samples = individuals * ploidy
self.num_samples = (individuals * ploidy) if phased else individuals
self.should_be_sparse = self.num_samples / sparse_threshold
# The position of the last variant that we wrote. These have to be in order.
self.last_var_position = 0
Expand All @@ -483,14 +519,14 @@ def write_header(self):

@staticmethod
def _make_index_entry(
position: int, is_missing: bool, is_sparse: bool, filepos: int
position: int, is_missing: bool, is_sparse: bool, num_copies: int, filepos: int
) -> bytes:
flags = 0
if is_sparse:
flags |= BpPosFlags.SPARSE.value
if is_missing:
flags |= BpPosFlags.IS_MISSING.value
encoded_pos = position | flags
encoded_pos = position | flags | (num_copies << BP_POS_COPY_SHIFT)
return struct.pack("QQ", encoded_pos, filepos)

def write_variant(
Expand All @@ -500,6 +536,7 @@ def write_variant(
alt_allele: str,
samples: List[int],
is_missing: bool = False,
num_copies: int = 0,
):
"""
Write the next variant, including sample information, to the file.
Expand All @@ -511,20 +548,25 @@ def write_variant(
:param samples: The list of samples, as indexes. E.g. the list [4, 10] means that samples
numbered 4 and 10 have this variant's alternate allele. This list must be in ascending
order.
:param is_missing: Set to true if the sample list represents the list of samples that are
missing allele values at this position (in which case the reference and alt allele are
somewhat irrelevant).
:param is_missing: [Optional] Set to true if the sample list represents the list of samples
that are missing allele values at this position (in which case the reference and alt
allele are somewhat irrelevant).
:param num_copies: [Optional] For unphased data, set to the number of copies of the alternate
allele that this variant represents (1...ploidy).
"""
assert (
position >= self.last_var_position
), "Out of order variant (must be written in ascending order)"
assert (
self.header.flags & IGDConstants.FLAG_IS_PHASED
) or num_copies > 0, "Unphased data must be written with num_copies specified to a non-zero value"
self.last_var_position = position
self.ref_alleles.append(ref_allele)
self.alt_alleles.append(alt_allele)
is_sparse = len(samples) <= self.should_be_sparse
filepos = self.out.tell()
self.index.append(
self._make_index_entry(position, is_missing, is_sparse, filepos)
self._make_index_entry(position, is_missing, is_sparse, num_copies, filepos)
)
if is_sparse:
_write_u32(self.out, len(samples))
Expand Down Expand Up @@ -588,7 +630,8 @@ def write_variant_ids(self, labels: List[str]):
"""
Write the identifiers for the variants (optional).
:param labels: Empty list or a list of strings, one for each variant.
:param labels: Empty list or a list of strings, one for each variant. That is, if you called
write_variant() `X` times, then there should be `X` entries in this list.
"""
if len(labels) == 0:
self.header.fp_variantids = 0
Expand Down Expand Up @@ -636,11 +679,12 @@ def transform(self):
variant_ids = self.reader.get_variant_ids()
self.writer.write_header()
for i in range(self.reader.num_variants):
_, _, num_copies = self.reader.get_position_flags_copies(i)
if self.use_bitvectors:
position, is_missing, samples = self.reader.get_samples_bv(i)
else:
position, is_missing, samples = self.reader.get_samples(i)
samples = self.modify_samples(position, is_missing, samples)
samples = self.modify_samples(position, is_missing, samples, num_copies)
if samples is None:
variant_ids[i] = None
else:
Expand All @@ -656,6 +700,8 @@ def transform(self):
self.reader.get_ref_allele(i),
self.reader.get_alt_allele(i),
samples,
is_missing,
num_copies,
)
self.writer.write_index()
self.writer.write_variant_info()
Expand All @@ -671,5 +717,6 @@ def modify_samples(
position: int,
is_missing: bool,
samples: Union["BitVector.BitVector", List[int]],
num_copies: int = 0,
) -> Optional[Union["BitVector.BitVector", List[int]]]:
raise NotImplementedError("Derived class must implement modify_samples()")
Loading

0 comments on commit 2342527

Please sign in to comment.