trtools/utils/tr_harmonizer.py

"""
Utilities for harmonizing tandem repeat VCF records.

Handles VCFs generated by various TR genotyping tools
"""
import enum
import re
import warnings
from typing import Any, Callable, Dict, Generator, Iterator, List, Optional, Set, Tuple, Union

import cyvcf2
import numpy as np

import trtools.utils.utils as utils
import trtools.utils.common as common

# List of supported VCF types
# TODO: add Beagle
# TODO: add support for tool version numbers

# Hint appended to the "missing mandatory info field" errors raised by the
# per-caller harmonizer functions in this module.
_beagle_error = "If this file was imputed by Beagle, did you remember to copy the info fields over?"

class VcfTypes(enum.Enum):
    """The TR genotyping tools whose VCF output tr_harmonizer understands."""

    # Enum members are required to carry a value; each one simply
    # repeats its own name.
    gangstr = "gangstr"
    advntr = "advntr"
    hipstr = "hipstr"
    eh = "eh"
    popstr = "popstr"
    longtr = "longtr"

    def __repr__(self):
        # Leave the redundant value out of the printed form,
        # e.g. <VcfTypes.gangstr>
        return '<{}.{}>'.format(type(self).__name__, self.name)

class TRDosageTypes(enum.Enum):
    """The supported methods for computing TR dosages."""

    # As required by enum, every member has a value; it repeats the name.
    bestguess = "bestguess"
    beagleap = "beagleap"
    bestguess_norm = "bestguess_norm"
    beagleap_norm = "beagleap_norm"

    def __repr__(self):
        # Print as <TRDosageTypes.name>, omitting the redundant value
        return '<{}.{}>'.format(type(self).__name__, self.name)

def _ToVCFType(vcftype: Union[str, VcfTypes]):
    """
    Convert the input to a VcfTypes enum member.

    Parameters
    ----------
    vcftype :
        Either the name of a VcfTypes member (e.g. "gangstr"), which is
        looked up, or a VcfTypes member, which is returned unchanged.

    Returns
    -------
    VcfTypes
        The enum member corresponding to the input

    Raises
    ------
    ValueError
        If vcftype is a string that does not name a VcfTypes member
    TypeError
        If vcftype is neither a string nor a VcfTypes member
    """
    if isinstance(vcftype, VcfTypes):
        return vcftype

    if not isinstance(vcftype, str):
        raise TypeError(("{} (of type {}) is not a vcftype"
                         .format(vcftype, type(vcftype))))

    if vcftype not in VcfTypes.__members__:
        raise ValueError(("{} is not an accepted TR vcf type. "
                          "Expected one of {}").format(
            vcftype, list(VcfTypes.__members__)))
    return VcfTypes[vcftype]


def MayHaveImpureRepeats(vcftype: Union[str, VcfTypes]):
    """
    Report whether alleles in a VCF of this type may contain impure repeats.

    Impure repeats are either of:

    * impurities in the underlying sequence (e.g. AAATAAAAA)
    * partial repeats (e.g. AATAATAATAA)

    A True result means the caller tries to call impure repeats. It does
    not mean that any were found, nor that every impurity present was
    identified and called.

    Parameters
    ----------
    vcftype :
        A VcfTypes member or the name of one

    Returns
    -------
    bool
      Indicates whether repeat sequences may be impure
    """
    vcftype = _ToVCFType(vcftype)
    if vcftype in (VcfTypes.hipstr, VcfTypes.longtr,
                   VcfTypes.advntr, VcfTypes.popstr):
        return True
    if vcftype in (VcfTypes.gangstr, VcfTypes.eh):
        return False

    # Future proofing only: every current VcfTypes member is handled above,
    # so this can only be reached if VcfTypes gains a new member.
    _UnexpectedTypeError(vcftype)  # pragma: no cover


def HasLengthRefGenotype(vcftype: Union[str, VcfTypes]):
    """
    Report whether this caller gives reference alleles by length.

    When True, every variant from this caller specifies its reference
    allele by length rather than by sequence, and sequences are
    fabricated according to :py:func:`trtools.utils.utils.FabricateAllele`.

    When True, :py:meth:`HasLengthAltGenotypes` is also True.

    Parameters
    ----------
    vcftype :
        A VcfTypes member or the name of one

    Returns
    -------
    bool
      Indicates whether ref alleles are specified by length
    """
    vcftype = _ToVCFType(vcftype)
    if vcftype == VcfTypes.eh:
        return True
    if vcftype in (VcfTypes.gangstr, VcfTypes.hipstr, VcfTypes.longtr,
                   VcfTypes.advntr, VcfTypes.popstr):
        return False

    # Future proofing only: every current VcfTypes member is handled above,
    # so this can only be reached if VcfTypes gains a new member.
    _UnexpectedTypeError(vcftype)  # pragma: no cover


def HasLengthAltGenotypes(vcftype: Union[str, VcfTypes]):
    """
    Report whether this caller gives alternate alleles by length.

    When True, every variant from this caller specifies its alt alleles
    by length rather than by sequence, and sequences are
    fabricated according to :py:func:`trtools.utils.utils.FabricateAllele`.

    Parameters
    ----------
    vcftype :
        A VcfTypes member or the name of one

    Returns
    -------
    bool
      Indicates whether alt alleles are specified by length
    """
    vcftype = _ToVCFType(vcftype)
    if vcftype in (VcfTypes.popstr, VcfTypes.eh):
        return True
    if vcftype in (VcfTypes.gangstr, VcfTypes.hipstr,
                   VcfTypes.longtr, VcfTypes.advntr):
        return False

    # Future proofing only: every current VcfTypes member is handled above,
    # so this can only be reached if VcfTypes gains a new member.
    _UnexpectedTypeError(vcftype)  # pragma: no cover


def _UnexpectedTypeError(vcftype: Union[str, VcfTypes]):
    """Raise a ValueError reporting that vcftype is not a handled type."""
    message = "self.vcftype is the unexpected type {}".format(vcftype)
    raise ValueError(message)


def InferVCFType(vcffile: cyvcf2.VCF, vcftype: Union[str, VcfTypes] = "auto") -> VcfTypes:
    """
    Work out which genotyping tool produced the VCF.

    The decision is made by searching the lower-cased raw header for
    markers left by each supported caller.

    Parameters
    ----------
    vcffile :
        The input VCF file
    vcftype :
        Either 'auto', or the caller the user believes produced this VCF.
        Supplying a caller resolves the ambiguity when the header matches
        the markers of more than one caller.

    Returns
    -------
    vcftype : VcfTypes
       Type of the VCF file

    Raises
    ------
    TypeError
      If the header matches no supported TR caller, or if it matches more
      than one and vcftype == 'auto', or if vcftype is not among the
      callers the header matches.
    """
    header = vcffile.raw_header.lower()
    has_command_line = 'command=' in header

    # (did the header carry this caller's marker?, caller) pairs
    header_checks = [
        (has_command_line and 'gangstr' in header, VcfTypes.gangstr),
        (has_command_line and 'hipstr' in header, VcfTypes.hipstr),
        (has_command_line and 'longtr' in header, VcfTypes.longtr),
        ('source=advntr' in header, VcfTypes.advntr),
        ('source=popstr' in header, VcfTypes.popstr),
        (re.search(r'ALT=<ID=STR\d+'.lower(), header), VcfTypes.eh),
    ]
    possible_vcf_types = set()
    for matched, caller in header_checks:
        if matched:
            possible_vcf_types.add(caller)

    if not possible_vcf_types:
        raise TypeError('Could not identify the type of this vcf')

    if vcftype != 'auto':
        # The user named a caller: accept it only if the header agrees
        user_supplied_type = _ToVCFType(vcftype)
        if user_supplied_type not in possible_vcf_types:
            raise TypeError(('Confused - this vcf looks like it could have '
                             'been any of the types: {}. But you specified: '
                             '--vcftype {} which is not one of those types.'
                             .format(possible_vcf_types, vcftype)))
        return user_supplied_type

    if len(possible_vcf_types) != 1:
        raise TypeError(('Confused - this vcf looks like it could have '
                         'been any of the types: {}. Please specify '
                         '--vcftype to choose one of '
                         'them').format(possible_vcf_types))
    return next(iter(possible_vcf_types))

def IsBeagleVCF(vcffile: cyvcf2.VCF) -> bool:
    """
    Tell whether this VCF was produced by the Beagle software (imputing STRs
    from a panel generated by a TR genotyper) rather than consisting of STRs
    called directly by a TR genotyper.

    The check looks in the lower-cased raw header for a ``##source=`` line
    whose quoted value starts with ``beagle``.

    Parameters
    ----------
    vcffile :
        The input VCF file

    Returns
    -------
    bool
        Whether this is a VCF produced by Beagle
    """
    lowered_header = vcffile.raw_header.lower()
    beagle_source = re.search('##source=(\'|")beagle', lowered_header)
    return beagle_source is not None

def HarmonizeRecord(vcftype: Union[str, VcfTypes], vcfrecord: cyvcf2.Variant):
    """
    Build a standardized TRRecord from a cyvcf2.Variant by dispatching
    to the harmonizer for the given VCF type.

    Parameters
    ----------
    vcftype :
        Type of the VCF file, as a VcfTypes member or the name of one
    vcfrecord :
        A cyvcf2.Variant Object

    Returns
    -------
    trrecord : TRRecord
        A TRRecord object built out of the input record
    """
    vcftype = _ToVCFType(vcftype)
    harmonizers = {
        VcfTypes.gangstr: _HarmonizeGangSTRRecord,
        VcfTypes.hipstr: _HarmonizeHipSTRRecord,
        # LongTR output has the same format as HipSTR output,
        # so the HipSTR harmonizer is reused for it
        VcfTypes.longtr: _HarmonizeHipSTRRecord,
        VcfTypes.advntr: _HarmonizeAdVNTRRecord,
        VcfTypes.eh: _HarmonizeEHRecord,
        VcfTypes.popstr: _HarmonizePopSTRRecord,
    }
    harmonizer = harmonizers.get(vcftype)
    if harmonizer is not None:
        return harmonizer(vcfrecord)

    # Future proofing only: every current VcfTypes member is handled above,
    # so this can only be reached if VcfTypes gains a new member.
    _UnexpectedTypeError(vcftype)  # pragma: no cover


def _HarmonizeGangSTRRecord(vcfrecord: cyvcf2.Variant):
    """
    Build a TRRecord from a cyvcf2.Variant holding GangSTR content.

    The motif is taken from the RU info field and the record has no ID.
    The quality field is 'Q' unless the record carries an IMP info field,
    in which case no quality field is set.

    Parameters
    ----------
    vcfrecord :
        A cyvcf2.Variant Object

    Returns
    -------
    TRRecord

    Raises
    ------
    TypeError
        If the RU info field is missing, or if the record carries a VID
        or VARID info field (reported as an AdVNTR or EH record
        respectively)
    """
    info = vcfrecord.INFO
    if info.get('RU') is None:
        raise TypeError(
            "Record at {}:{} is missing mandatory GangSTR info field RU. ".format(vcfrecord.CHROM, vcfrecord.POS) + _beagle_error
        )
    if info.get('VID') is not None:
        raise TypeError(
            "Trying to read an AdVNTR record as a GangSTR record {}:{}".format(vcfrecord.CHROM, vcfrecord.POS))
    if info.get('VARID') is not None:
        raise TypeError("Trying to read an EH record as a GangSTR record {}:{}".format(vcfrecord.CHROM, vcfrecord.POS))

    ref_allele = vcfrecord.REF.upper()
    raw_alts = vcfrecord.ALT
    alt_alleles = _UpperCaseAlleles(raw_alts) if raw_alts else []
    motif = info["RU"].upper()
    quality_field = 'Q' if info.get('IMP') is None else None

    return TRRecord(vcfrecord, ref_allele, alt_alleles, motif, None, quality_field)


def _HarmonizeHipSTRRecord(vcfrecord: cyvcf2.Variant):
    """
    Build a TRRecord from a cyvcf2.Variant holding HipSTR content.

    REF/ALT may carry flanking base pairs around the repeat. The START and
    END info fields are used to trim them off; the untrimmed alleles are
    passed along as full_alleles whenever any trimming was needed.

    Parameters
    ----------
    vcfrecord :
        A cyvcf2.Variant Object

    Returns
    -------
    TRRecord

    Raises
    ------
    TypeError
        If any of the START, END or PERIOD info fields is missing
    """
    info = vcfrecord.INFO
    if (info.get('START') is None
            or info.get('END') is None
            or info.get('PERIOD') is None):
        raise TypeError(
            "Record at {}:{} is missing one of the mandatory HipSTR/LongTR info fields START, END, PERIOD. ".format(vcfrecord.CHROM, vcfrecord.POS) +  _beagle_error
        )

    # Offsets of the repeat within the REF string.
    # start_offset: number of flanking bp in front of the repeat
    # neg_end_offset: zero or negative; its magnitude is the number of
    #                 flanking bp after the repeat
    pos = int(vcfrecord.POS)
    repeat_start = int(info['START'])
    start_offset = repeat_start - pos
    pos_end_offset = int(info['END']) - pos
    neg_end_offset = pos_end_offset + 1 - len(vcfrecord.REF)

    raw_alts = vcfrecord.ALT if vcfrecord.ALT else []

    # Only keep the untrimmed alleles when they differ from the trimmed ones
    if start_offset == 0 and neg_end_offset == 0:
        full_alleles = None
    else:
        full_alleles = (vcfrecord.REF.upper(), _UpperCaseAlleles(raw_alts))

    # A slice end of 0 would produce an empty string ('AAAA'[1:0] == ''),
    # so when there is no trailing flank slice to the end of the string
    # instead ('AAAA'[1:None] == 'AAAA'[1:]).
    slice_end = neg_end_offset if neg_end_offset != 0 else None
    ref_allele = vcfrecord.REF[start_offset:slice_end].upper()
    alt_alleles = [
        str(alt)[start_offset:slice_end].upper() for alt in raw_alts
    ]

    # HipSTR does not state the motif explicitly, so infer it.
    # NOTE(review): ref_allele has already had start_offset bases trimmed
    # from its front, yet it is sliced by start_offset again here - confirm
    # whether this second trim is intended.
    motif = utils.InferRepeatSequence(ref_allele[start_offset:],
                                      info["PERIOD"])

    quality_field = 'Q' if info.get('IMP') is None else None
    return TRRecord(vcfrecord,
                    ref_allele,
                    alt_alleles,
                    motif,
                    vcfrecord.ID,
                    quality_field,
                    harmonized_pos=repeat_start,
                    full_alleles=full_alleles)


def _HarmonizeAdVNTRRecord(vcfrecord: cyvcf2.Variant):
    """
    Build a TRRecord from a cyvcf2.Variant holding adVNTR content.

    The motif comes from the RU info field and the record ID from the VID
    info field. The quality field is 'ML' unless the record carries an IMP
    info field, in which case no quality field is set.

    Parameters
    ----------
    vcfrecord :
        A cyvcf2.Variant Object

    Returns
    -------
    TRRecord

    Raises
    ------
    TypeError
        If the RU or VID info field is missing
    """
    info = vcfrecord.INFO
    if info.get('RU') is None or info.get('VID') is None:
        raise TypeError(
            "Record at {}:{} is missing one of the mandatory ADVNTR info fields RU, VID. ".format(vcfrecord.CHROM, vcfrecord.POS) + _beagle_error
        )

    ref_allele = vcfrecord.REF.upper()
    raw_alts = vcfrecord.ALT
    alt_alleles = _UpperCaseAlleles(raw_alts) if raw_alts else []
    motif = info["RU"].upper()
    record_id = info["VID"]
    quality_field = 'ML' if info.get('IMP') is None else None

    return TRRecord(vcfrecord, ref_allele, alt_alleles, motif, record_id, quality_field)


# def _PHREDtoProb(phred: int) -> float:
#    """Convert PHRED score to probability
#
#    Notes
#    -----
#    Per https://en.wikipedia.org/wiki/Phred_quality_score
#    """
#    return 10**(-phred/10)


# def _ConvertPLtoQualityProb(PL: List[int]) -> float:
#    """
#    Convert a list of PHRED-scaled genotype probabilities to the
#    unscaled probability of the single most likely genotype.#
#
#    Notes
#    -----
#    PHRED scaling is not very accurate around numbers close to 1
#    unfortunately, so for PHRED score of 0, instead calculate the probability
#    by 1 - sum(probabilities of other genotypes)
#    """
#
#    max_likelihood = min(PL)
#    if max_likelihood != 0:
#        return _PHREDtoProb(max_likelihood)
#
#    sum_other_likelihoods = 0.0
#    for phred_likelihood in PL:
#        if phred_likelihood == 0:
#            continue
#        sum_other_likelihoods += _PHREDtoProb(phred_likelihood)
#    return max(_PHREDtoProb(1), 1 - sum_other_likelihoods)


def _HarmonizePopSTRRecord(vcfrecord: cyvcf2.Variant):
    """
    Turn a cyvcf2.Variant with popSTR content into a TRRecord.

    The reference allele is taken from REF as a sequence. Alt alleles are
    expected to be written as ``<number>`` and are passed to TRRecord as
    lengths (alt_allele_lengths) rather than sequences.

    Parameters
    ----------
    vcfrecord :
        A cyvcf2.Variant Object

    Returns
    -------
    TRRecord

    Raises
    ------
    TypeError
        If the Motif info field is missing, or if an alt allele is not
        wrapped in angle brackets
    """
    if vcfrecord.INFO.get('Motif') is None:
        # INFO keys are case sensitive, so name the field exactly as it
        # is looked up
        raise TypeError(
            "Record at {}:{} is missing mandatory PopSTR info field Motif".format(vcfrecord.CHROM, vcfrecord.POS)
        )
    ref_allele = vcfrecord.REF.upper()
    motif = vcfrecord.INFO["Motif"].upper()
    record_id = vcfrecord.ID

    alt_allele_lengths = []
    for alt in (vcfrecord.ALT or []):
        alt = str(alt)
        # startswith/endswith also reject an empty string cleanly,
        # where indexing would raise IndexError
        if not (alt.startswith("<") and alt.endswith(">")):
            raise TypeError("This record does not look like a PopSTR"
                            " record. Alt alleles were not formatted"
                            " as expected")
        alt_allele_lengths.append(float(alt[1:-1]))

    return TRRecord(vcfrecord,
                    ref_allele,
                    None,
                    motif,
                    record_id,
                    None,
                    alt_allele_lengths=alt_allele_lengths)


def _HarmonizeEHRecord(vcfrecord: cyvcf2.Variant):
    """
    Turn a cyvcf2.Variant with EH content into a TRRecord.

    Both the reference and alt alleles are passed to TRRecord as lengths
    (in copies of the motif) rather than sequences: the reference length is
    the RL info field divided by the motif length, and each alt allele is
    expected to be written as ``<STRn>``.

    Parameters
    ----------
    vcfrecord :
        A cyvcf2.Variant Object

    Returns
    -------
    TRRecord

    Raises
    ------
    TypeError
        If any of the VARID, RU or RL info fields is missing, or if an alt
        allele is not of the form ``<STR...>``
    """
    info = vcfrecord.INFO
    # RL is read unconditionally below, so it is checked alongside
    # VARID and RU to give the same descriptive error when absent.
    if (info.get('VARID') is None
            or info.get('RU') is None
            or info.get('RL') is None):
        raise TypeError(
            "Record at {}:{} is missing one of the mandatory ExpansionHunter info fields VARID, RU, RL. ".format(vcfrecord.CHROM, vcfrecord.POS)
            + _beagle_error
        )
    record_id = info["VARID"]
    motif = info["RU"].upper()
    # RL is divided by the motif length to get a length in motif copies
    ref_allele_length = int(info["RL"]) / len(motif)

    alt_allele_lengths = []
    for alt in (vcfrecord.ALT or []):
        alt = str(alt)
        # startswith/endswith also reject an empty string cleanly,
        # where indexing would raise IndexError
        if not (alt.startswith("<STR") and alt.endswith(">")):
            raise TypeError("This record does not look like an EH "
                            " record. Alt alleles were not formatted"
                            " as expected")
        alt_allele_lengths.append(float(alt[4:-1]))

    return TRRecord(vcfrecord, None, None, motif, record_id, None,
                    ref_allele_length=ref_allele_length,
                    alt_allele_lengths=alt_allele_lengths)


def _UpperCaseAlleles(alleles: List[str]):
    """Return a new list holding the upper-cased form of each allele string."""
    return [allele.upper() for allele in alleles]


class _Cyvcf2FormatDict():
    """
    An immutable, dict-like view of the format fields
    of a cyvcf2 record.

    Field names come from the record's FORMAT attribute and values are
    fetched through the record's format() method.
    To iterate over this dict, use :code:`iter(this)`
    or :code:`this.keys()`.
    """

    def __init__(self, record: cyvcf2.Variant):
        self.record = record

    def keys(self):
        # The names of the format fields present in the record
        return self.record.FORMAT

    def __iter__(self):
        return iter(self.keys())

    def __len__(self):
        return len(self.keys())

    def __contains__(self, key: str):
        return key in self.keys()

    def __getitem__(self, key: str):
        return self.record.format(key)

    def get(self, key: str):
        # Same lookup as indexing; there is no default argument
        return self[key]


class TRRecord:
    """
    A representation of a VCF record specialized for TR fields.

    Allows downstream functions to be agnostic to the
    genotyping tool used to create the record.

    Parameters
    ----------
    vcfrecord :
       Cyvcf2 Variant object with the underlying data
    ref_allele :
       Reference allele string
    alt_alleles :
       List of alternate allele strings
    motif :
       Repeat unit
    record_id :
       Identifier for the record
    quality_field :
        the name of the FORMAT field which contains the quality score for each
        call for this record

    Attributes
    ----------
    vcfrecord : cyvcf2.Variant
       The cyvcf2 Variant object used to init this record.
    ref_allele : str
       Reference allele sequences, fabricated if necessary.
       Gets converted to uppercase e.g. ACGACGACG
    alt_alleles : List[str]
       List of alternate allele sequences, fabricated if necessary
    motif : str
       Repeat unit
    record_id : str
       Identifier for the record
    chrom : str
        The chromosome this locus is in
    pos : int
        The bp along the chromosome that this locus is at (ignoring flanking base pairs/full alleles)
    end_pos:
        Position of the last bp of ref allele (ignoring flanking base pairs/full alleles)
    full_alleles_pos:
        Position of the first bp of the full ref allele (including the flanking base pairs)
    full_alleles_end_pos:
        Position of the last bp of the full ref allele (including the flanking base pairs)
    info : Dict[str, Any]
        The dictionary of INFO fields at this locus
    format : Dict[str, np.ndarray]
        The dictionary of FORMAT fields at this locus.
        Numeric format fields are 2D numpy arrays with rows corresponding
        to samples (normally 1 column, but if there are multiple numbers
        then more than one column)
        String format fields are 1D numpy arrays with entries corresponding
        to samples

    Other Parameters
    ----------------
    harmonized_pos :
        If this record has flanking base pairs before the repeat, set this
        to note at which bp the repeat begins
    full_alleles :
        A tuple of string genotypes (ref_allele, [alt_alleles])
        where each allele may contain any number of flanking
        basepairs in addition to containing the tandem repeat.
        If set, these can be accessed through :py:meth:`GetFullStringGenotypes`
        If the alt alleles have differently sized flanks than the ref allele
        then those alt alleles will be improperly trimmed.
    alt_allele_lengths :
        The lengths of each of the alt alleles, in order.
        This is measured in number of copies of the repeat unit,
        NOT the allele length in base pairs.

        Should be passed to the constructor when only the lengths of the alt alleles
        were measured and not the sequences. If sequences are passed to the
        constructor then this is set automatically.

        If this is passed, the alt_alleles parameter to the constructor must
        be set to None and the alt_alleles attribute of the record will be set
        to fabricated alleles (see
        :py:meth:`trtools.utils.utils.FabricateAllele`)
    ref_allele_length :
        like alt_allele_lengths, but for the reference allele.
        If this is passed, alt_allele_lengths must also be passed
    min_allele_length :
        Minimum allele length from the reference and alternate alleles
    max_allele_length :
        Maximum allele length from the reference and alternate alleles
    quality_score_transform :
        A function which turns the quality_field value into a float
        score. When None, the quality_field values are assumed
        to already be floats

    Notes
    -----
    Alleles are stored as upper case strings with all the repeats written out.
    Alleles may contain partial repeat copies or impurities.
    This class will attempt to make sure alleles do not contain any extra base
    pairs to either side of the repeat. If you wish to have those base pairs,
    use the 'Full' methods
    """

    def __init__(self,
                 vcfrecord: cyvcf2.Variant,
                 ref_allele: Optional[str],
                 alt_alleles: Optional[List[str]],
                 motif: str,
                 record_id: str,
                 quality_field: Optional[str],
                 *,
                 harmonized_pos: Optional[int] = None,
                 full_alleles: Optional[Tuple[str, List[str]]] = None,
                 ref_allele_length: Optional[float] = None,
                 alt_allele_lengths: Optional[List[float]] = None,
                 quality_score_transform: Optional[Callable[..., float]] = None):
        """
        Store the supplied fields, then derive whichever of the allele
        sequences or allele lengths was not supplied, the end positions
        and the min/max allele lengths, and finally validate the record.

        Raises
        ------
        ValueError
            If full_alleles is given without both regular alleles, if both
            alt sequences and alt lengths are given, if the ref length is
            given without alt lengths, or if the consistency check against
            the underlying record fails
        """
        # Values taken directly from the arguments / underlying record
        self.vcfrecord = vcfrecord
        self.chrom = vcfrecord.CHROM
        self.record_id = record_id
        self.motif = motif
        self.ref_allele = ref_allele
        self.alt_alleles = alt_alleles
        if harmonized_pos is None:
            self.pos = vcfrecord.POS
        else:
            self.pos = harmonized_pos
        self.full_alleles = full_alleles
        self.full_alleles_pos = self.vcfrecord.POS
        self.info = dict(vcfrecord.INFO)
        self.format = _Cyvcf2FormatDict(vcfrecord)
        self.ref_allele_length = ref_allele_length
        self.alt_allele_lengths = alt_allele_lengths
        self.quality_field = quality_field
        self.quality_score_transform = quality_score_transform

        # Reject inconsistent combinations of arguments
        if full_alleles is not None and (ref_allele is None
                                         or alt_alleles is None):
            raise ValueError("Cannot set full alleles without setting "
                             "regular alleles")

        if alt_alleles is not None and alt_allele_lengths is not None:
            raise ValueError("Must specify only the sequences or the lengths"
                             " of the alt alleles, not both.")

        if alt_allele_lengths is None and ref_allele_length is not None:
            raise ValueError("If the ref allele is specified by length, the "
                             "alt alleles must be too.")

        # Reference allele: fabricate the sequence from the length,
        # or compute the length (in motif copies) from the sequence
        self.has_fabricated_ref_allele = ref_allele_length is not None
        if self.has_fabricated_ref_allele:
            self.ref_allele = utils.FabricateAllele(motif, ref_allele_length)
        else:
            self.ref_allele_length = len(ref_allele) / len(motif)

        # self.ref_allele_length may be a float because of partial repeats,
        # which can cause float inaccuracies that a plain cast would not
        # absorb, so the end positions are rounded.
        self.end_pos = round(self.pos + self.ref_allele_length * len(motif) - 1)
        if full_alleles is None:
            self.full_alleles_end_pos = self.end_pos
        else:
            self.full_alleles_end_pos = round(
                self.full_alleles_pos + len(self.full_alleles[0]) - 1)

        # Alt alleles: same treatment as the reference allele
        self.has_fabricated_alt_alleles = alt_allele_lengths is not None
        if self.has_fabricated_alt_alleles:
            self.alt_alleles = [
                utils.FabricateAllele(motif, copies)
                for copies in alt_allele_lengths
            ]
        else:
            motif_len = len(motif)
            self.alt_allele_lengths = [
                len(sequence) / motif_len for sequence in self.alt_alleles
            ]

        # Min/max length over the reference and all alt alleles
        # (ref listed first so it wins ties, as a single-allele record
        # reduces to the ref length)
        all_lengths = [self.ref_allele_length]
        all_lengths.extend(self.alt_allele_lengths)
        self.min_allele_length = min(all_lengths)
        self.max_allele_length = max(all_lengths)

        try:
            self._CheckRecord()
        except ValueError as e:
            raise ValueError(("Invalid TRRecord. TRRecord: {} Original record:"
                              " {}").format(str(self), str(self.vcfrecord)), e)

    def _CheckRecord(self):
        """
        Check that this record is properly constructed.

        Checks that the same number of alt alleles were specified
        as in the underlying record and that the full_alleles, if supplied,
        contain their corresponding standard alleles

        Raises an error if a check fails
        """
        if len(self.alt_alleles) != len(self.vcfrecord.ALT):
            raise ValueError("Underlying record does not have the same "
                             "number of alt alleles as given to the TRRecord "
                             "constructor. Underlying alt alleles: {}, "
                             " constructor alt alleles: {}".format(
                self.vcfrecord.ALT, self.alt_alleles))

        if self.full_alleles:
            if len(self.full_alleles) != 2:
                raise ValueError("full_alleles doesn't have both"
                                 " a ref allele and alt alleles")
            full_ref, full_alts = self.full_alleles
            if len(full_alts) != len(self.alt_alleles):
                raise ValueError("Different number of full alternate alleles "
                                 "than normal alt alleles")
            if self.ref_allele not in full_ref:
                raise ValueError("could not find ref allele inside "
                                 "full ref allele")
            for idx, (full_alt, alt) \
                    in enumerate(zip(full_alts, self.alt_alleles)):
                if alt not in full_alt:
                    raise ValueError(("Could not find alt allele {} "
                                      "inside its full alt "
                                      "allele").format(idx))

    def GetMaxPloidy(self) -> int:
        """
        Return the maximum ploidy of any sample at this locus.

        All genotypes will be a tuple of that many haplotypes,
        where samples with a smaller ploidy than that
        will have haplotypes at the end of the tuple set to ','
        (for string genotypes) or -2 (for index or length genotypes)
        """
        return self.vcfrecord.ploidy

    def GetNumSamples(self) -> int:
        """
        Return the number of samples at this locus (called or not).

        Same as the number of samples in the overall vcf
        """
        return self.vcfrecord.genotype.n_samples

    def GetGenotypeIndicies(self) -> Optional[np.ndarray]:
        """
        Get an array of genotype indicies across all samples.

        A genotype index is a number 0, 1, 2 ...
        where 0 corresponds to the reference allele,
        1 to the first alt allele, 2 to the second, etc.
        The array is an array of ints with one row per sample.
        The number of columns is the maximum ploidy of any sample
        (normally 2) plus 1 for phasing.
        All but the final column represent the index of the genotypes
        of each call.
        The final column has values 0 for unphased sampels or 1 for phased.
        So a sample with gt '0|2' would be represented by the row [0, 2, 1]
        and a sample with gt '3/0' would be represented by the row [3, 0, 0].
        Uncalled haplotypes (represented by '.' in the VCF) are represented
        by '-1' genotypes. If the sample has fewer haplotypes than the
        maximum ploidy of all samples at this locus, then the row is padded
        with -2s, so a haploid sample with gt '1' where other samples
        are diploid would be represented by the row [1, -2, 0].
        If all the genotype columns for a sample are negative then the
        sample is a no call. Note: the value of the phasing
        column is unspecified for haploid or no-call samples.

        Returns
        -------
        Optional[np.ndarray]
            The numpy array described above, of type int.
            If there are no samples in the vcf this record comes from
            then return None instead
        """
        if self.vcfrecord.genotype is None:
            return None
        return self.vcfrecord.genotype.array().astype(int)

    def GetCalledSamples(self, strict: bool = True) -> Optional[np.ndarray]:
        """
        Report, sample by sample, whether a genotype was called here.

        Parameters
        ----------
        strict :
            When True (the default) a sample only counts as called if
            none of its haplotypes is a no call, so a genotype such as
            '1/.' is reported as not called.
            When False a sample counts as called as soon as at least one
            of its haplotypes is an actual allele.
            Note: haplotypes that are absent because of lower ploidy
            never turn a sample into a no call when strict = True
            (e.g. '1/2/2' counts as called even if other samples at this
            locus are tetraploid).

        Returns
        -------
        Optional[np.ndarray]
            A bool array with one entry per sample, True where the
            sample has been called and False otherwise.
            None if :py:meth:`GetGenotypeIndicies` returns None
            (no samples in the vcf).
        """
        idx_gts = self.GetGenotypeIndicies()
        if idx_gts is None:
            return None

        # The last column holds phasing, everything before it is haplotypes
        haplotypes = idx_gts[:, :-1]

        if strict:
            # called <=> not a single haplotype is a no call (-1)
            return np.all(haplotypes != -1, axis=1)

        # called <=> some haplotype is neither a no call (-1)
        # nor padding for lower ploidy (-2)
        is_real_allele = np.logical_and(haplotypes != -1, haplotypes != -2)
        return np.any(is_real_allele, axis=1)

    def GetSamplePloidies(self) -> Optional[np.ndarray]:
        """
        Compute the ploidy of every sample at this locus.

        Returns
        -------
        Optional[np.ndarray]
            An int array with one entry per sample giving the number of
            haplotypes that sample has at this locus. No calls (-1)
            are counted, the -2 padding used for samples of lower
            ploidy is not.
            None if :py:meth:`GetGenotypeIndicies` returns None
            (no samples in the vcf).
        """
        idx_gts = self.GetGenotypeIndicies()
        if idx_gts is None:
            return None

        # Ignore the trailing phasing column and count every haplotype
        # slot that is not lower-ploidy padding
        haplotypes = idx_gts[:, :-1]
        return np.sum(haplotypes != -2, axis=1)

    def GetCallRate(self, strict: bool = True) -> float:
        """
        Return the call rate at this locus.

        Parameters
        ----------
        strict :
            By default genotypes such as '1/.' are considered not called
            because at least one of the haplotypes present is not called.
            Set strict = False to mark these as being called.
            Note: genotypes having lesser ploidy will not be marked
            as no calls even when strict = True (e.g. if some samples
            have tetraploid genotypes at this locus, a genotype of '1/2/2'
            will be marked as called even though it is triploid)

        Returns
        -------
        float
            The fraction of the samples at this locus that have been
            called. If there are no samples in the vcf this record comes from
            then return np.nan instead
        """
        called_samples = self.GetCalledSamples(strict=strict)
        # No samples: the rate is undefined. Return nan (not None) so the
        # result is always a float, as annotated and documented.
        if called_samples is None or called_samples.shape[0] == 0:
            return np.nan
        return np.sum(called_samples) / called_samples.shape[0]

    def _GetStringGenotypeArray(
            self,
            idx_gts: np.ndarray,
            seq_alleles: List[str]):

        max_len = max(len(allele) for allele in seq_alleles)
        seq_array = np.empty(idx_gts.shape, dtype="<U{}".format(max_len))
        seq_array[:, -1][idx_gts[:, -1] == 0] = '0'
        seq_array[:, -1][idx_gts[:, -1] == 1] = '1'
        for allele_idx, seq_allele in enumerate(seq_alleles):
            seq_array[:, :-1][idx_gts[:, :-1] == allele_idx] = seq_allele
        seq_array[:, :-1][idx_gts[:, :-1] == -1] = '.'
        seq_array[:, :-1][idx_gts[:, :-1] == -2] = ','
        return seq_array

    def GetStringGenotypes(self) -> Optional[np.ndarray]:
        """
        Get an array of string genotypes for each sample.

        The layout matches :py:meth:`GetGenotypeIndicies`, with every
        allele index swapped for its sequence. No calls (-1) become '.',
        haplotypes missing due to smaller ploidy (-2) become ',' and
        the phasing bits (0 or 1) become the strings '0' or '1'.

        The sequences used are ref_allele and alt_alleles, which do not
        include flanking base pairs. For callers that call flanks, use
        :py:meth:`GetFullStringGenotypes` to get the sequences with
        flanks. For such callers several of the flank-stripped alleles
        may be identical, in which case
        :py:meth:`UniqueStringGenotypeMapping` gives a canonical unique
        subset of indices representing all possible alleles.

        Some TR callers only call allele lengths, not allele sequences.
        For those records the sequences returned here are fabricated
        from the called lengths (see
        :py:meth:`trtools.utils.utils.FabricateAllele`) and a warning is
        raised. That may not be what you want - use
        :py:meth:`GetLengthGenotypes` for a fully caller agnostic way
        of handling genotypes.

        This method is inefficient for many samples. Consider using
        length genotypes (:py:meth:`GetLengthGenotypes`), or genotype
        indices (:py:meth:`GetGenotypeIndicies`) together with the
        fields ref_allele and alt_alleles, instead.

        Returns
        -------
        Optional[np.ndarray]
            The numpy array described above, of type '<UN' where 'N'
            is the max allele length.
            If there are no samples in the vcf this record comes from
            then return None instead
        """
        idx_gts = self.GetGenotypeIndicies()
        if idx_gts is None:
            return None

        if self.HasFabricatedAltAlleles():
            warnings.warn("String genotypes have been requested for a"
                          " TRRecord generated by a caller which only "
                          "generates length genotypes, not string genotypes"
                          ". Returning a fabricated string genotype. Consider"
                          " requesting length based genotypes instead.")

        # Allele index 0 is the reference, alts follow in order
        seq_alleles = [self.ref_allele, *self.alt_alleles]
        return self._GetStringGenotypeArray(idx_gts, seq_alleles)

    def GetFullStringGenotypes(self) -> Optional[np.ndarray]:
        """
        Get an array of full string genotypes for each sample.
        See :py:meth:`GetStringGenotypes` for details and
        limitations of string genotypes.

        When :py:meth:`HasFullStringGenotypes` is False (the record has
        no separate full alleles) this simply returns the regular
        string genotypes.

        Returns
        -------
        Optional[np.ndarray]
            The numpy array described above, of type '<UN' where 'N'
            is the max allele length.
            If there are no samples in the vcf this record comes from
            then return None instead
        """
        if self.HasFullStringGenotypes():
            idx_gts = self.GetGenotypeIndicies()
            if idx_gts is None:
                return None

            # full_alleles is (full ref, [full alts]); build a new list
            # so the stored alt list is left untouched
            full_ref = self.full_alleles[0]
            full_alts = self.full_alleles[1]
            return self._GetStringGenotypeArray(idx_gts, [full_ref, *full_alts])

        return self.GetStringGenotypes()

    def UniqueStringGenotypeMapping(self) -> Dict[int, int]:
        """
        Get a mapping whose values are unique string genotype indices.

        Return
        ------
        Dict[int, int]
            A mapping allele idx -> allele idx with one key per allele
            (ref and every alt). The values are the subset of indices
            representing the unique regular string alleles of this
            variant.
            Records without full string genotypes get the identity
            mapping. For records with full string genotypes (flanking
            base pairs included) some regular string alleles may be
            identical. Each allele then maps to the lowest index
            carrying the same sequence.
        """
        if not self.HasFullStringGenotypes():
            num_alleles = len(self.alt_alleles) + 1
            return {idx: idx for idx in range(num_alleles)}

        first_idx_of = {}
        mapping = {}
        for idx, allele in enumerate([self.ref_allele, *self.alt_alleles]):
            # setdefault records idx the first time a sequence is seen
            # and hands back that first index on every repeat
            mapping[idx] = first_idx_of.setdefault(allele, idx)
        return mapping

    def UniqueStringGenotypes(self) -> Set[int]:
        """
        Find allele indices corresponding to the unique alleles.

        Equivalent to calling
        :code:`set(UniqueStringGenotypeMapping().values())`

        Returns
        -------
        Set[int]
            The indices of the unique string alleles
        """
        canonical_idxs = self.UniqueStringGenotypeMapping().values()
        return {*canonical_idxs}

    def GetDosages(self, 
            dosagetype: TRDosageTypes = TRDosageTypes.bestguess, strict: bool = True) -> Optional[np.ndarray]:
        """
        Get an array of genotype dosages for each sample.

        Multiple strategies are used to compute dosages:

        - bestguess - Sum of the length (in num. rpt units) of alleles
        - beagleap - For each haplotype, dosage is computed as:
            sum_a len(a) x p(a) where len(a) is the length (in rpt. units)
            of each allele a, and p(a) is the allele probability (from Beagle AP1/AP2 fields)
            The total dosage is this value summed across the two haplotypes
        - bestguess_norm - Same as bestguess but scaled to be between 0 and 2
        - beagleap_norm - Same as beagleap but scaled to be between 0 and 2

        Note: normalized dosages currently not supported for haploid calls
        Those are set to np.nan in the output dosages if using a _norm option

        Parameters
        ----------
        dosagetype : Enum
            Which TRDosageType to compute. Default bestguess
        strict : bool
            If False, output a warning but do not die on errors validating AP field
            If errors are encountered, return nan dosage values

        Returns
        -------
        dosages : np.ndarray
            A numpy array of dosages, one float per sample.
            If a validation error occurs and strict is False, an array
            of np.nan with one entry per sample.
            If no samples are in the array, return None

        Raises
        ------
        ValueError
            If dosagetype is not a supported TRDosageTypes value, or, when
            strict is True, if the AP1/AP2 fields are missing or invalid
            or the normalized dosages fall outside (-0.1, 2.1).
        """
        num_samples = self.GetNumSamples()
        if num_samples == 0:
            return None

        def _fail(error_msg):
            # Single place for the strict / non-strict error policy:
            # raise when strict, otherwise warn and return all-nan dosages
            if strict:
                raise ValueError(error_msg)
            common.WARNING(error_msg)
            return np.array([np.nan]*num_samples)

        bestguess_types = [TRDosageTypes.bestguess, TRDosageTypes.bestguess_norm]
        beagle_types = [TRDosageTypes.beagleap, TRDosageTypes.beagleap_norm]
        norm_types = [TRDosageTypes.bestguess_norm, TRDosageTypes.beagleap_norm]

        if (dosagetype in beagle_types) and \
            (("AP1" not in self.vcfrecord.FORMAT or "AP2" not in self.vcfrecord.FORMAT) or \
            (self.vcfrecord.format("AP1") is None or self.vcfrecord.format("AP2") is None)):
            return _fail("Requested Beagle dosages for record at {}:{} but AP1/AP2 fields not found.".format(self.chrom, self.pos))

        if dosagetype in bestguess_types:
            # Get length gts and replace -1 (missing) and -2 (low ploidy) with 0
            # But if normalizing set those to np.nan since unclear
            # how to normalize haploid dosages which end up negative
            # under the current method...
            # Cast to float first: np.nan cannot be stored in an integer
            # array, which is what numpy builds if all lengths are ints
            lengts = self.GetLengthGenotypes().astype(float)
            if dosagetype == TRDosageTypes.bestguess_norm:
                lengts[lengts==-1] = np.nan
                lengts[lengts==-2] = np.nan
            else:
                lengts[lengts==-1] = 0
                lengts[lengts==-2] = 0
            # Last column is phasing, not an allele
            unnorm_dosages = lengts[:,:-1].sum(axis=1).astype(np.float32)
        elif dosagetype in beagle_types:
            # Extract allele probabilities
            ap1 = self.vcfrecord.format("AP1")
            ap2 = self.vcfrecord.format("AP2")
            ap1_total = np.sum(ap1, axis=1)
            ap2_total = np.sum(ap2, axis=1)
            # AP fields only cover alt alleles; the remainder is the ref
            # probability. If neg due to rounding, cutoff at 0
            ref1 = np.clip(1-ap1_total, 0, 1)
            ref2 = np.clip(1-ap2_total, 0, 1)

            # Check AP field. allow wiggle room for rounding
            if np.any(ap1_total > 1.1) or np.any(ap2_total > 1.1):
                return _fail("{}:{} AP1 or AP2 field summing to more than 1 detected".format(self.chrom, self.pos))
            if np.any(ap1 < 0) or np.any(ap2 < 0):
                # Include the locus in the message in strict mode too,
                # like every other error raised here
                return _fail("{}:{} Negative AP1 or AP2 fields detected".format(self.chrom, self.pos))

            # Get haplotype dosages
            if len(self.alt_allele_lengths) > 0:
                max_alt_len = max(self.alt_allele_lengths)
                h1_dos = np.clip(np.dot(ap1, self.alt_allele_lengths), 0, max_alt_len)
                h2_dos = np.clip(np.dot(ap2, self.alt_allele_lengths), 0, max_alt_len)
            else:
                h1_dos = 0
                h2_dos = 0
            ref1_dos = ref1*self.ref_allele_length
            ref2_dos = ref2*self.ref_allele_length

            # Add together for final dosage
            unnorm_dosages = (h1_dos + h2_dos + ref1_dos + ref2_dos).astype(np.float32)
        else:
            raise ValueError("Unsupported dosagetype")

        if dosagetype not in norm_types:
            return unnorm_dosages

        if self.min_allele_length == self.max_allele_length:
            # Can't normalize, just set all to 0
            return np.zeros(num_samples, dtype=np.float32)

        # Normalize to be between 0 and 2
        dosages = (unnorm_dosages-2*self.min_allele_length)/(self.max_allele_length-self.min_allele_length)
        if (np.any(dosages>=2.1) or np.any(dosages<=-0.1)):
            return _fail("{}:{} Error normalizing dosages: value >=2.1 or <=-0.1 detected".format(self.chrom, self.pos))
        # Values only slightly out of range are treated as rounding noise
        return np.clip(dosages, 0, 2)
        
    def GetLengthGenotypes(self) -> Optional[np.ndarray]:
        """
        Get an array of length genotypes for each sample.

        Each allele of a sample's genotype is represented by its length,
        taken from ref_allele_length and alt_allele_lengths (number of
        repeats of the motif). Alleles including partial repeats or
        other impurities may have noninteger lengths.

        The array is as described in :py:meth:`GetGenotypeIndicies`
        except that indices are replaced by their length genotypes.
        -1s, -2s and the phasing bits are not modified.

        For records with both regular and full sequences (those with flanking
        bps), this returns the length of the regular sequences

        Returns
        -------
        Optional[np.ndarray]
            The numpy array described above. Its dtype is whatever numpy
            infers from the stored allele lengths (float when those
            lengths are floats).
            If there are no samples in the vcf this record comes from
            then return None instead
        """
        idx_gts = self.GetGenotypeIndicies()
        if idx_gts is None:
            return None

        # Lookup table indexed by allele index. The two trailing entries
        # make the negative indices -2 and -1 look themselves up, so
        # the missing-value markers survive the translation unchanged.
        length_table = [self.ref_allele_length]
        length_table.extend(self.alt_allele_lengths)
        length_table += [-2, -1]
        length_table = np.array(length_table)

        len_gts = length_table[idx_gts]
        # The phasing column was translated too; restore the raw bits
        len_gts[:, -1] = idx_gts[:, -1]
        return len_gts

    def UniqueLengthGenotypeMapping(self) -> Dict[int, int]:
        """
        Get a mapping whose values are unique length genotype indices.

        Alleles are compared by the length of their sequence strings
        (ref_allele and alt_alleles).

        Return
        ------
        genotypeMapping : Dict[int, int]
            A mapping allele idx -> allele idx with one key per allele
            (ref and every alt). The values are the subset of indices
            representing the unique length alleles of this variant.
            When several alleles share a length they all map to the
            lowest index among them.
        """
        first_idx_of_len = {}
        mapping = {}
        for idx, allele in enumerate([self.ref_allele, *self.alt_alleles]):
            # setdefault records idx the first time a length is seen
            # and hands back that first index on every repeat
            mapping[idx] = first_idx_of_len.setdefault(len(allele), idx)
        return mapping

    def UniqueLengthGenotypes(self) -> Set[int]:
        """
        Find allele indices corresponding to the unique length alleles.

        Equivalent to calling
        :code:`set(UniqueLengthGenotypeMapping().values())`

        Returns
        -------
        Set[int]
            The indices of the unique length alleles
        """
        canonical_idxs = self.UniqueLengthGenotypeMapping().values()
        return {*canonical_idxs}

    def HasFullStringGenotypes(self) -> bool:
        """
        Determine if this record has full string genotypes.

        Returns
        -------
        bool:
            True iff this record's full_alleles field is set (is not
            None). In that case :py:meth:`GetFullStringGenotypes` uses
            those alleles instead of falling back to
            :py:meth:`GetStringGenotypes`.
        """
        full_alleles = self.full_alleles
        return full_alleles is not None

    def HasFabricatedRefAllele(self) -> bool:
        """
        Determine if this record has a fabricated ref allele.

        Returns
        -------
        bool:
            The value of this record's has_fabricated_ref_allele flag
            (presumably set when ref_allele_length was passed to the
            constructor - confirm against the constructor).
        """
        is_fabricated = self.has_fabricated_ref_allele
        return is_fabricated

    def HasFabricatedAltAlleles(self) -> bool:
        """
        Determine if this record has fabricated alt alleles.

        Returns
        -------
        bool:
            The value of this record's has_fabricated_alt_alleles flag
            (presumably set when alt_allele_lengths was passed to the
            constructor - confirm against the constructor).
        """
        is_fabricated = self.has_fabricated_alt_alleles
        return is_fabricated

    def GetGenotypeCounts(
            self,
            sample_index: Optional[Any] = None,
            uselength: bool = True,
            index: bool = False,
            fullgenotypes: bool = False,
            include_nocalls: bool = False) -> Dict[tuple, int]:
        """
        Count how many samples carry each genotype of this record.

        Genotypes are unphased: the phasing column is dropped and the
        haplotypes of each sample are sorted. As a consequence, for
        samples with a lower ploidy than the max ploidy among all
        samples, the -2 placeholder haplotypes come first
        (e.g. (-2, 5) instead of (5, -2)).

        An option to respect phasing could be added in the future.

        Parameters
        ----------
        sample_index :
            Used to index the numpy array of samples. So can be a numpy array
            of sample indices, or a bool array with length of the number
            of samples, etc.
            If None, then all samples are included.
        uselength :
            If True, represent alleles as lengths
            else represent as strings
        index :
            If True, represent alleles as indexes (0 = ref, 1 = first_alt,
            etc.) instead of sequences or lengths
        fullgenotypes :
            If True, include flanking basepairs in allele representations.
            Only makes sense when explicitly stating uselength=False.
            Cannot be combined with index.
        include_nocalls:
            If False, all genotypes with one or more uncalled haplotypes
            (-1 or '.') are excluded from the returned dictionary,
            they are included if True. Genotypes with lower ploidy (-2 or ',')
            are included regardless.

        Returns
        -------
        genotype_counts: Dict[tuple, int]
            Gives the count of each genotype.
            Genotypes are represented as tuples of alleles,
            where the type of allele representation is determined by the uselength, index
            and fullgenotypes optional parameters.
            Empty if there are no samples.

        Raises
        ------
        ValueError
            If uselength and fullgenotypes are both True, or if index is
            True while uselength is False.
        """
        # TODO test these
        if uselength and fullgenotypes:
            raise ValueError("Can't specify both uselength and fullgenotypes")
        if index and not uselength:
            raise ValueError("Specified uselength=False and index at the same"
                             " time, these are mutually exclusive options")

        # Choose the allele representation and its no call marker.
        # Past the checks above, index implies uselength and
        # fullgenotypes implies not uselength.
        if index:
            gts, nocall_entry = self.GetGenotypeIndicies(), -1
        elif uselength:
            gts, nocall_entry = self.GetLengthGenotypes(), -1
        elif fullgenotypes:
            gts, nocall_entry = self.GetFullStringGenotypes(), '.'
        else:
            gts, nocall_entry = self.GetStringGenotypes(), '.'

        if gts is None:
            return {}

        # Drop phasing, then sort each row so that e.g. 1/2 and 2/1
        # are counted as the same genotype
        unphased = np.sort(gts[:, :-1], axis=1)
        if sample_index is not None:
            unphased = unphased[sample_index, :]

        genotypes, counts = np.unique(unphased, axis=0, return_counts=True)

        count_dict = {}
        for genotype, count in zip(genotypes, counts):
            genotype = tuple(genotype)
            if include_nocalls or nocall_entry not in genotype:
                count_dict[genotype] = count
        return count_dict

    def GetAlleleCounts(self,
                        sample_index: Optional[Any] = None,
                        *,
                        uselength: bool = True,
                        index: bool = False,
                        fullgenotypes: bool = False) -> Dict[Any, int]:
        """
        Count how many times each allele is called in this record.

        No calls are not counted, as it is not clear how many
        'no call alleles' a no call would contribute. Haplotypes that
        are absent because of lower ploidy are not counted either.

        Alleles that are not called in any sample are not present
        in the returned dictionary

        Parameters
        ----------
        sample_index :
            Used to index the numpy array of samples. So can be a numpy array
            of sample indices, or a bool array with length of the number
            of samples, etc.
            If None, then all samples are included.
        uselength : bool, optional
            If True, represent alleles as lengths
            else represent as strings
        index :
            If True, represent alleles as indexes (0 = ref, 1 = first_alt,
            etc.) instead of sequences or lengths
        fullgenotypes :
            If True, include flanking basepairs in allele representations
            Only makes sense when explicitly stating uselength=False.
            Cannot be combined with index.

        Returns
        -------
        allele_counts: Dict[Any, int]
            Gives the count of each allele.
            The type of allele representation is determined by the uselength, index
            and fullgenotypes optional parameters.
            Empty if there are no samples.

        Raises
        ------
        ValueError
            If uselength and fullgenotypes are both True, or if index is
            True while uselength is False.
        """
        # TODO test these
        if uselength and fullgenotypes:
            raise ValueError("Can't specify both uselength and fullgenotypes")
        if index and not uselength:
            raise ValueError("Specified uselength=False and index at the same"
                             " time, these are mutually exclusive options")

        # Choose the allele representation together with its markers
        # for no calls and for lower-ploidy padding.
        # Past the checks above, index implies uselength and
        # fullgenotypes implies not uselength.
        if index:
            gts = self.GetGenotypeIndicies()
            nocall_entry, lowploidy_entry = -1, -2
        elif uselength:
            gts = self.GetLengthGenotypes()
            nocall_entry, lowploidy_entry = -1, -2
        elif fullgenotypes:
            gts = self.GetFullStringGenotypes()
            nocall_entry, lowploidy_entry = '.', ','
        else:
            gts = self.GetStringGenotypes()
            nocall_entry, lowploidy_entry = '.', ','

        if gts is None:
            return {}

        haplotypes = gts[:, :-1]  # everything but the phasing column
        if sample_index is not None:
            haplotypes = haplotypes[sample_index, :]

        # Boolean masking flattens the array, leaving only real alleles
        is_allele = (haplotypes != nocall_entry) & (haplotypes != lowploidy_entry)
        alleles, counts = np.unique(haplotypes[is_allele], return_counts=True)
        return dict(zip(alleles, counts))

    def GetAlleleFreqs(self,
                       sample_index: Optional[Any] = None,
                       *,
                       uselength: bool = True,
                       index: bool = False,
                       fullgenotypes: bool = False) -> Dict[Any, float]:
        """
        Get the frequency of each allele called in this record.

        Frequencies are the counts from :py:meth:`GetAlleleCounts`
        divided by the sum of those counts.

        Parameters
        ----------
        sample_index :
            Used to index the numpy array of samples. So can be a numpy array
            of sample indices, or a bool array with length of the number
            of samples, etc.
            If None, then all samples are included.
        uselength :
            If True, represent alleles as lengths
            else represent as strings
        index :
            If True, represent alleles as indexes (0 = ref, 1 = first_alt,
            etc.) instead of sequences or lengths
        fullgenotypes :
            If True, include flanking basepairs in allele representations.
            Only makes sense when explicitly stating uselength=False.
            Cannot be combined with index.

        Returns
        -------
        allele_freqs: Dict[Any, float]
            Gives the frequency of each allele among called samples
            The type of allele representation is determined by the uselength, index
            and fullgenotypes optional parameters.
            Empty if no alleles were counted.
        """
        allele_counts = self.GetAlleleCounts(sample_index=sample_index,
                                             uselength=uselength,
                                             index=index,
                                             fullgenotypes=fullgenotypes)
        num_called = float(sum(allele_counts.values()))

        allele_freqs = {}
        for allele, count in allele_counts.items():
            allele_freqs[allele] = count / num_called
        return allele_freqs

    def GetMaxAllele(self,
                     sample_index: Optional[Any] = None) -> float:
        """
        Get the length of the longest allele called in a record.

        Lengths are the length alleles counted by
        :py:meth:`GetAlleleCounts` (number of repeats of the motif).
        The longest allele may have a noninteger length if it includes
        partial repeats or other impurities.

        For records with both regular and full sequences (those with flanking
        bps), this returns the length of the regular sequences

        Parameters
        ----------
        sample_index :
            Used to index the numpy array of samples. So can be a numpy array
            of sample indices, or a bool array with length of the number
            of samples, etc.
            If None, then all samples are included.

        Returns
        -------
        maxallele : float
            The maximum allele length called (in number of repeat units),
            or nan if no alleles called
        """
        # TODO should we have GetMinAllele too?
        # TODO should we have an option for grabbing
        # the index of the longest allele?
        called_lengths = self.GetAlleleCounts(uselength=True,
                                              sample_index=sample_index).keys()
        if called_lengths:
            return max(called_lengths)
        # nothing was called
        return np.nan

    def HasQualityScores(self) -> bool:
        """
        Does this TRRecord contain quality scores for each of its calls?
        If present, the meaning and reliability of these scores is
        genotyper dependent, see the doc section :ref:`Quality Scores`.

        Return
        ------
        boolean:
            True iff this record has a quality_field set and that field
            is present in this record's format.
        """
        quality_field = self.quality_field
        if quality_field is None:
            # no FORMAT field is designated as a quality score
            return False
        return quality_field in self.format

    def GetQualityScores(self) -> np.ndarray:
        """
        Get the quality scores of the calls for each sample.

        The scores are read from this record's format under
        quality_field. If a quality_score_transform is set, it is
        applied along axis 0 of the scores before they are returned.

        The meaning and reliability of these scores is genotyper
        dependent, see the doc section :ref:`Quality Scores`.

        Returns
        -------
        np.ndarray :
            An array of quality score floats, one row per sample
            Samples which were not called have the value np.nan

        Raises
        ------
        TypeError
            If :py:meth:`HasQualityScores` is False for this record.
        """
        if not self.HasQualityScores():
            raise TypeError(
                "This TRRecord does not have a corresponding quality score"
                " field"
            )

        scores = self.format[self.quality_field]
        transform = self.quality_score_transform
        if transform is not None:
            return np.apply_along_axis(transform, 0, scores)
        return scores

    def __str__(self):
        """Generate a summary of the variant described by this record."""
        record_id = self.record_id
        if record_id is None:
            record_id = "{}:{}".format(self.vcfrecord.CHROM,
                                       self.vcfrecord.POS)
        if self.HasFullStringGenotypes():
            string = "{} {} {} ".format(record_id,
                                        self.motif,
                                        self.full_alleles[0])
            string += ",".join(self.full_alleles[1])
            return string

        if self.HasFabricatedRefAllele():
            string = "{} {} n_reps:{} ".format(record_id,
                                               self.motif,
                                               self.ref_allele_length)
        else:
            string = "{} {} {} ".format(record_id,
                                        self.motif,
                                        self.ref_allele)

        if len(self.alt_alleles) == 0:
            string += '.'
        elif self.HasFabricatedAltAlleles():
            string += ",".join("n_reps:" + str(length) for length
                               in self.alt_allele_lengths)
        else:
            string += ','.join(self.alt_alleles)

        return string


class TRRecordHarmonizer:
    """
    Class producing a uniform interface for accessing TR VCF records.

    Produces the same output interface regardless of the
    tool that created the input VCF.

    The main purpose of this class is to infer which tool
    a VCF came from, and appropriately convert its records
    to TRRecord objects.

    This class provides the object oriented paradigm for iterating
    through a TR vcf. If you wish to use the functional paradigm and
    provide the cyvcf2.Variant objects yourself, use the top-level
    functions in this module.

    Parameters
    ----------
    vcffile : cyvcf2.VCF instance
    vcftype : {'auto', 'gangstr', 'advntr', 'hipstr', 'eh', 'popstr', 'longtr'}, optional
       Type of the VCF file. Default='auto'.
       If vcftype=='auto', attempts to infer the type.

    Attributes
    ----------
    vcffile : cyvcf2.VCF instance
    vcftype : enum
       Type of the VCF file. Must be included in VcfTypes

    Raises
    ------
    TypeError
        If the type of the VCF cannot be properly inferred.
        See :py:meth:`InferVCFType` for more details.
    """

    def __init__(self, vcffile: cyvcf2.VCF, vcftype: Union[str, VcfTypes] = "auto"):
        self.vcffile = vcffile
        self.vcftype = InferVCFType(vcffile, vcftype)
        # 1-based index of the record most recently requested through
        # __next__; stays None until iteration starts. Only used to
        # point at the offending record in parse error messages.
        self._record_idx = None

    def MayHaveImpureRepeats(self) -> bool:
        """
        Determine if any of the alleles in this VCF may contain impure repeats.

        See Also
        --------
        tr_harmonizer.MayHaveImpureRepeats
        """
        return MayHaveImpureRepeats(self.vcftype)

    def HasLengthRefGenotype(self) -> bool:
        """
        Determine if the reference alleles of variants are given by length.

        See Also
        --------
        tr_harmonizer.HasLengthRefGenotype
        """
        return HasLengthRefGenotype(self.vcftype)

    def HasLengthAltGenotypes(self) -> bool:
        """
        Determine if the alt alleles of variants are given by length.

        See Also
        --------
        tr_harmonizer.HasLengthAltGenotypes
        """
        return HasLengthAltGenotypes(self.vcftype)

    def HasQualityScore(self) -> bool:
        """
        Does this VCF contain quality scores for each of its calls?
        If present, the meaning and reliability of these scores is
        genotyper dependent, see the doc section :ref:`Quality Scores`.

        Returns
        -------
        bool
            Whether or not a FORMAT field that could be interpreted as a
            quality score has been identified
        """
        if self.vcftype == VcfTypes.gangstr:
            # For GangSTR the answer depends on whether the header
            # declares the Q FORMAT field
            return 'FORMAT=<ID=Q,' in self.vcffile.raw_header
        if self.vcftype == VcfTypes.hipstr:
            return not self.IsBeagleVCF()
        if self.vcftype == VcfTypes.longtr:
            return not self.IsBeagleVCF()
        if self.vcftype == VcfTypes.advntr:
            return not self.IsBeagleVCF()
        if self.vcftype == VcfTypes.popstr:
            return False
        if self.vcftype == VcfTypes.eh:
            return False

        # Can't cover this line because it is future proofing.
        # (It explicitly is not reachable now,
        # would only be reachable if VcfTypes is expanded in the future)
        _UnexpectedTypeError(self.vcftype)  # pragma: no cover

    def IsBeagleVCF(self) -> bool:
        """
        Is this a VCF produced by running the Beagle software to impute STRs from a panel generated by an TR genotyper?

        See Also
        --------
        tr_harmonizer.IsBeagleVCF
        """
        return IsBeagleVCF(self.vcffile)

    def __iter__(self) -> Iterator[TRRecord]:
        """Iterate over TRRecords produced from the underlying vcf."""
        return self

    def __next__(self) -> TRRecord:
        """
        Return the next record of the underlying vcf as a TRRecord.

        Raises
        ------
        StopIteration
            When the underlying vcf has no more records.
        ValueError
            If the underlying vcf raises any other exception while
            producing the next record. The message contains the
            1-based index of the record that could not be parsed, and
            the original exception is attached as the cause.
        """
        if self._record_idx is None:
            # Start from zero so that the increment below makes the
            # first record number 1 (starting from one would report
            # every failing record one position too late).
            self._record_idx = 0
        self._record_idx += 1
        try:
            record = next(self.vcffile)
        except StopIteration:
            # Normal end of iteration, must not be masked as a ValueError
            raise
        except Exception as err:
            raise ValueError(
                "Unable to parse the "+str(self._record_idx)+"th tandem "
                "repeat in the provided VCF. Check that it is properly formatted."
            ) from err
        return HarmonizeRecord(self.vcftype, record)

# TODO check all users of this class for new options