Skip to content

Commit

Permalink
Updated GencodeGtfCodec to be more permissive.
Browse files Browse the repository at this point in the history
- Now the GencodeGtfCodec no longer parses transcriptType and geneType
into enums.  They are now stored as strings.  This allows for arbitrary
values in these fields and will help to future-proof (and species-proof)
  the GTF parser.

- Fixes #7054
  • Loading branch information
jonn-smith committed Mar 26, 2021
1 parent 02b2f55 commit 8813eab
Show file tree
Hide file tree
Showing 12 changed files with 598 additions and 591 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -246,8 +246,8 @@ public ComparatorByProteinCodingStatus(){}
@Override
public int compare( final GencodeFuncotation a, final GencodeFuncotation b ) {
// Is it protein coding?
final boolean isAProteinCoding = a.getGeneTranscriptType() == GencodeGtfFeature.GeneTranscriptType.PROTEIN_CODING;
final boolean isBProteinCoding = b.getGeneTranscriptType() == GencodeGtfFeature.GeneTranscriptType.PROTEIN_CODING;
// Compare constant-first: the gene transcript type is now a plain String that may be null, and calling
// equals() on it would throw a NullPointerException. With the constant as the receiver, a null type is
// simply treated as "not protein coding", which is how the former enum `==` comparison behaved.
final boolean isAProteinCoding = GencodeGtfFeature.KnownGeneBiotype.PROTEIN_CODING.toString().equals(a.getGeneTranscriptType());
final boolean isBProteinCoding = GencodeGtfFeature.KnownGeneBiotype.PROTEIN_CODING.toString().equals(b.getGeneTranscriptType());
if ( isAProteinCoding != isBProteinCoding ) {
if ( isAProteinCoding ) {
return -1;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import org.broadinstitute.hellbender.tools.funcotator.Funcotation;
import org.broadinstitute.hellbender.tools.funcotator.metadata.FuncotationMetadata;
import org.broadinstitute.hellbender.tools.funcotator.vcfOutput.VcfOutputRenderer;
import org.broadinstitute.hellbender.utils.codecs.gtf.GencodeGtfFeature;
import org.broadinstitute.hellbender.utils.codecs.gtf.GencodeGtfGeneFeature;

import java.util.Arrays;
Expand Down Expand Up @@ -77,7 +76,7 @@ public class GencodeFuncotation implements Funcotation {
private GencodeGtfGeneFeature.FeatureTag apprisRank;
private Integer transcriptLength;
private String version;
private GencodeGtfFeature.GeneTranscriptType geneTranscriptType;
private String geneTranscriptType;

//------------------------------------------------------------
// Fields for overriding serialized values:
Expand Down Expand Up @@ -694,11 +693,11 @@ public void setVersion(final String version) {
this.version = version;
}

public GencodeGtfFeature.GeneTranscriptType getGeneTranscriptType() {
/**
 * @return The gene transcript type of this funcotation as a free-form {@link String}.
 *         The backing field has no initializer, so this is {@code null} until a value has been set.
 */
public String getGeneTranscriptType() {
return geneTranscriptType;
}

public void setGeneTranscriptType(final GencodeGtfFeature.GeneTranscriptType geneTranscriptType) {
/**
 * Sets the gene transcript type of this funcotation.
 * The value is stored as given; no validation or normalization is performed here.
 * @param geneTranscriptType The gene transcript type as a free-form {@link String}.
 */
public void setGeneTranscriptType(final String geneTranscriptType) {
this.geneTranscriptType = geneTranscriptType;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import htsjdk.variant.variantcontext.Allele;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.tools.funcotator.metadata.FuncotationMetadataUtils;
import org.broadinstitute.hellbender.utils.codecs.gtf.GencodeGtfFeature;
import org.broadinstitute.hellbender.utils.codecs.gtf.GencodeGtfGeneFeature;

import java.util.ArrayList;
Expand Down Expand Up @@ -311,11 +310,11 @@ public GencodeFuncotationBuilder setVersion(final String version) {
}

/**
* Set the GeneTranscriptType {@link org.broadinstitute.hellbender.utils.codecs.gtf.GencodeGtfFeature.GeneTranscriptType} in the {@link GencodeFuncotation}.
* Set the GeneTranscriptType in the {@link GencodeFuncotation}.
* @param geneTranscriptType The {@link String} containing the GeneTranscriptType for the {@link GencodeFuncotation}.
* @return {@code this} {@link GencodeFuncotationBuilder}
*/
public GencodeFuncotationBuilder setGeneTranscriptType(final GencodeGtfFeature.GeneTranscriptType geneTranscriptType) {
public GencodeFuncotationBuilder setGeneTranscriptType(final String geneTranscriptType) {
// Forward the value unchanged to the funcotation under construction.
gencodeFuncotation.setGeneTranscriptType( geneTranscriptType );
// Return the builder itself so calls can be chained.
return this;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1079,7 +1079,7 @@ private GencodeFuncotation createExonFuncotation(final VariantContext variant,

// Before we get started, check to see if this is a non-protein-coding feature.
// If it is, we must handle it differently:
if ( transcript.getGeneType() != GencodeGtfFeature.GeneTranscriptType.PROTEIN_CODING) {
// The gene type is a String, so its contents must be compared with equals(): `!=` compares object
// references, and a value parsed from a GTF file is generally not the same object as the enum's
// serialized constant, which would send every transcript down the non-protein-coding path.
// Constant-first keeps a null gene type on the non-protein-coding path without throwing.
if ( !GencodeGtfFeature.KnownGeneBiotype.PROTEIN_CODING.toString().equals(transcript.getGeneType()) ) {
    return createCodingRegionFuncotationForNonProteinCodingFeature(variant, altAllele, reference, transcript, exon);
}
else {
Expand Down Expand Up @@ -1700,7 +1700,7 @@ private GencodeFuncotation createIntronFuncotation(final VariantContext variant,
gencodeFuncotationBuilder.setReferenceContext(referenceBases.getBaseString(Strand.POSITIVE));

// Set the VariantClassification:
if ( transcript.getGeneType() == GencodeGtfFeature.GeneTranscriptType.PROTEIN_CODING ) {
// The gene type is a String, so compare contents with equals(): `==` compares object references and
// would generally be false for a value parsed from a GTF file, so INTRON would never be assigned.
// Constant-first keeps a null gene type on the non-protein-coding (else) branch without throwing.
if ( GencodeGtfFeature.KnownGeneBiotype.PROTEIN_CODING.toString().equals(transcript.getGeneType()) ) {
    gencodeFuncotationBuilder.setVariantClassification(GencodeFuncotation.VariantClassification.INTRON);
}
else {
Expand Down Expand Up @@ -2736,84 +2736,20 @@ else if ( gtfApprisTags.size() == 1 ) {
}

/**
* Converts a given {@link org.broadinstitute.hellbender.utils.codecs.gtf.GencodeGtfFeature.GeneTranscriptType} to a {@link org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotation.VariantClassification}.
* Assumes the given {@code type} is not {@link GencodeGtfFeature.GeneTranscriptType#PROTEIN_CODING}.
* Converts a given GeneTranscriptType {@link String} to a {@link org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotation.VariantClassification}.
* Assumes the given {@code type} is not {@link GencodeGtfFeature.KnownGeneBiotype#PROTEIN_CODING}.
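* Any type other than the known lincRNA biotypes is classified as {@link org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotation.VariantClassification#RNA}; this method never returns {@code null}.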
* @param type A {@link org.broadinstitute.hellbender.utils.codecs.gtf.GencodeGtfFeature.GeneTranscriptType} to convert to a {@link org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotation.VariantClassification}.
* @return A {@link org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotation.VariantClassification} representing the given {@link org.broadinstitute.hellbender.utils.codecs.gtf.GencodeGtfFeature.GeneTranscriptType}, or {@code null}.
*/
private static GencodeFuncotation.VariantClassification convertGeneTranscriptTypeToVariantClassification (final GencodeGtfFeature.GeneTranscriptType type ) {

//TODO: This all needs to be fixed so there is a 1:1 mapping of GeneTranscriptType->VariantClassification - Issue #4405
switch (type) {
// case IG_C_GENE: break;
// case IG_D_GENE: break;
// case IG_J_GENE: break;
// case IG_LV_GENE: break;
// case IG_V_GENE: break;
// case TR_C_GENE: break;
// case TR_J_GENE: break;
// case TR_V_GENE: break;
// case TR_D_GENE: break;
// case IG_PSEUDOGENE: break;
// case IG_C_PSEUDOGENE: break;
// case IG_J_PSEUDOGENE: break;
// case IG_V_PSEUDOGENE: break;
// case TR_V_PSEUDOGENE: break;
// case TR_J_PSEUDOGENE: break;
case MT_RRNA: return GencodeFuncotation.VariantClassification.RNA;
case MT_TRNA: return GencodeFuncotation.VariantClassification.RNA;
case MIRNA: return GencodeFuncotation.VariantClassification.RNA;
case MISC_RNA: return GencodeFuncotation.VariantClassification.RNA;
case RRNA: return GencodeFuncotation.VariantClassification.RNA;
case SCRNA: return GencodeFuncotation.VariantClassification.RNA;
case SNRNA: return GencodeFuncotation.VariantClassification.RNA;
case SNORNA: return GencodeFuncotation.VariantClassification.RNA;
case RIBOZYME: return GencodeFuncotation.VariantClassification.RNA;
case SRNA: return GencodeFuncotation.VariantClassification.RNA;
case SCARNA: return GencodeFuncotation.VariantClassification.RNA;
case MT_TRNA_PSEUDOGENE: return GencodeFuncotation.VariantClassification.RNA;
case TRNA_PSEUDOGENE: return GencodeFuncotation.VariantClassification.RNA;
case SNORNA_PSEUDOGENE: return GencodeFuncotation.VariantClassification.RNA;
case SNRNA_PSEUDOGENE: return GencodeFuncotation.VariantClassification.RNA;
case SCRNA_PSEUDOGENE: return GencodeFuncotation.VariantClassification.RNA;
case RRNA_PSEUDOGENE: return GencodeFuncotation.VariantClassification.RNA;
case MISC_RNA_PSEUDOGENE: return GencodeFuncotation.VariantClassification.RNA;
case MIRNA_PSEUDOGENE: return GencodeFuncotation.VariantClassification.RNA;
// case TEC: break;
// case NONSENSE_MEDIATED_DECAY: break;
// case NON_STOP_DECAY: break;
// case RETAINED_INTRON: break;
// case PROTEIN_CODING: break;
// case PROCESSED_TRANSCRIPT: break;
// case NON_CODING: break;
// case AMBIGUOUS_ORF: break;
// case SENSE_INTRONIC: break;
// case SENSE_OVERLAPPING: break;
// case ANTISENSE: break;
case ANTISENSE_RNA: return GencodeFuncotation.VariantClassification.RNA;
case KNOWN_NCRNA: return GencodeFuncotation.VariantClassification.RNA;
// case PSEUDOGENE: break;
// case PROCESSED_PSEUDOGENE: break;
// case POLYMORPHIC_PSEUDOGENE: break;
// case RETROTRANSPOSED: break;
// case TRANSCRIBED_PROCESSED_PSEUDOGENE: break;
// case TRANSCRIBED_UNPROCESSED_PSEUDOGENE: break;
// case TRANSCRIBED_UNITARY_PSEUDOGENE: break;
// case TRANSLATED_PROCESSED_PSEUDOGENE: break;
// case TRANSLATED_UNPROCESSED_PSEUDOGENE: break;
// case UNITARY_PSEUDOGENE: break;
// case UNPROCESSED_PSEUDOGENE: break;
// case ARTIFACT: break;
case LINCRNA: return GencodeFuncotation.VariantClassification.LINCRNA;
case MACRO_LNCRNA: return GencodeFuncotation.VariantClassification.LINCRNA;
case THREE_PRIME_OVERLAPPING_NCRNA: return GencodeFuncotation.VariantClassification.RNA;
// case DISRUPTED_DOMAIN: break;
case VAULTRNA: return GencodeFuncotation.VariantClassification.RNA;
case BIDIRECTIONAL_PROMOTER_LNCRNA: return GencodeFuncotation.VariantClassification.RNA;
default:
return GencodeFuncotation.VariantClassification.RNA;
}
* @param type A {@link String} representing a GeneTranscriptType to convert to a {@link org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotation.VariantClassification}.
* @return A {@link org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotation.VariantClassification} representing the given GeneTranscriptType {@link String}; never {@code null}.
*/
private static GencodeFuncotation.VariantClassification convertGeneTranscriptTypeToVariantClassification (final String type ) {

    //TODO: This all needs to be fixed so there is a 1:1 mapping of GencodeGtfFeature.KnownGeneBiotype->VariantClassification - Issue #4405

    // The biotype is free-form text rather than an enum, and the GTF base data leaves it null when the
    // attribute is absent. Comparing constant-first means a null type cannot throw a
    // NullPointerException and falls through to the default below like any other unrecognized value.
    final boolean isLongNonCodingRna =
            GencodeGtfFeature.KnownGeneBiotype.LINCRNA.toString().equals(type) ||
            GencodeGtfFeature.KnownGeneBiotype.MACRO_LNCRNA.toString().equals(type);

    // Only the two known lincRNA biotypes map to LINCRNA; everything else is reported as RNA.
    return isLongNonCodingRna
            ? GencodeFuncotation.VariantClassification.LINCRNA
            : GencodeFuncotation.VariantClassification.RNA;
}

//==================================================================================================================
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -155,11 +155,11 @@ protected GencodeGtfFeature(final String[] gtfFields, final String gtfFileType)
baseData.transcriptId = fieldValue;
break;
case "gene_type":
baseData.geneType = GeneTranscriptType.getEnum(fieldValue);
baseData.geneType = fieldValue;
break;
// For ENSEMBL GTF files:
case "gene_biotype":
baseData.geneType = GeneTranscriptType.getEnum(fieldValue);
baseData.geneType = fieldValue;
break;
case "gene_status":
baseData.geneStatus = GeneTranscriptStatus.valueOf(fieldValue);
Expand All @@ -168,10 +168,10 @@ protected GencodeGtfFeature(final String[] gtfFields, final String gtfFileType)
baseData.geneName = fieldValue;
break;
case "transcript_type":
baseData.transcriptType = GeneTranscriptType.getEnum(fieldValue);
baseData.transcriptType = fieldValue;
break;
case "transcript_biotype":
baseData.transcriptType = GeneTranscriptType.getEnum(fieldValue);
baseData.transcriptType = fieldValue;
break;
case "transcript_status":
baseData.transcriptStatus = GeneTranscriptStatus.valueOf(fieldValue);
Expand Down Expand Up @@ -527,15 +527,15 @@ public String getTranscriptId() {
return baseData.transcriptId;
}

public GeneTranscriptType getGeneType() {
/**
 * @return The gene type (biotype) of this feature as a free-form {@link String}, exactly as stored in the
 *         base data. The constructor fills it from the {@code gene_type} or {@code gene_biotype}
 *         attribute, so it may be {@code null} when neither attribute was present.
 */
public String getGeneType() {
return baseData.geneType;
}

public String getGeneName() {
return baseData.geneName;
}

public GeneTranscriptType getTranscriptType() {
/**
 * @return The transcript type (biotype) of this feature as a free-form {@link String}, exactly as stored
 *         in the base data. The constructor fills it from the {@code transcript_type} or
 *         {@code transcript_biotype} attribute, so it may be {@code null} when neither attribute was present.
 */
public String getTranscriptType() {
return baseData.transcriptType;
}

Expand Down Expand Up @@ -890,15 +890,18 @@ public static GenomicPhase getEnum(final String s) {
}
}


/**
* Biotype / transcript type for the transcript or gene represented in a feature.
* This is a tag of some biological function associated with a feature.
*
* The values here are not exhaustive, but should be used as a reference for already used / known types.
*
* For more information, see:
* https://www.gencodegenes.org/data_format.html
* https://en.wikipedia.org/wiki/General_feature_format
*/
public enum GeneTranscriptType {
public enum KnownGeneBiotype {
// Immunoglobulin (Ig) variable chain and T-cell receptor (TcR) genes imported or annotated according to the IMGT (http://www.imgt.org/)
IG_C_GENE("IG_C_gene"),
IG_D_GENE("IG_D_gene"),
Expand Down Expand Up @@ -1032,12 +1035,12 @@ public enum GeneTranscriptType {
BIDIRECTIONAL_PROMOTER_LNCRNA("bidirectional_promoter_lncRNA");

@SuppressWarnings("unchecked")
private static final Map<String, GeneTranscriptType> VALUE_MAP =
private static final Map<String, KnownGeneBiotype> VALUE_MAP =
Arrays.stream(values()).collect(Collectors.toMap(v -> v.serialized.toLowerCase(), v -> v));

private final String serialized;

GeneTranscriptType(final String serializedValue) {
KnownGeneBiotype(final String serializedValue) {
serialized = serializedValue;
}

Expand All @@ -1048,7 +1051,7 @@ public String toString() {

private static final Map<String, String> SPECIAL_CASE_STRING_VALUE_MAP = createSpecialCaseMap();

public static GeneTranscriptType getEnum(final String s) {
public static KnownGeneBiotype getEnum(final String s) {
String lowerS = s.toLowerCase();

// Handle special cases:
Expand All @@ -1061,7 +1064,7 @@ public static GeneTranscriptType getEnum(final String s) {
}

/**
* Create a special case map for alternate field names for known {@link GeneTranscriptType}s.
* Create a special case map for alternate field names for known {@link KnownGeneBiotype}s.
*/
private static Map<String, String> createSpecialCaseMap() {
final Map<String, String> map = new HashMap<>();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,9 @@ final public class GencodeGtfFeatureBaseData {
*
* There is no restriction on which keywords can be used here, but some
* known annotation sources are defined in their GTF parent objects:
* {@link GencodeGtfCodec#GENCODE_ANNOTATION_SOURCE_ENSEMBL}
* {@link GencodeGtfCodec#GENCODE_ANNOTATION_SOURCE_HAVANA}
* {@link EnsemblGtfCodec#ENSEMBL_ANNOTATION_SOURCE_ENA}
* {@link GencodeGtfFeature#ANNOTATION_SOURCE_ENSEMBL}
* {@link GencodeGtfFeature#ANNOTATION_SOURCE_HAVANA}
* {@link GencodeGtfFeature#ANNOTATION_SOURCE_ENA}
*/
public String annotationSource;

Expand All @@ -64,9 +64,17 @@ final public class GencodeGtfFeatureBaseData {
public String geneId = null;
public String transcriptId = null;
public GencodeGtfFeature.GeneTranscriptStatus geneStatus = null;
public GencodeGtfFeature.GeneTranscriptType geneType = null;
/**
* There are no formal definitions for what can be a valid geneType and the
* number of possible values seems to increase as new versions of Gencode are released.
*/
public String geneType = null;
public String geneName = null;
public GencodeGtfFeature.GeneTranscriptType transcriptType = null;
/**
* There are no formal definitions for what can be a valid transcriptType and the
* number of possible values seems to increase as new versions of Gencode are released.
*/
public String transcriptType = null;
public GencodeGtfFeature.GeneTranscriptStatus transcriptStatus = null;
public String transcriptName = null;
public int exonNumber = GencodeGtfFeature.NO_EXON_NUMBER;
Expand Down Expand Up @@ -100,10 +108,10 @@ public GencodeGtfFeatureBaseData(
final GencodeGtfFeature.GenomicPhase genomicPhase,
final String geneId,
final String transcriptId,
final GencodeGtfFeature.GeneTranscriptType geneType,
final String geneType,
final GencodeGtfFeature.GeneTranscriptStatus geneStatus,
final String geneName,
final GencodeGtfFeature.GeneTranscriptType transcriptType,
final String transcriptType,
final GencodeGtfFeature.GeneTranscriptStatus transcriptStatus,
final String transcriptName,
final int exonNumber,
Expand Down
Loading

0 comments on commit 8813eab

Please sign in to comment.