Skip to content

Commit

Permalink
feat: build source data starting from accessions (#210)
Browse files Browse the repository at this point in the history
  • Loading branch information
hunterckx committed Jan 16, 2025
1 parent 0947c6d commit de9f6b7
Show file tree
Hide file tree
Showing 13 changed files with 581 additions and 359 deletions.
5 changes: 3 additions & 2 deletions app/apis/catalog/brc-analytics-catalog/common/entities.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,18 +23,19 @@ export interface BRCDataCatalogGenome {
scaffoldCount: number;
scaffoldL50: number;
scaffoldN50: number;
species: string;
speciesTaxonomyId: string;
strain: string | null;
tags: string[];
taxon: string;
ucscBrowserUrl: string | null;
}

export interface BRCDataCatalogOrganism {
assemblyCount: number;
genomes: BRCDataCatalogGenome[];
ncbiTaxonomyId: string;
species: string;
tags: string[];
taxon: string;
}

export interface EntitiesResponse<R> {
Expand Down
2 changes: 1 addition & 1 deletion app/apis/catalog/brc-analytics-catalog/common/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ export function getGenomeId(genome: BRCDataCatalogGenome): string {

export function getGenomeTitle(genome?: BRCDataCatalogGenome): string {
if (!genome) return "";
return `${genome.taxon}`;
return `${genome.species}`;
}

export function getOrganismId(organism: BRCDataCatalogOrganism): string {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -133,15 +133,15 @@ export const buildGcPercent = (
};

/**
* Build props for the taxon cell.
* Build props for the species cell.
* @param genome - Genome entity.
* @returns Props to be used for the cell.
*/
export const buildGenomeTaxon = (
export const buildGenomeSpecies = (
genome: BRCDataCatalogGenome
): ComponentProps<typeof C.BasicCell> => {
return {
value: genome.taxon,
value: genome.species,
};
};

Expand Down Expand Up @@ -193,7 +193,7 @@ export const buildOrganismTaxon = (
organism: BRCDataCatalogOrganism
): ComponentProps<typeof C.Link> => {
return {
label: organism.taxon,
label: organism.species,
url: `${ROUTES.ORGANISMS}/${encodeURIComponent(getOrganismId(organism))}`,
};
};
Expand Down Expand Up @@ -338,7 +338,7 @@ export const buildGenomeDetails = (
keyValuePairs.set(
"Taxon",
C.Link({
label: genome.taxon,
label: genome.species,
url: `https://www.ncbi.nlm.nih.gov/datasets/taxonomy/${encodeURIComponent(
genome.ncbiTaxonomyId
)}/`,
Expand Down Expand Up @@ -416,9 +416,9 @@ function buildOrganismGenomesTableColumns(): ColumnDef<BRCDataCatalogGenome>[] {
header: BRC_DATA_CATALOG_CATEGORY_LABEL.ACCESSION,
},
{
accessorKey: BRC_DATA_CATALOG_CATEGORY_KEY.TAXON,
cell: ({ row }) => C.BasicCell(buildGenomeTaxon(row.original)),
header: BRC_DATA_CATALOG_CATEGORY_LABEL.TAXON,
accessorKey: BRC_DATA_CATALOG_CATEGORY_KEY.SPECIES,
cell: ({ row }) => C.BasicCell(buildGenomeSpecies(row.original)),
header: BRC_DATA_CATALOG_CATEGORY_LABEL.SPECIES,
},
{
accessorKey: BRC_DATA_CATALOG_CATEGORY_KEY.STRAIN,
Expand Down Expand Up @@ -493,7 +493,7 @@ function getGenomeEntityChooseAnalysisMethodBreadcrumbs(
): Breadcrumb[] {
return [
{ path: ROUTES.GENOMES, text: "Assemblies" },
{ path: "", text: `${genome.taxon}` },
{ path: "", text: `${genome.species}` },
{ path: "", text: "Choose Analysis Methods" },
];
}
Expand All @@ -508,7 +508,7 @@ function getOrganismEntityAssembliesBreadcrumbs(
): Breadcrumb[] {
return [
{ path: ROUTES.ORGANISMS, text: "Organisms" },
{ path: "", text: `${organism.taxon}` },
{ path: "", text: `${organism.species}` },
{ path: "", text: "Assemblies" },
];
}
76 changes: 38 additions & 38 deletions files/build-catalog.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,56 +4,29 @@ import {
BRCDataCatalogGenome,
BRCDataCatalogOrganism,
} from "../app/apis/catalog/brc-analytics-catalog/common/entities";
import { SourceGenome, SourceOrganism } from "./entities";
import { SourceGenome } from "./entities";

const SOURCE_PATH_ORGANISMS = "files/source/organisms-from-ncbi.tsv";
const SOURCE_PATH_GENOMES = "files/source/genomes-from-ncbi.tsv";

buildCatalog();

async function buildCatalog(): Promise<void> {
const organisms = await buildOrganisms();
const genomes = await buildGenomes();
const organisms = buildOrganisms(genomes);

const organismsByTaxon = new Map(
organisms.map((organism) => [organism.taxon, organism])
);

const genomes = await buildGenomes(organismsByTaxon);
console.log("Genomes:", genomes.length);
await saveJson("files/out/genomes.json", genomes);

console.log("Organisms:", genomes.length);
await saveJson("files/out/organisms.json", organisms);

console.log("Genomes:", genomes.length);
await saveJson("files/out/genomes.json", genomes);

console.log("Done");
}

async function buildOrganisms(): Promise<BRCDataCatalogOrganism[]> {
const sourceRows = await readValuesFile<SourceOrganism>(
SOURCE_PATH_ORGANISMS
);
const mappedRows = sourceRows.map((row): BRCDataCatalogOrganism => {
return {
assemblyCount: parseNumber(row.assemblyCount),
genomes: [],
ncbiTaxonomyId: row.taxonomyId,
tags: row.CustomTags ? [row.CustomTags] : [],
taxon: row.taxon,
};
});
return mappedRows.sort((a, b) =>
a.ncbiTaxonomyId.localeCompare(b.ncbiTaxonomyId)
);
}

async function buildGenomes(
organismsByTaxon: Map<string, BRCDataCatalogOrganism>
): Promise<BRCDataCatalogGenome[]> {
async function buildGenomes(): Promise<BRCDataCatalogGenome[]> {
const sourceRows = await readValuesFile<SourceGenome>(SOURCE_PATH_GENOMES);
const mappedRows = sourceRows.map((row): BRCDataCatalogGenome => {
const organism = organismsByTaxon.get(row.taxon);
const genome: BRCDataCatalogGenome = {
return {
accession: row.accession,
annotationStatus: parseStringOrNull(row.annotationStatus),
chromosomes: parseNumberOrNull(row.chromosomeCount),
Expand All @@ -67,17 +40,44 @@ async function buildGenomes(
scaffoldCount: parseNumber(row.scaffoldCount),
scaffoldL50: parseNumber(row.scaffoldL50),
scaffoldN50: parseNumber(row.scaffoldN50),
species: row.species,
speciesTaxonomyId: row.speciesTaxonomyId,
strain: parseStringOrNull(row.strain),
tags: organism?.tags ?? [],
taxon: row.taxon,
tags: row.CustomTags.split(/,\s*/),
ucscBrowserUrl: parseStringOrNull(row.ucscBrowser),
};
organism?.genomes.push(genome);
return genome;
});
return mappedRows.sort((a, b) => a.accession.localeCompare(b.accession));
}

function buildOrganisms(
genomes: BRCDataCatalogGenome[]
): BRCDataCatalogOrganism[] {
const organismsByTaxonomyId = new Map<string, BRCDataCatalogOrganism>();
for (const genome of genomes) {
organismsByTaxonomyId.set(
genome.speciesTaxonomyId,
buildOrganism(organismsByTaxonomyId.get(genome.speciesTaxonomyId), genome)
);
}
return Array.from(organismsByTaxonomyId.values()).sort((a, b) =>
a.ncbiTaxonomyId.localeCompare(b.ncbiTaxonomyId)
);
}

function buildOrganism(
organism: BRCDataCatalogOrganism | undefined,
genome: BRCDataCatalogGenome
): BRCDataCatalogOrganism {
return {
assemblyCount: (organism?.assemblyCount ?? 0) + 1,
genomes: [...(organism?.genomes ?? []), genome],
ncbiTaxonomyId: genome.speciesTaxonomyId,
species: genome.species,
tags: Array.from(new Set([...(organism?.tags ?? []), ...genome.tags])),
};
}

async function readValuesFile<T>(
filePath: string,
delimiter = "\t"
Expand Down
95 changes: 39 additions & 56 deletions files/build-files-from-ncbi.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,45 +3,42 @@
import urllib.parse
import re

TAXA_URL = "https://docs.google.com/spreadsheets/d/1Gg9sw2Qw765tOx2To53XkTAn-RAMiBtqYrfItlLXXrc/gviz/tq?tqx=out:csv&sheet=Sheet1.csv"

TAXONOMY_URL = "https://api.ncbi.nlm.nih.gov/datasets/v2/taxonomy/dataset_report"
SOURCE_LIST_URL = "https://docs.google.com/spreadsheets/d/1Gg9sw2Qw765tOx2To53XkTAn-RAMiBtqYrfItlLXXrc/gviz/tq?tqx=out:csv&sheet=Sheet1.csv"

ASSEMBLIES_URL = "https://hgdownload.soe.ucsc.edu/hubs/BRC/assemblyList.json"

ORGANISMS_OUTPUT_PATH = "files/source/organisms-from-ncbi.tsv"
GENOMES_OUTPUT_PATH = "files/source/genomes-from-ncbi.tsv"

def build_taxonomy_request_body(taxa):
return {"taxons": taxa, "children": False, "ranks": ["genus"]}

def get_organism_row(organism_info, accession):
if len(organism_info.get("errors", [])) > 0:
raise Exception(organism_info)

organism_taxonomy = organism_info["taxonomy"]
def get_paginated_ncbi_results(base_url, query_description):
page = 1
next_page_token = None
results = []
while next_page_token or page == 1:
print(f"Requesting page {page} of {query_description}")
request_url = f"{base_url}?page_size=1000{"&page_token=" + next_page_token if next_page_token else ""}"
page_data = requests.get(request_url).json()
if len(page_data["reports"][0].get("errors", [])) > 0:
raise Exception(page_data["reports"][0])
results += page_data["reports"]
next_page_token = page_data.get("next_page_token")
page += 1
return results

def get_species_row(taxon_info):
species_info = taxon_info["taxonomy"]["classification"]["species"]
return {
"taxon": organism_taxonomy["current_scientific_name"]["name"],
"taxonomyId": str(organism_taxonomy["tax_id"]),
"assemblyCount": 1,
"accession": accession,
"taxonomyId": taxon_info["taxonomy"]["tax_id"],
"species": species_info["name"],
"speciesTaxonomyId": species_info["id"],
}

def get_organisms_df(taxa_with_accessions):
organisms_info_with_accessions = [(organism_info, accession) for taxon, accession in taxa_with_accessions for organism_info in requests.post(TAXONOMY_URL, json=build_taxonomy_request_body([taxon])).json()["reports"]]
return pd.DataFrame([get_organism_row(organism_info, accession) for organism_info, accession in organisms_info_with_accessions])

def get_tax_ids(organisms_df):
return list(organisms_df["taxonomyId"])
def get_species_df(taxonomy_ids):
species_info = get_paginated_ncbi_results(f"https://api.ncbi.nlm.nih.gov/datasets/v2/taxonomy/taxon/{",".join([str(id) for id in taxonomy_ids])}/dataset_report", "taxa")
return pd.DataFrame([get_species_row(info) for info in species_info])

def build_genomes_url(tax_ids, next_page_token):
return f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/taxon/{urllib.parse.quote(",".join([str(id) for id in tax_ids]))}/dataset_report?page_size=1000{"&page_token=" + next_page_token if next_page_token else ""}&filters.assembly_source=refseq&filters.has_annotation=true&filters.exclude_paired_reports=true&filters.exclude_atypical=true&filters.assembly_level=scaffold&filters.assembly_level=chromosome&filters.assembly_level=complete_genome"

def get_genome_row(genome_info, taxon):
def get_genome_row(genome_info):
refseq_category = genome_info["assembly_info"].get("refseq_category")
return {
"taxon": taxon,
"strain": genome_info["organism"].get("infraspecific_names", {}).get("strain", ""),
"taxonomyId": genome_info["organism"]["tax_id"],
"accession": genome_info["accession"],
Expand All @@ -55,27 +52,12 @@ def get_genome_row(genome_info, taxon):
"coverage": genome_info["assembly_stats"].get("genome_coverage"),
"gcPercent": genome_info["assembly_stats"]["gc_percent"],
"annotationStatus": genome_info["annotation_info"].get("status"),
"pairedAccession": genome_info["paired_accession"],
"pairedAccession": genome_info.get("paired_accession"),
}

def get_organism_genomes_results(tax_id):
page = 1
next_page_token = None
results = []
while next_page_token or page == 1:
print(f"Requesting page {page} of taxon {tax_id}")
page_data = requests.get(build_genomes_url([tax_id], next_page_token)).json()
results += page_data["reports"]
next_page_token = page_data.get("next_page_token")
page += 1
return results

def get_organism_genomes(tax_id, accession):
return [genome_info for genome_info in get_organism_genomes_results(tax_id) if genome_info["accession"] == accession]

def get_genomes_df(organism_ids):
genomes_info_with_organisms = [(genome_info, taxon) for tax_id, taxon, accession in organism_ids for genome_info in get_organism_genomes(tax_id, accession)]
return pd.DataFrame(data=[get_genome_row(*info) for info in genomes_info_with_organisms])
def get_genomes_df(accessions):
genomes_info = get_paginated_ncbi_results(f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/{",".join(accessions)}/dataset_report", "genomes")
return pd.DataFrame(data=[get_genome_row(info) for info in genomes_info])

def _id_to_gene_model_url(asm_id):
hubs_url = "https://hgdownload.soe.ucsc.edu/hubs/"
Expand Down Expand Up @@ -111,23 +93,24 @@ def add_gene_model_url(genomes_df: pd.DataFrame):
def build_files():
print("Building files")

taxa_df = pd.read_csv(TAXA_URL, keep_default_na=False)

organisms_source_df = get_organisms_df([(taxon.strip(), accession.strip()) for taxon, accession in zip(taxa_df["Name"], taxa_df["RefSeq Accession"]) if taxon])
source_list_df = pd.read_csv(SOURCE_LIST_URL, keep_default_na=False)

organisms_df = organisms_source_df.merge(taxa_df[["TaxId", "CustomTags"]], how="left", left_on="taxonomyId", right_on="TaxId").drop(columns=["TaxId"])
base_genomes_df = get_genomes_df(source_list_df["Reference"])

organisms_df.to_csv(ORGANISMS_OUTPUT_PATH, index=False, sep="\t")
species_df = get_species_df(base_genomes_df["taxonomyId"])

print(f"Wrote to {ORGANISMS_OUTPUT_PATH}")
genomes_with_species_df = (
base_genomes_df
.merge(source_list_df[["Reference", "CustomTags"]], how="left", left_on="accession", right_on="Reference").drop(columns=["Reference"])
.merge(species_df, how="left", on="taxonomyId")
)

genomes_source_df = get_genomes_df(zip(organisms_df["taxonomyId"], organisms_df["taxon"], organisms_df["accession"]))
assemblies_df = pd.DataFrame(requests.get(ASSEMBLIES_URL).json()["data"])[["ucscBrowser", "genBank", "refSeq"]]

gen_bank_merge_df = genomes_source_df.merge(assemblies_df, how="left", left_on="accession", right_on="genBank")
ref_seq_merge_df = genomes_source_df.merge(assemblies_df, how="left", left_on="accession", right_on="refSeq")
gen_bank_merge_df = genomes_with_species_df.merge(assemblies_df, how="left", left_on="accession", right_on="genBank")
ref_seq_merge_df = genomes_with_species_df.merge(assemblies_df, how="left", left_on="accession", right_on="refSeq")

unmatched_accessions = genomes_source_df["accession"][~(genomes_source_df["accession"].isin(assemblies_df["genBank"]) | genomes_source_df["accession"].isin(assemblies_df["refSeq"]))]
unmatched_accessions = genomes_with_species_df["accession"][~(genomes_with_species_df["accession"].isin(assemblies_df["genBank"]) | genomes_with_species_df["accession"].isin(assemblies_df["refSeq"]))]
if len(unmatched_accessions) > 0:
print(f"{len(unmatched_accessions)} accessions had no match in assembly list: {", ".join(unmatched_accessions)}")

Expand Down
11 changes: 3 additions & 8 deletions files/entities.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ export interface SourceGenome {
annotationStatus: string;
chromosomeCount: string;
coverage: string;
CustomTags: string;
gcPercent: string;
geneModelUrl: string;
isRef: string;
Expand All @@ -11,15 +12,9 @@ export interface SourceGenome {
scaffoldCount: string;
scaffoldL50: string;
scaffoldN50: string;
species: string;
speciesTaxonomyId: string;
strain: string;
taxon: string;
taxonomyId: string;
ucscBrowser: string;
}

export interface SourceOrganism {
assemblyCount: string;
CustomTags: string;
taxon: string;
taxonomyId: string;
}
Loading

0 comments on commit de9f6b7

Please sign in to comment.