Skip to content

Commit

Permalink
fix: remove catalog input rows with duplicate ids (#18)
Browse files Browse the repository at this point in the history
  • Loading branch information
hunterckx committed Sep 5, 2024
1 parent 9c57bd3 commit a0f6b6b
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 26 deletions.
13 changes: 11 additions & 2 deletions data-catalog/files/build-genomes-files.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,23 @@

OUTPUT_PATH = "files/source/genomes.tsv"

def get_duplicate_ids(genomes_df):
    """Return the "Genome Version/Assembly ID" values that occur more than once.

    Args:
        genomes_df: DataFrame containing a "Genome Version/Assembly ID" column.

    Returns:
        A list of the ID values that appear in more than one row, in the
        order produced by ``Series.value_counts`` (highest count first).
        The list is empty when no ID is repeated.
    """
    counts = genomes_df["Genome Version/Assembly ID"].value_counts()
    # The index of the value-count series holds the IDs themselves, so
    # masking the counts directly leaves exactly the duplicated IDs.
    return counts[counts > 1].index.tolist()

def build_genomes_files():
print("Building files")

genomes_source_df = pd.read_csv(GENOMES_SOURCE_URL, keep_default_na=False, usecols=lambda name: re.fullmatch(r"Unnamed: \d+", name) is None)
assemblies_df = pd.DataFrame(requests.get(ASSEMBLIES_URL).json()["data"])

duplicate_ids = get_duplicate_ids(genomes_source_df)
print(f"Removing rows with duplicate Genome Version/Assembly ID values of: {", ".join(duplicate_ids)}")

deduped_genomes_df = genomes_source_df.drop_duplicates(subset=["Genome Version/Assembly ID"])

gen_bank_merge_df = genomes_source_df.merge(assemblies_df, how="left", left_on="Genome Version/Assembly ID", right_on="genBank")
ref_seq_merge_df = genomes_source_df.merge(assemblies_df, how="left", left_on="Genome Version/Assembly ID", right_on="refSeq")
gen_bank_merge_df = deduped_genomes_df.merge(assemblies_df, how="left", left_on="Genome Version/Assembly ID", right_on="genBank")
ref_seq_merge_df = deduped_genomes_df.merge(assemblies_df, how="left", left_on="Genome Version/Assembly ID", right_on="refSeq")

result_df = gen_bank_merge_df.combine_first(ref_seq_merge_df).dropna(subset=["ucscBrowser"])

Expand Down
22 changes: 0 additions & 22 deletions data-catalog/files/out/genomes.json
Original file line number Diff line number Diff line change
Expand Up @@ -1187,28 +1187,6 @@
"ucscBrowserUrl": "https://genome.ucsc.edu/h/GCF_000209065.1",
"vEuPathDbProject": "TriTrypDB"
},
{
"chromosomes": 41,
"contigs": 0,
"genomeVersionAssemblyId": "GCA_000209065.1",
"organism": "Trypanosoma cruzi CL Brener Esmeraldo-like",
"species": "Trypanosoma cruzi",
"strain": "CL Brener Esmeraldo-like",
"supercontigs": 0,
"ucscBrowserUrl": "https://genome.ucsc.edu/h/GCF_000209065.1",
"vEuPathDbProject": "TriTrypDB"
},
{
"chromosomes": 41,
"contigs": 0,
"genomeVersionAssemblyId": "GCA_000209065.1",
"organism": "Trypanosoma cruzi CL Brener Non-Esmeraldo-like",
"species": "Trypanosoma cruzi",
"strain": "CL Brener Non-Esmeraldo-like",
"supercontigs": 0,
"ucscBrowserUrl": "https://genome.ucsc.edu/h/GCF_000209065.1",
"vEuPathDbProject": "TriTrypDB"
},
{
"chromosomes": 0,
"contigs": 0,
Expand Down
2 changes: 0 additions & 2 deletions data-catalog/files/source/genomes.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -522,8 +522,6 @@ Trypanosoma cruzi Berenice INSDC GCA_013358655.1 no 0 923 0 Trypanosoma cruzi Be
Trypanosoma cruzi Brazil A4 GenBank GCA_015033625.1 no 359 0 43 Trypanosoma cruzi Brazil A4 TriTrypDB 5693.0 GCA_015033625.1_ASM1503362v1 GCA_015033625.1 False Trypanosoma cruzi Trypanosoma cruzi (Brazil clone A4 2020) https://genome.ucsc.edu/h/GCA_015033625.1
Trypanosoma cruzi Bug2148 GenBank GCA_002749415.1 no 929 0 0 Trypanosoma cruzi Bug2148 TriTrypDB 5693.0 GCA_002749415.1_ASM274941v1 GCA_002749415.1 False Trypanosoma cruzi Trypanosoma cruzi (Bug2148 2017) https://genome.ucsc.edu/h/GCA_002749415.1
Trypanosoma cruzi strain CL INSDC GCA_003719155.1 no 0 7764 0 Trypanosoma cruzi strain CL TriTrypDB 5693.0 GCA_003719155.1_ASM371915v1 GCA_003719155.1 False Trypanosoma cruzi Trypanosoma cruzi (CL 2018) https://genome.ucsc.edu/h/GCA_003719155.1
Trypanosoma cruzi CL Brener Esmeraldo-like GenBank GCA_000209065.1 yes 0 0 41 Trypanosoma cruzi CL Brener Esmeraldo-like TriTrypDB 5693.0 GCF_000209065.1_ASM20906v1 GCA_000209065.1 GCF_000209065.1 True Trypanosoma cruzi Trypanosoma cruzi (CL Brener 2005 kinetoplastids) https://genome.ucsc.edu/h/GCF_000209065.1
Trypanosoma cruzi CL Brener Non-Esmeraldo-like GenBank GCA_000209065.1 no 0 0 41 Trypanosoma cruzi CL Brener Non-Esmeraldo-like TriTrypDB 5693.0 GCF_000209065.1_ASM20906v1 GCA_000209065.1 GCF_000209065.1 True Trypanosoma cruzi Trypanosoma cruzi (CL Brener 2005 kinetoplastids) https://genome.ucsc.edu/h/GCF_000209065.1
Trypanosoma cruzi Dm28c 2017 GenBank GCA_002219105.2 no 0 1029 0 Trypanosoma cruzi Dm28c 2017 TriTrypDB 85057.0 GCA_002219105.2_TcruziDm28cPB1 GCA_002219105.2 False Trypanosoma cruzi cruzi Trypanosoma cruzi cruzi (Dm28c 2017) https://genome.ucsc.edu/h/GCA_002219105.2
Trypanosoma cruzi Dm28c 2018 GenBank GCA_003177105.1 no 636 0 0 Trypanosoma cruzi Dm28c 2018 TriTrypDB 5693.0 GCA_003177105.1_ASM317710v1 GCA_003177105.1 False Trypanosoma cruzi Trypanosoma cruzi (Dm28c 2018) https://genome.ucsc.edu/h/GCA_003177105.1
Trypanosoma cruzi strain G INSDC GCA_003719455.1 no 0 1450 0 Trypanosoma cruzi strain G TriTrypDB 5693.0 GCA_003719455.1_ASM371945v1 GCA_003719455.1 False Trypanosoma cruzi Trypanosoma cruzi (G 2018) https://genome.ucsc.edu/h/GCA_003719455.1
Expand Down

0 comments on commit a0f6b6b

Please sign in to comment.