From a0f6b6b0ed96eea0386fd1560d3dcf635998ea68 Mon Sep 17 00:00:00 2001 From: hunterckx <118154470+hunterckx@users.noreply.github.com> Date: Wed, 4 Sep 2024 17:31:36 -0700 Subject: [PATCH] fix: remove catalog input rows with duplicate ids (#18) --- data-catalog/files/build-genomes-files.py | 13 +++++++++++-- data-catalog/files/out/genomes.json | 22 ---------------------- data-catalog/files/source/genomes.tsv | 2 -- 3 files changed, 11 insertions(+), 26 deletions(-) diff --git a/data-catalog/files/build-genomes-files.py b/data-catalog/files/build-genomes-files.py index 43ccf869..7cf4d0f7 100644 --- a/data-catalog/files/build-genomes-files.py +++ b/data-catalog/files/build-genomes-files.py @@ -7,14 +7,23 @@ OUTPUT_PATH = "files/source/genomes.tsv" +def get_duplicate_ids(genomes_df): + counts = genomes_df["Genome Version/Assembly ID"].value_counts() + return list(counts.index.to_series().loc[counts > 1]) + def build_genomes_files(): print("Building files") genomes_source_df = pd.read_csv(GENOMES_SOURCE_URL, keep_default_na=False, usecols=lambda name: re.fullmatch(r"Unnamed: \d+", name) is None) assemblies_df = pd.DataFrame(requests.get(ASSEMBLIES_URL).json()["data"]) + + duplicate_ids = get_duplicate_ids(genomes_source_df) + print(f"Removing rows with duplicate Genome Version/Assembly ID values of: {", ".join(duplicate_ids)}") + + deduped_genomes_df = genomes_source_df.drop_duplicates(subset=["Genome Version/Assembly ID"]) - gen_bank_merge_df = genomes_source_df.merge(assemblies_df, how="left", left_on="Genome Version/Assembly ID", right_on="genBank") - ref_seq_merge_df = genomes_source_df.merge(assemblies_df, how="left", left_on="Genome Version/Assembly ID", right_on="refSeq") + gen_bank_merge_df = deduped_genomes_df.merge(assemblies_df, how="left", left_on="Genome Version/Assembly ID", right_on="genBank") + ref_seq_merge_df = deduped_genomes_df.merge(assemblies_df, how="left", left_on="Genome Version/Assembly ID", right_on="refSeq") result_df = gen_bank_merge_df.combine_first(ref_seq_merge_df).dropna(subset=["ucscBrowser"]) diff --git a/data-catalog/files/out/genomes.json b/data-catalog/files/out/genomes.json index b6136db6..f39d0a65 100644 --- a/data-catalog/files/out/genomes.json +++ b/data-catalog/files/out/genomes.json @@ -1187,28 +1187,6 @@ "ucscBrowserUrl": "https://genome.ucsc.edu/h/GCF_000209065.1", "vEuPathDbProject": "TriTrypDB" }, - { - "chromosomes": 41, - "contigs": 0, - "genomeVersionAssemblyId": "GCA_000209065.1", - "organism": "Trypanosoma cruzi CL Brener Esmeraldo-like", - "species": "Trypanosoma cruzi", - "strain": "CL Brener Esmeraldo-like", - "supercontigs": 0, - "ucscBrowserUrl": "https://genome.ucsc.edu/h/GCF_000209065.1", - "vEuPathDbProject": "TriTrypDB" - }, - { - "chromosomes": 41, - "contigs": 0, - "genomeVersionAssemblyId": "GCA_000209065.1", - "organism": "Trypanosoma cruzi CL Brener Non-Esmeraldo-like", - "species": "Trypanosoma cruzi", - "strain": "CL Brener Non-Esmeraldo-like", - "supercontigs": 0, - "ucscBrowserUrl": "https://genome.ucsc.edu/h/GCF_000209065.1", - "vEuPathDbProject": "TriTrypDB" - }, { "chromosomes": 0, "contigs": 0, diff --git a/data-catalog/files/source/genomes.tsv b/data-catalog/files/source/genomes.tsv index 23923d4d..03657117 100644 --- a/data-catalog/files/source/genomes.tsv +++ b/data-catalog/files/source/genomes.tsv @@ -522,8 +522,6 @@ Trypanosoma cruzi Berenice INSDC GCA_013358655.1 no 0 923 0 Trypanosoma cruzi Be Trypanosoma cruzi Brazil A4 GenBank GCA_015033625.1 no 359 0 43 Trypanosoma cruzi Brazil A4 TriTrypDB 5693.0 GCA_015033625.1_ASM1503362v1 GCA_015033625.1 False Trypanosoma cruzi Trypanosoma cruzi (Brazil clone A4 2020) https://genome.ucsc.edu/h/GCA_015033625.1 Trypanosoma cruzi Bug2148 GenBank GCA_002749415.1 no 929 0 0 Trypanosoma cruzi Bug2148 TriTrypDB 5693.0 GCA_002749415.1_ASM274941v1 GCA_002749415.1 False Trypanosoma cruzi Trypanosoma cruzi (Bug2148 2017) https://genome.ucsc.edu/h/GCA_002749415.1 Trypanosoma cruzi strain CL INSDC GCA_003719155.1 no 0 7764 0 Trypanosoma cruzi strain CL TriTrypDB 5693.0 GCA_003719155.1_ASM371915v1 GCA_003719155.1 False Trypanosoma cruzi Trypanosoma cruzi (CL 2018) https://genome.ucsc.edu/h/GCA_003719155.1 -Trypanosoma cruzi CL Brener Esmeraldo-like GenBank GCA_000209065.1 yes 0 0 41 Trypanosoma cruzi CL Brener Esmeraldo-like TriTrypDB 5693.0 GCF_000209065.1_ASM20906v1 GCA_000209065.1 GCF_000209065.1 True Trypanosoma cruzi Trypanosoma cruzi (CL Brener 2005 kinetoplastids) https://genome.ucsc.edu/h/GCF_000209065.1 -Trypanosoma cruzi CL Brener Non-Esmeraldo-like GenBank GCA_000209065.1 no 0 0 41 Trypanosoma cruzi CL Brener Non-Esmeraldo-like TriTrypDB 5693.0 GCF_000209065.1_ASM20906v1 GCA_000209065.1 GCF_000209065.1 True Trypanosoma cruzi Trypanosoma cruzi (CL Brener 2005 kinetoplastids) https://genome.ucsc.edu/h/GCF_000209065.1 Trypanosoma cruzi Dm28c 2017 GenBank GCA_002219105.2 no 0 1029 0 Trypanosoma cruzi Dm28c 2017 TriTrypDB 85057.0 GCA_002219105.2_TcruziDm28cPB1 GCA_002219105.2 False Trypanosoma cruzi cruzi Trypanosoma cruzi cruzi (Dm28c 2017) https://genome.ucsc.edu/h/GCA_002219105.2 Trypanosoma cruzi Dm28c 2018 GenBank GCA_003177105.1 no 636 0 0 Trypanosoma cruzi Dm28c 2018 TriTrypDB 5693.0 GCA_003177105.1_ASM317710v1 GCA_003177105.1 False Trypanosoma cruzi Trypanosoma cruzi (Dm28c 2018) https://genome.ucsc.edu/h/GCA_003177105.1 Trypanosoma cruzi strain G INSDC GCA_003719455.1 no 0 1450 0 Trypanosoma cruzi strain G TriTrypDB 5693.0 GCA_003719455.1_ASM371945v1 GCA_003719455.1 False Trypanosoma cruzi Trypanosoma cruzi (G 2018) https://genome.ucsc.edu/h/GCA_003719455.1