From 6174381bf2b82b972dc8b216a6c1acdd8205977b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mario=20S=C3=A4nger?=
Date: Thu, 15 Dec 2022 17:58:29 +0100
Subject: [PATCH 01/12] Adapt first version of BigBio adapter implementation

---
 flair/datasets/biomedical.py | 195 +++++++++++++++++++++++++++++++++++
 1 file changed, 195 insertions(+)

diff --git a/flair/datasets/biomedical.py b/flair/datasets/biomedical.py
index 0f0ba9ca26..43f1915790 100644
--- a/flair/datasets/biomedical.py
+++ b/flair/datasets/biomedical.py
@@ -5167,3 +5167,198 @@ class HUNER_SPECIES(HunerMultiCorpus):
     def __init__(self, sentence_splitter: SentenceSplitter = None):
         super(HUNER_SPECIES, self).__init__(entity_type="SPECIES", sentence_splitter=sentence_splitter)
+
+
+class BIGBIO_NER_CORPUS(ColumnCorpus):
+    """
+    This class implements an adapter to data sets implemented in the BigBio framework:
+
+    https://github.com/bigscience-workshop/biomedical
+
+    The BigBio framework harmonizes over 120 biomedical data sets and provides a uniform
+    programming API to access them. This adapter allows using all named entity recognition
+    data sets via the bigbio_kb schema.
+    """
+
+    def __init__(
+        self,
+        dataset_name: str,
+        base_path: Union[str, Path] = None,
+        in_memory: bool = True,
+        sentence_splitter: SentenceSplitter = None
+    ):
+        """
+        :param dataset_name: Name of the dataset in the huggingface hub (e.g. nlmchem or bigbio/nlmchem)
+        :param base_path: Path to the corpus on your machine
+        :param in_memory: If True, keeps dataset in memory giving speedups in training.
+        :param sentence_splitter: Custom implementation of :class:`SentenceSplitter` which
+            segments the text into sentences and tokens (default :class:`SciSpacySentenceSplitter`)
+        """
+
+        if base_path is None:
+            base_path = flair.cache_root / "datasets"
+        else:
+            base_path = Path(base_path)
+
+        # column format
+        columns = {0: "text", 1: "ner"}
+
+        # build dataset name and full huggingface reference name
+        if not dataset_name.startswith("bigbio/"):
+            full_dataset_name = "bigbio" + "/" + dataset_name
+        else:
+            full_dataset_name = dataset_name
+            dataset_name = dataset_name.replace("bigbio/", "")
+
+        dataset_dir_name = self.build_corpus_directory_name(dataset_name)
+        data_folder = base_path / dataset_dir_name
+
+        train_file = data_folder / "train.conll"
+        test_file = data_folder / "test.conll"
+
+        if not (train_file.exists() and test_file.exists()):
+            from datasets import load_dataset
+            dataset = load_dataset(full_dataset_name, name=dataset_name + "_bigbio_kb")
+
+            splits = {
+                "train": self.to_internal_dataset(dataset, "train"),
+                "test": self.to_internal_dataset(dataset, "test")
+            }
+
+            # Not every dataset has a dev / validation set!
+            if "validation" in dataset:
+                splits["dev"] = self.to_internal_dataset(dataset, "validation")
+
+            # Perform type mapping if necessary
+            type_mapping = self.get_entity_type_mapping()
+            if type_mapping:
+                splits = {
+                    split: filter_and_map_entities(dataset, type_mapping)
+                    for split, dataset in splits.items()
+                }
+
+            if sentence_splitter is None:
+                sentence_splitter = SciSpacySentenceSplitter()
+
+            conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter)
+            conll_writer.process_dataset(splits, data_folder)
+
+        super(BIGBIO_NER_CORPUS, self).__init__(
+            data_folder,
+            columns,
+            in_memory=in_memory,
+            comment_symbol="#",
+        )
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        """
+        Return the mapping of entity type given in the dataset to canonical types. Note: if
+        an entity type is not present in the map, it is discarded.
+        """
+        return None
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        """
+        Builds the directory name for the given data set.
+        """
+        return "bigbio-" + dataset_name.lower()
+
+    def to_internal_dataset(self, dataset, split: str) -> InternalBioNerDataset:
+        """
+        Converts a dataset given in Hugging Face datasets format to our internal corpus representation.
+        """
+        id_to_text = {}
+        id_to_entities = {}
+        for document in dataset[split]:
+            document_id = document["document_id"]
+            passage_offsets = []
+
+            # Collect all texts of the document, each passage will be
+            # a text in our internal format
+            for passage in document["passages"]:
+                passage_id = document_id + "#" + str(passage["id"])
+                id_to_text[passage_id] = " ".join(passage["text"])
+                passage_offsets.append((passage_id, passage["offsets"]))
+
+                id_to_entities[passage_id] = []
+
+            # Sort passages by start offset
+            passage_offsets = sorted(passage_offsets, key=lambda e: e[1][0][0])
+
+            # Transform all entity annotations into internal format
+            for entity in document["entities"]:
+                # Find the passage of the entity (necessary for offset adaptation)
+                passage_id, passage_offset = self.bin_search_passage(passage_offsets, 0, len(passage_offsets)-1, entity)
+
+                # Adapt entity offsets according to passage offsets
+                entity_offset = entity["offsets"][0]
+                entity_offset = (entity_offset[0] - passage_offset[0], entity_offset[1] - passage_offset[0])
+
+                id_to_entities[passage_id].append(
+                    Entity(char_span=entity_offset, entity_type=entity["type"])
+                )
+
+                # FIXME: This is just for debugging purposes
+                passage_text = id_to_text[passage_id]
+                doc_text = passage_text[entity_offset[0]:entity_offset[1]]
+                mention_text = entity["text"][0]
+                if doc_text != mention_text:
+                    print(f"Annotation error ({document['document_id']}) - Doc: {doc_text} vs. Mention: {mention_text}")
+
+        return InternalBioNerDataset(
+            documents=id_to_text,
+            entities_per_document=id_to_entities
+        )
+
+    def bin_search_passage(self, passages: List[Tuple[str, List[Tuple[int, int]]]], low: int, high: int, entity: Dict):
+        """
+        Helper method to find the passage for a given entity mention (incl. offset). The implementation
+        uses binary search to find the passage in the ordered sequence of passages.
+        """
+        # Check base case
+        if high >= low:
+            # Get element in the middle
+            mid = (high + low) // 2
+            first_text_offset = passages[mid][1][0]
+            first_mention_offset = entity["offsets"][0]
+
+            # Is the mention within the passage offsets?
+            if first_mention_offset[0] >= first_text_offset[0] and first_mention_offset[1] <= first_text_offset[1]:
+                return passages[mid][0], first_text_offset
+
+            # If the mention starts before the current passage, it can only
+            # be present in the left subarray
+            elif first_text_offset[0] > first_mention_offset[0]:
+                return self.bin_search_passage(passages, low, mid - 1, entity)
+            else:
+                # Else the mention can only be present in the right subarray
+                return self.bin_search_passage(passages, mid + 1, high, entity)
+
+        else:
+            # This should never happen :-D
+            return None
+
+
+class HUNER_CHEMICAL_NLM_CHEM(BIGBIO_NER_CORPUS):
+
+    def __init__(self, *args, **kwargs):
+        super(HUNER_CHEMICAL_NLM_CHEM, self).__init__(*args, dataset_name="nlmchem", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"Chemical": CHEMICAL_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+class HUNER_GENE_BIONLP_ST2013_CG(BIGBIO_NER_CORPUS):
+
+    def __init__(self, *args, **kwargs):
+        super(HUNER_GENE_BIONLP_ST2013_CG, self).__init__(*args, dataset_name="bionlp_st_2013_cg", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"Gene_or_gene_product": GENE_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+

From 20821ec80f806c36227e7c0f1ea8570ce5bae79e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mario=20S=C3=A4nger?=
Date: Thu, 19 Jan 2023 09:37:30 +0100
Subject: [PATCH 02/12] Bug fix: only write dev/val split if it exists

---
 flair/datasets/biomedical.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/flair/datasets/biomedical.py b/flair/datasets/biomedical.py
index 43f1915790..f29d9bde03 100644
--- a/flair/datasets/biomedical.py
+++ b/flair/datasets/biomedical.py
@@ -350,9 +350,11 @@ def __init__(
 
     def process_dataset(self, datasets: Dict[str, InternalBioNerDataset], out_dir: Path):
         self.write_to_conll(datasets["train"], out_dir / "train.conll")
-        self.write_to_conll(datasets["dev"], out_dir / "dev.conll")
         self.write_to_conll(datasets["test"], out_dir / "test.conll")
 
+        if "dev" in datasets:
+            self.write_to_conll(datasets["dev"], out_dir / "dev.conll")
+
     def write_to_conll(self, dataset: InternalBioNerDataset, output_file: Path):
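With patches 01 and 02 applied, the adapter already supports end-to-end corpus loading. A minimal usage sketch (assuming a flair checkout containing these patches plus the `datasets` and `scispacy` extras installed; this snippet is not part of the patch series itself):

```python
# Load a BigBio-backed NER corpus through the new adapter. On first use this
# downloads "bigbio/nlmchem" with the "nlmchem_bigbio_kb" schema and writes
# train/dev/test CoNLL files below flair.cache_root / "datasets".
from flair.datasets.biomedical import HUNER_CHEMICAL_NLM_CHEM

corpus = HUNER_CHEMICAL_NLM_CHEM()
print(corpus)  # standard flair corpus summary (train/dev/test sentence counts)
```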
From 35c9330fe3cf103033747dd5928f464976b9bdc0 Mon Sep 17 00:00:00 2001
From: Xing Wang
Date: Tue, 7 Feb 2023 12:14:41 +0100
Subject: [PATCH 03/12] Added BigBio adapter classes for new datasets of HunFlair v2

---
 flair/datasets/biomedical.py | 390 +++++++++++++++++++++++++++++++++++++----
 1 file changed, 345 insertions(+), 45 deletions(-)

diff --git a/flair/datasets/biomedical.py b/flair/datasets/biomedical.py
index 17a50b911f..706d32cda2 100644
--- a/flair/datasets/biomedical.py
+++ b/flair/datasets/biomedical.py
@@ -349,11 +349,12 @@ def __init__(
         self.sentence_splitter = sentence_splitter
 
     def process_dataset(self, datasets: Dict[str, InternalBioNerDataset], out_dir: Path):
-        self.write_to_conll(datasets["train"], out_dir / "train.conll")
-        self.write_to_conll(datasets["test"], out_dir / "test.conll")
-
+        if "train" in datasets:
+            self.write_to_conll(datasets["train"], out_dir / "train.conll")
         if "dev" in datasets:
             self.write_to_conll(datasets["dev"], out_dir / "dev.conll")
+        if "test" in datasets:
+            self.write_to_conll(datasets["test"], out_dir / "test.conll")
 
     def write_to_conll(self, dataset: InternalBioNerDataset, output_file: Path):
         os.makedirs(str(output_file.parent), exist_ok=True)
@@ -1727,7 +1728,7 @@ class HUNER_CHEMICAL_CHEMDNER(HunerDataset):
     """
 
     def __init__(self, *args, download_folder=None, **kwargs):
-        self.download_folder = download_folder or CHEMDNER.default_dir / "original"
+        self.download_folder = download_folder
         super().__init__(*args, **kwargs)
 
     @staticmethod
@@ -1735,6 +1736,7 @@ def split_url() -> str:
         return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/chemdner"
 
     def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
+        self.download_folder = data_dir / "original"
         os.makedirs(str(self.download_folder), exist_ok=True)
         CHEMDNER.download_dataset(self.download_folder)
         train_data = bioc_to_internal(self.download_folder / "chemdner_corpus" / "training.bioc.xml")
@@ -5173,21 +5175,21 @@ def __init__(self, sentence_splitter: SentenceSplitter = None):
 
 class BIGBIO_NER_CORPUS(ColumnCorpus):
     """
-    This class implements an adapter to data sets implemented in the BigBio framework:
+    This class implements an adapter to data sets implemented in the BigBio framework:
 
-    https://github.com/bigscience-workshop/biomedical
+    https://github.com/bigscience-workshop/biomedical
 
-    The BigBio framework harmonizes over 120 biomedical data sets and provides a uniform
-    programming API to access them. This adapter allows using all named entity recognition
-    data sets via the bigbio_kb schema.
+    The BigBio framework harmonizes over 120 biomedical data sets and provides a uniform
+    programming API to access them. This adapter allows using all named entity recognition
+    data sets via the bigbio_kb schema.
     """
 
     def __init__(
-        self,
-        dataset_name: str,
-        base_path: Union[str, Path] = None,
-        in_memory: bool = True,
-        sentence_splitter: SentenceSplitter = None
+        self,
+        dataset_name: str,
+        base_path: Union[str, Path] = None,
+        in_memory: bool = True,
+        sentence_splitter: SentenceSplitter = None,
     ):
         """
         :param dataset_name: Name of the dataset in the huggingface hub (e.g. nlmchem or bigbio/nlmchem)
@@ -5218,26 +5220,43 @@ def __init__(
         train_file = data_folder / "train.conll"
         test_file = data_folder / "test.conll"
 
-        if not (train_file.exists() and test_file.exists()):
+        # Download data if necessary
+        # Some datasets only have train or test splits, not both
+        if not train_file.exists() and not test_file.exists():
             from datasets import load_dataset
+
             dataset = load_dataset(full_dataset_name, name=dataset_name + "_bigbio_kb")
 
-            splits = {
-                "train": self.to_internal_dataset(dataset, "train"),
-                "test": self.to_internal_dataset(dataset, "test")
-            }
+            # Special case for ProGene: We need to use the split_0_train and split_0_test splits
+            train_split_name = None
+            if "train" in dataset:
+                train_split_name = "train"
+            elif "split_0_train" in dataset:
+                train_split_name = "split_0_train"
+            test_split_name = None
+            if "test" in dataset:
+                test_split_name = "test"
+            elif "split_0_test" in dataset:
+                test_split_name = "split_0_test"
+            validation_split_name = None
+            if "validation" in dataset:
+                validation_split_name = "validation"
+            elif "split_0_validation" in dataset:
+                validation_split_name = "split_0_validation"
 
+            splits = {}
             # Not every dataset has a dev / validation set!
-            if "validation" in dataset:
-                splits["dev"] = self.to_internal_dataset(dataset, "validation")
+            if train_split_name is not None:
+                splits["train"] = self.to_internal_dataset(dataset, train_split_name)
+            if test_split_name is not None:
+                splits["test"] = self.to_internal_dataset(dataset, test_split_name)
+            if validation_split_name is not None:
+                splits["dev"] = self.to_internal_dataset(dataset, validation_split_name)
 
             # Perform type mapping if necessary
             type_mapping = self.get_entity_type_mapping()
             if type_mapping:
-                splits = {
-                    split: filter_and_map_entities(dataset, type_mapping)
-                    for split, dataset in splits.items()
-                }
+                splits = {split: filter_and_map_entities(dataset, type_mapping) for split, dataset in splits.items()}
 
             if sentence_splitter is None:
                 sentence_splitter = SciSpacySentenceSplitter()
@@ -5254,20 +5273,20 @@ def __init__(
 
     def get_entity_type_mapping(self) -> Optional[Dict]:
         """
-        Return the mapping of entity type given in the dataset to canonical types. Note: if
-        an entity type is not present in the map, it is discarded.
+        Return the mapping of entity type given in the dataset to canonical types. Note: if
+        an entity type is not present in the map, it is discarded.
         """
         return None
 
     def build_corpus_directory_name(self, dataset_name: str) -> str:
         """
-        Builds the directory name for the given data set.
+        Builds the directory name for the given data set.
         """
         return "bigbio-" + dataset_name.lower()
 
     def to_internal_dataset(self, dataset, split: str) -> InternalBioNerDataset:
         """
-        Converts a dataset given in Hugging Face datasets format to our internal corpus representation.
+        Converts a dataset given in Hugging Face datasets format to our internal corpus representation.
         """
         id_to_text = {}
         id_to_entities = {}
@@ -5290,32 +5309,29 @@ def to_internal_dataset(self, dataset, split: str) -> InternalBioNerDataset:
             # Transform all entity annotations into internal format
             for entity in document["entities"]:
                 # Find the passage of the entity (necessary for offset adaptation)
-                passage_id, passage_offset = self.bin_search_passage(passage_offsets, 0, len(passage_offsets)-1, entity)
+                passage_id, passage_offset = self.bin_search_passage(
+                    passage_offsets, 0, len(passage_offsets) - 1, entity
+                )
 
                 # Adapt entity offsets according to passage offsets
                 entity_offset = entity["offsets"][0]
                 entity_offset = (entity_offset[0] - passage_offset[0], entity_offset[1] - passage_offset[0])
 
-                id_to_entities[passage_id].append(
-                    Entity(char_span=entity_offset, entity_type=entity["type"])
-                )
+                id_to_entities[passage_id].append(Entity(char_span=entity_offset, entity_type=entity["type"]))
 
                 # FIXME: This is just for debugging purposes
                 passage_text = id_to_text[passage_id]
-                doc_text = passage_text[entity_offset[0]:entity_offset[1]]
+                doc_text = passage_text[entity_offset[0] : entity_offset[1]]
                 mention_text = entity["text"][0]
                 if doc_text != mention_text:
                     print(f"Annotation error ({document['document_id']}) - Doc: {doc_text} vs. Mention: {mention_text}")
 
-        return InternalBioNerDataset(
-            documents=id_to_text,
-            entities_per_document=id_to_entities
-        )
+        return InternalBioNerDataset(documents=id_to_text, entities_per_document=id_to_entities)
 
     def bin_search_passage(self, passages: List[Tuple[str, List[Tuple[int, int]]]], low: int, high: int, entity: Dict):
         """
-        Helper method to find the passage for a given entity mention (incl. offset). The implementation
-        uses binary search to find the passage in the ordered sequence of passages.
+        Helper method to find the passage for a given entity mention (incl. offset). The implementation
+        uses binary search to find the passage in the ordered sequence of passages.
         """
         # Check base case
         if high >= low:
@@ -5341,10 +5357,174 @@ def bin_search_passage(self, passages: List[Tuple[str, List[Tuple[int, int]]]],
 
         else:
             # This should never happen :-D
             return None
 
 
-class HUNER_CHEMICAL_NLM_CHEM(BIGBIO_NER_CORPUS):
+class HUNER_GENE_NLM_GENE(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_GENE_NLM_GENE, self).__init__(*args, dataset_name="nlm_gene", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"Gene": GENE_TAG, "GENERIF": GENE_TAG, "STARGENE": GENE_TAG, "Domain": GENE_TAG, "Other": GENE_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
 
+class HUNER_GENE_DRUGPROT(BIGBIO_NER_CORPUS):
     def __init__(self, *args, **kwargs):
-        super(HUNER_CHEMICAL_NLM_CHEM, self).__init__(*args, dataset_name="nlmchem", **kwargs)
+        super(HUNER_GENE_DRUGPROT, self).__init__(*args, dataset_name="drugprot", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"GENE-N": GENE_TAG, "GENE-Y": GENE_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+class HUNER_CHEMICAL_DRUGPROT(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_CHEMICAL_DRUGPROT, self).__init__(*args, dataset_name="drugprot", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"CHEMICAL": CHEMICAL_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+class HUNER_GENE_BIORED(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_GENE_BIORED, self).__init__(*args, dataset_name="biored", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"GeneOrGeneProduct": GENE_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+class HUNER_CHEMICAL_BIORED(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_CHEMICAL_BIORED, self).__init__(*args, dataset_name="biored", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"ChemicalEntity": CHEMICAL_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+class HUNER_DISEASE_BIORED(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_DISEASE_BIORED, self).__init__(*args, dataset_name="biored", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"DiseaseOrPhenotypicFeature": DISEASE_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+class HUNER_GENE_SPECIES_BIORED(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_GENE_SPECIES_BIORED, self).__init__(*args, dataset_name="biored", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"OrganismTaxon": SPECIES_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+class HUNER_GENE_CELL_LINE_BIORED(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_GENE_CELL_LINE_BIORED, self).__init__(*args, dataset_name="biored", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"CellLine": CELL_LINE_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+class HUNER_GENE_CPI(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_GENE_CPI, self).__init__(*args, dataset_name="cpi", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"protein": GENE_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+class HUNER_CHEMICAL_CPI(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_CHEMICAL_CPI, self).__init__(*args, dataset_name="cpi", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"compound": CHEMICAL_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+class HUNER_GENE_BIONLP_ST_2013_PC(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_GENE_BIONLP_ST_2013_PC, self).__init__(*args, dataset_name="bionlp_st_2013_pc", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"Gene_or_gene_product": GENE_TAG, "Complex": GENE_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+class HUNER_CHEMICAL_BIONLP_ST_2013_PC(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_CHEMICAL_BIONLP_ST_2013_PC, self).__init__(*args, dataset_name="bionlp_st_2013_pc", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"Simple_chemical": CHEMICAL_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+class HUNER_GENE_BIONLP_ST_2013_GE(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_GENE_BIONLP_ST_2013_GE, self).__init__(*args, dataset_name="bionlp_st_2013_ge", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"protein": GENE_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+class HUNER_GENE_BIONLP_ST_2011_GE(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_GENE_BIONLP_ST_2011_GE, self).__init__(*args, dataset_name="bionlp_st_2011_ge", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"Protein": GENE_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+class HUNER_GENE_BIONLP_ST_2011_ID(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_GENE_BIONLP_ST_2011_ID, self).__init__(*args, dataset_name="bionlp_st_2011_id", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"Protein": GENE_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+class HUNER_CHEMICAL_BIONLP_ST_2011_ID(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_CHEMICAL_BIONLP_ST_2011_ID, self).__init__(*args, dataset_name="bionlp_st_2011_id", **kwargs)
 
     def get_entity_type_mapping(self) -> Optional[Dict]:
         return {"Chemical": CHEMICAL_TAG}
@@ -5353,14 +5533,134 @@ def build_corpus_directory_name(self, dataset_name: str) -> str:
         return self.__class__.__name__.lower()
 
 
-class HUNER_GENE_BIONLP_ST2013_CG(BIGBIO_NER_CORPUS):
+class HUNER_SPECIES_BIONLP_ST_2011_ID(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_SPECIES_BIONLP_ST_2011_ID, self).__init__(*args, dataset_name="bionlp_st_2011_id", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"Organism": SPECIES_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+class HUNER_GENE_BIONLP_ST_2011_REL(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_GENE_BIONLP_ST_2011_REL, self).__init__(*args, dataset_name="bionlp_st_2011_rel", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"Protein": GENE_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+class HUNER_GENE_BIONLP_ST_2011_EPI(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_GENE_BIONLP_ST_2011_EPI, self).__init__(*args, dataset_name="bionlp_st_2011_epi", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"Protein": GENE_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+class HUNER_SPECIES_BIONLP_ST_2019_BB(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_SPECIES_BIONLP_ST_2019_BB, self).__init__(*args, dataset_name="bionlp_st_2019_bb", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"Microorganism": SPECIES_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
 
+class HUNER_GENE_BIOID(BIGBIO_NER_CORPUS):
     def __init__(self, *args, **kwargs):
-        super(HUNER_GENE_BIONLP_ST2013_CG, self).__init__(*args, dataset_name="bionlp_st_2013_cg", **kwargs)
+        super(HUNER_GENE_BIOID, self).__init__(*args, dataset_name="bioid", **kwargs)
 
     def get_entity_type_mapping(self) -> Optional[Dict]:
-        return {"Gene_or_gene_product": GENE_TAG}
+        return {"gene": GENE_TAG, "protein": GENE_TAG}
 
     def build_corpus_directory_name(self, dataset_name: str) -> str:
         return self.__class__.__name__.lower()
+
+
+class HUNER_CHEMICAL_BIOID(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_CHEMICAL_BIOID, self).__init__(*args, dataset_name="bioid", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"chemical": CHEMICAL_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+class HUNER_SPECIES_BIOID(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_SPECIES_BIOID, self).__init__(*args, dataset_name="bioid", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"species": SPECIES_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+class HUNER_CELL_LINE_BIOID(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_CELL_LINE_BIOID, self).__init__(*args, dataset_name="bioid", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"cell": CELL_LINE_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+class HUNER_GENE_GNORMPLUS(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_GENE_GNORMPLUS, self).__init__(*args, dataset_name="gnormplus", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"Gene": GENE_TAG, "FamilyName": GENE_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+class HUNER_GENE_PROGENE(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_GENE_PROGENE, self).__init__(*args, dataset_name="progene", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"progene_text": GENE_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+class HUNER_CHEMICAL_NLM_CHEM(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_CHEMICAL_NLM_CHEM, self).__init__(*args, dataset_name="nlmchem", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"Chemical": CHEMICAL_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+# Already implemented earlier
+# class HUNER_GENE_BIONLP_ST2013_CG(BIGBIO_NER_CORPUS):
+#     def __init__(self, *args, **kwargs):
+#         super(HUNER_GENE_BIONLP_ST2013_CG, self).__init__(*args, dataset_name="bionlp_st_2013_cg", **kwargs)

#     def get_entity_type_mapping(self) -> Optional[Dict]:
#         return {"Gene_or_gene_product": GENE_TAG}

#     def build_corpus_directory_name(self, dataset_name: str) -> str:
#         return self.__class__.__name__.lower()

From c85760283ddd36f0964707c4eb3d754d87aaa97d Mon Sep 17 00:00:00 2001
From: Xing Wang
Date: Tue, 7 Feb 2023 12:19:36 +0100
Subject: [PATCH 04/12] merged current master (07/02/23) into branch bigbio integration

---
 flair/datasets/biomedical.py | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/flair/datasets/biomedical.py b/flair/datasets/biomedical.py
index 335e9d4e79..1ad2d97bb0 100644
--- a/flair/datasets/biomedical.py
+++ b/flair/datasets/biomedical.py
@@ -1689,12 +1689,7 @@ class HUNER_CHEMICAL_CHEMDNER(HunerDataset):
     HUNER version of the CHEMDNER corpus containing chemical annotations.
     """
 
-<<<<<<< HEAD
-    def __init__(self, *args, download_folder=None, **kwargs):
-        self.download_folder = download_folder
-=======
     def __init__(self, *args, **kwargs):
->>>>>>> master
         super().__init__(*args, **kwargs)
 
     @staticmethod
@@ -1702,20 +1697,11 @@ def split_url() -> str:
         return "https://raw.githubusercontent.com/hu-ner/huner/master/ner_scripts/splits/chemdner"
 
     def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
-<<<<<<< HEAD
-        self.download_folder = data_dir / "original"
-        os.makedirs(str(self.download_folder), exist_ok=True)
-        CHEMDNER.download_dataset(self.download_folder)
-        train_data = bioc_to_internal(self.download_folder / "chemdner_corpus" / "training.bioc.xml")
-        dev_data = bioc_to_internal(self.download_folder / "chemdner_corpus" / "development.bioc.xml")
-        test_data = bioc_to_internal(self.download_folder / "chemdner_corpus" / "evaluation.bioc.xml")
-=======
         os.makedirs(str(data_dir), exist_ok=True)
         CHEMDNER.download_dataset(data_dir)
         train_data = bioc_to_internal(data_dir / "chemdner_corpus" / "training.bioc.xml")
         dev_data = bioc_to_internal(data_dir / "chemdner_corpus" / "development.bioc.xml")
         test_data = bioc_to_internal(data_dir / "chemdner_corpus" / "evaluation.bioc.xml")
->>>>>>> master
         all_data = merge_datasets([train_data, dev_data, test_data])
         all_data = filter_and_map_entities(
             all_data,
From 804247bbeea977c568cd943f5c4385d36f956091 Mon Sep 17 00:00:00 2001
From: Xing Wang
Date: Mon, 13 Feb 2023 10:09:18 +0100
Subject: [PATCH 05/12] added new BigBio dataset to HunFlair

---
 flair/datasets/biomedical.py | 194 ++++++++++++++++++++++++++++++-----
 1 file changed, 169 insertions(+), 25 deletions(-)

diff --git a/flair/datasets/biomedical.py b/flair/datasets/biomedical.py
index 1ad2d97bb0..af817aa45b 100644
--- a/flair/datasets/biomedical.py
+++ b/flair/datasets/biomedical.py
@@ -5162,6 +5162,9 @@ def __init__(
         base_path: Union[str, Path] = None,
         in_memory: bool = True,
         sentence_splitter: SentenceSplitter = None,
+        train_split_name: Union[str, bool] = None,
+        dev_split_name: Union[str, bool] = None,
+        test_split_name: Union[str, bool] = None,
     ):
         """
         :param dataset_name: Name of the dataset in the huggingface hub (e.g. nlmchem or bigbio/nlmchem)
@@ -5169,6 +5172,9 @@ def __init__(
         :param in_memory: If True, keeps dataset in memory giving speedups in training.
         :param sentence_splitter: Custom implementation of :class:`SentenceSplitter` which
             segments the text into sentences and tokens (default :class:`SciSpacySentenceSplitter`)
+        :param train_split_name: Name of the training split in bigbio, usually train (default: None)
+        :param dev_split_name: Name of the development split in bigbio, usually validation (default: None)
+        :param test_split_name: Name of the test split in bigbio, usually test (default: None)
         """
 
         if base_path is None:
@@ -5190,31 +5196,25 @@ def __init__(
         data_folder = base_path / dataset_dir_name
 
         train_file = data_folder / "train.conll"
-        test_file = data_folder / "test.conll"
+        # test_file = data_folder / "test.conll"
 
         # Download data if necessary
-        # Some datasets only have train or test splits, not both
-        if not train_file.exists() and not test_file.exists():
+        # Some datasets in BigBio only have train or test splits, not both
+        # If only a test split exists, assign it to the train split
+        # If only a train split exists, sample the others from it (sample_missing_splits=True)
+        if not train_file.exists():
             from datasets import load_dataset
 
             dataset = load_dataset(full_dataset_name, name=dataset_name + "_bigbio_kb")
 
-            # Special case for ProGene: We need to use the split_0_train and split_0_test splits
-            train_split_name = None
             if "train" in dataset:
                 train_split_name = "train"
-            elif "split_0_train" in dataset:
-                train_split_name = "split_0_train"
-            test_split_name = None
             if "test" in dataset:
                 test_split_name = "test"
-            elif "split_0_test" in dataset:
-                test_split_name = "split_0_test"
-            validation_split_name = None
             if "validation" in dataset:
-                validation_split_name = "validation"
-            elif "split_0_validation" in dataset:
-                validation_split_name = "split_0_validation"
+                dev_split_name = "validation"
+
+            assert not (train_split_name is None and test_split_name is None)
 
             splits = {}
             # Not every dataset has a dev / validation set!
@@ -5222,8 +5222,10 @@ def __init__(
                 splits["train"] = self.to_internal_dataset(dataset, train_split_name)
             if test_split_name is not None:
                 splits["test"] = self.to_internal_dataset(dataset, test_split_name)
-            if validation_split_name is not None:
-                splits["dev"] = self.to_internal_dataset(dataset, validation_split_name)
+            if dev_split_name is not None:
+                splits["dev"] = self.to_internal_dataset(dataset, dev_split_name)
+            if "train" not in splits and "test" in splits:
+                splits["train"] = splits.pop("test")
 
             # Perform type mapping if necessary
             type_mapping = self.get_entity_type_mapping()
@@ -5237,10 +5239,7 @@ def __init__(
             conll_writer.process_dataset(splits, data_folder)
 
         super(BIGBIO_NER_CORPUS, self).__init__(
-            data_folder,
-            columns,
-            in_memory=in_memory,
-            comment_symbol="#",
+            data_folder, columns, in_memory=in_memory, comment_symbol="#", sample_missing_splits=True
         )
 
     def get_entity_type_mapping(self) -> Optional[Dict]:
@@ -5395,9 +5394,9 @@ def build_corpus_directory_name(self, dataset_name: str) -> str:
         return self.__class__.__name__.lower()
 
 
-class HUNER_GENE_SPECIES_BIORED(BIGBIO_NER_CORPUS):
+class HUNER_SPECIES_BIORED(BIGBIO_NER_CORPUS):
     def __init__(self, *args, **kwargs):
-        super(HUNER_GENE_SPECIES_BIORED, self).__init__(*args, dataset_name="biored", **kwargs)
+        super(HUNER_SPECIES_BIORED, self).__init__(*args, dataset_name="biored", **kwargs)
 
     def get_entity_type_mapping(self) -> Optional[Dict]:
         return {"OrganismTaxon": SPECIES_TAG}
@@ -5406,9 +5405,9 @@ def build_corpus_directory_name(self, dataset_name: str) -> str:
         return self.__class__.__name__.lower()
 
 
-class HUNER_GENE_CELL_LINE_BIORED(BIGBIO_NER_CORPUS):
+class HUNER_CELL_LINE_BIORED(BIGBIO_NER_CORPUS):
     def __init__(self, *args, **kwargs):
-        super(HUNER_GENE_CELL_LINE_BIORED, self).__init__(*args, dataset_name="biored", **kwargs)
+        super(HUNER_CELL_LINE_BIORED, self).__init__(*args, dataset_name="biored", **kwargs)
 
     def get_entity_type_mapping(self) -> Optional[Dict]:
         return {"CellLine": CELL_LINE_TAG}
@@ -5606,7 +5605,19 @@ class HUNER_GENE_PROGENE(BIGBIO_NER_CORPUS):
     def __init__(self, *args, **kwargs):
-        super(HUNER_GENE_PROGENE, self).__init__(*args, dataset_name="progene", **kwargs)
+        # Special case for ProGene: We need to use the split_0_train and split_0_test splits
+        # as they are currently provided in BigBio
+        train_split_name = "split_0_train"
+        dev_split_name = "split_0_validation"
+        test_split_name = "split_0_test"
+        super(HUNER_GENE_PROGENE, self).__init__(
+            *args,
+            dataset_name="progene",
+            **kwargs,
+            train_split_name=train_split_name,
+            dev_split_name=dev_split_name,
+            test_split_name=test_split_name,
+        )
 
     def get_entity_type_mapping(self) -> Optional[Dict]:
         return {"progene_text": GENE_TAG}
@@ -5626,6 +5637,139 @@ def build_corpus_directory_name(self, dataset_name: str) -> str:
         return self.__class__.__name__.lower()
 
 
+class HUNER_GENE_SETH_CORPUS(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_GENE_SETH_CORPUS, self).__init__(*args, dataset_name="seth_corpus", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"Gene": GENE_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+# FIXME: Annotation mismatch from the source PubTator files
+# EXAMPLE: Annotation error (21904390) - Doc: p686k1684gene vs. Mention: DKFZ p686k1684
+def map_fn(example):
+    example["entities"] = [
+        repair_doc_offsets(passages, entities) for passages, entities in zip(example["passages"], example["entities"])
+    ]
+    return example
+
+
+def repair_doc_offsets(passages, entities):
+    """
+    Some offsets are broken in tmvar_v3, we need to fix them. Replace doc in-place.
+    """
+
+    text = " ".join([passage["text"][0] for passage in passages])
+
+    sentences = text.split(". ")
+
+    sentence_indexes = [m.start() + 2 for m in re.finditer(r"\. ", text)]  # because the suffix is ". "
+    sentence_indexes = [0] + sentence_indexes
+
+    doc_entities = entities
+
+    if len(doc_entities) == 0:
+        return
+
+    # doc_entities = dataset[split].filter(lambda x: x["document_id"] == "21904390")[:]["entities"][0]
+
+    # print(sentence_indexes)
+    # print(len(sentences))
+    # print(text)
+    # print(doc_entities)
+
+    sentence_index = 0
+    entity_index = 0
+    current_offset = 0
+    next_sentence_offset = 0
+    next_entity_offset = text[current_offset:].find(doc_entities[entity_index]["text"][0])
+    while True:
+        if sentence_index >= len(sentence_indexes) and entity_index >= len(doc_entities):
+            break
+        if next_sentence_offset <= next_entity_offset:
+            sentence_end = sentence_indexes[sentence_index] + len(sentences[sentence_index]) + 2
+            # print(f"Sentence {sentence_index} @ offsets {sentence_indexes[sentence_index]} to {sentence_end}")
+            # print(sentences[sentence_index] + ". ")
+            sentence_index += 1
+            if sentence_index >= len(sentence_indexes):
+                next_sentence_offset = len(text)
+            else:
+                next_sentence_offset = sentence_indexes[sentence_index]
+            # print(f"DEBUG next_sentence_offset: {next_sentence_offset}")
+        else:  # next_entity_offset < next_sentence_offset
+            entity = doc_entities[entity_index]
+            entity_name = entity["text"][0]
+            given_offset_start = entity["offsets"][0][0]
+            given_offset_end = entity["offsets"][0][1]
+            # print(f"  {entity_name} @ offsets (real) {next_entity_offset} to {next_entity_offset + len(entity_name)}")
+            # print(f"  {text[given_offset_start:given_offset_end]} @ offset (given) {given_offset_start} to {given_offset_end}")
+            if given_offset_start != next_entity_offset:  # Mismatched entities
+                # print(doc_entities)
+                entity["offsets"][0][0] = next_entity_offset
+                entity["offsets"][0][1] = next_entity_offset + len(entity_name)
+                # print(doc_entities)
+            current_offset = next_entity_offset + len(entity_name)
+            entity_index += 1
+            if entity_index >= len(doc_entities):
+                next_entity_offset = len(text)
+            else:
+                next_entity_offset = current_offset + text[current_offset:].find(doc_entities[entity_index]["text"][0])
+
+    return doc_entities
+
+
+class HUNER_GENE_TMVAR_V3(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_GENE_TMVAR_V3, self).__init__(*args, dataset_name="tmvar_v3", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"['Gene']": GENE_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+    # Some offsets are broken in tmvar_v3, we need to fix them
+    def to_internal_dataset(self, dataset, split: str) -> InternalBioNerDataset:
+        """
+        Converts a dataset given in Hugging Face datasets format to our internal corpus representation.
+        """
+        dataset = dataset.map(map_fn, batched=True)
+        return super(HUNER_GENE_TMVAR_V3, self).to_internal_dataset(dataset, split)
+
+
+class HUNER_SPECIES_TMVAR_V3(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_SPECIES_TMVAR_V3, self).__init__(*args, dataset_name="tmvar_v3", **kwargs)
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"['Species']": SPECIES_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+    # Some offsets are broken in tmvar_v3, we need to fix them
+    def to_internal_dataset(self, dataset, split: str) -> InternalBioNerDataset:
+        """
+        Converts a dataset given in Hugging Face datasets format to our internal corpus representation.
+        """
+        dataset = dataset.map(map_fn, batched=True)
+        return super(HUNER_SPECIES_TMVAR_V3, self).to_internal_dataset(dataset, split)
+
+
+# class HUNER_CELL_LINE_TMVAR_V3(BIGBIO_NER_CORPUS):
+#     def __init__(self, *args, **kwargs):
+#         super(HUNER_CELL_LINE_TMVAR_V3, self).__init__(*args, dataset_name="tmvar_v3", **kwargs)

#     def get_entity_type_mapping(self) -> Optional[Dict]:
#         return {"['CellLine']": CELL_LINE_TAG}

#     def build_corpus_directory_name(self, dataset_name: str) -> str:
#         return self.__class__.__name__.lower()


# Already implemented earlier
# class HUNER_GENE_BIONLP_ST2013_CG(BIGBIO_NER_CORPUS):
#     def __init__(self, *args, **kwargs):
#         super(HUNER_GENE_BIONLP_ST2013_CG, self).__init__(*args, dataset_name="bionlp_st_2013_cg", **kwargs)

#     def get_entity_type_mapping(self) -> Optional[Dict]:
#         return {"Gene_or_gene_product": GENE_TAG}

#     def build_corpus_directory_name(self, dataset_name: str) -> str:
#         return self.__class__.__name__.lower()
From 72a7ddf61b1659905f3b616c56a8ddb7cda4d3f1 Mon Sep 17 00:00:00 2001
From: Xing Wang
Date: Tue, 28 Feb 2023 17:23:56 +0100
Subject: [PATCH 06/12] finished BigBio integration for HunFlair v2

---
 flair/datasets/biomedical.py | 130 +++++++++++++++++++++--------------
 1 file changed, 77 insertions(+), 53 deletions(-)

diff --git a/flair/datasets/biomedical.py b/flair/datasets/biomedical.py
index af817aa45b..21052a6445 100644
--- a/flair/datasets/biomedical.py
+++ b/flair/datasets/biomedical.py
@@ -2770,7 +2770,7 @@ def parse_dataset(data_dir: Path) -> InternalBioNerDataset:
                     start, end = int(fields[2]), int(fields[3])
 
                     if start == end:
-                        continue
+                        continue_S800
 
                     entities_per_document[fname].append(Entity((start, end), "Species"))
@@ -5186,7 +5186,11 @@ def __init__(
         columns = {0: "text", 1: "ner"}
 
         # build dataset name and full huggingface reference name
-        if not dataset_name.startswith("bigbio/"):
+        if dataset_name.startswith("/"):  # Absolute path for local BigBio datasets
+            full_dataset_name = dataset_name
+            dataset_name = dataset_name.split("/")[-1]
+            dataset_name = dataset_name.split(".")[0]
+        elif not dataset_name.startswith("bigbio/"):
             full_dataset_name = "bigbio" + "/" + dataset_name
         else:
             full_dataset_name = dataset_name
@@ -5325,7 +5329,7 @@ def bin_search_passage(self, passages: List[Tuple[str, List[Tuple[int, int]]]],
 
         else:
             # This should never happen :-D
-            return None
+            return -1, -1
 
 
 class HUNER_GENE_NLM_GENE(BIGBIO_NER_CORPUS):
@@ -5648,11 +5652,80 @@ def build_corpus_directory_name(self, dataset_name: str) -> str:
         return self.__class__.__name__.lower()
 
 
+class HUNER_GENE_TMVAR_V3(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_GENE_TMVAR_V3, self).__init__(
+            *args,
+            dataset_name="/vol/fob-wbib-vol3/wbi_stud/wangxida/Studienprojekt/biomedical/bigbio/hub/hub_repos/tmvar_v3/tmvar_v3.py",
+            # dataset_name="tmvar_v3",
+            **kwargs,
+        )
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"Gene": GENE_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+    # Some offsets are broken in tmvar_v3, we need to fix them
+    def to_internal_dataset(self, dataset, split: str) -> InternalBioNerDataset:
+        """
+        Converts a dataset given in Hugging Face datasets format to our internal corpus representation.
+        """
+        # dataset = dataset.map(map_fn, batched=True)
+        return super(HUNER_GENE_TMVAR_V3, self).to_internal_dataset(dataset, split)
+
+
+class HUNER_SPECIES_TMVAR_V3(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_SPECIES_TMVAR_V3, self).__init__(
+            *args,
+            dataset_name="/vol/fob-wbib-vol3/wbi_stud/wangxida/Studienprojekt/biomedical/bigbio/hub/hub_repos/tmvar_v3/tmvar_v3.py",
+            # dataset_name="tmvar_v3",
+            **kwargs,
+        )
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"['Species']": SPECIES_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+    # Some offsets are broken in tmvar_v3, we need to fix them
+    def to_internal_dataset(self, dataset, split: str) -> InternalBioNerDataset:
+        """
+        Converts a dataset given in Hugging Face datasets format to our internal corpus representation.
+        """
+        # dataset = dataset.map(map_fn, batched=True)
+        return super(HUNER_SPECIES_TMVAR_V3, self).to_internal_dataset(dataset, split)
+
+
+class HUNER_CELL_LINE_TMVAR_V3(BIGBIO_NER_CORPUS):
+    def __init__(self, *args, **kwargs):
+        super(HUNER_CELL_LINE_TMVAR_V3, self).__init__(
+            *args,
+            dataset_name="/vol/fob-wbib-vol3/wbi_stud/wangxida/Studienprojekt/biomedical/bigbio/hub/hub_repos/tmvar_v3/tmvar_v3.py",
+            # dataset_name="tmvar_v3",
+            **kwargs,
+        )
+
+    def get_entity_type_mapping(self) -> Optional[Dict]:
+        return {"['CellLine']": CELL_LINE_TAG}
+
+    def build_corpus_directory_name(self, dataset_name: str) -> str:
+        return self.__class__.__name__.lower()
+
+
+# Deprecated, is fixed in BigBio but useful code for debugging future issues
 # FIXME: Annotation mismatch from the source PubTator files
 # EXAMPLE: Annotation error (21904390) - Doc: p686k1684gene vs. Mention: DKFZ p686k1684
 def map_fn(example):
     example["entities"] = [
-        repair_doc_offsets(passages, entities) for passages, entities in zip(example["passages"], example["entities"])
+        repair_doc_offsets(passages, entities)
+        if passages[0]
+        == "Two novel mutations of the PAX6 gene causing different phenotype in a cohort of Chinese patients."
+        else entities
+        for passages, entities in zip(example["passages"], example["entities"])
     ]
     return example

From 9ed972615fec2d013adf9fa804f95118b9cc1159 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mario=20S=C3=A4nger?=
Date: Wed, 15 Mar 2023 15:37:00 +0100
Subject: [PATCH 07/12] Remove debugging code + fix local data set paths

---
 flair/datasets/biomedical.py | 109 +++--------------------------------
 1 file changed, 8 insertions(+), 101 deletions(-)

diff --git a/flair/datasets/biomedical.py b/flair/datasets/biomedical.py
index 21052a6445..f40e9dcd1d 100644
--- a/flair/datasets/biomedical.py
+++ b/flair/datasets/biomedical.py
@@ -5295,11 +5295,11 @@ def to_internal_dataset(self, dataset, split: str) -> InternalBioNerDataset:
             id_to_entities[passage_id].append(Entity(char_span=entity_offset, entity_type=entity["type"]))
 
             # FIXME: This is just for debugging purposes
-            passage_text = id_to_text[passage_id]
-            doc_text = passage_text[entity_offset[0] : entity_offset[1]]
-            mention_text = entity["text"][0]
-            if doc_text != mention_text:
-                print(f"Annotation error ({document['document_id']}) - Doc: {doc_text} vs. Mention: {mention_text}")
+            # passage_text = id_to_text[passage_id]
+            # doc_text = passage_text[entity_offset[0] : entity_offset[1]]
+            # mention_text = entity["text"][0]
+            # if doc_text != mention_text:
+            #     print(f"Annotation error ({document['document_id']}) - Doc: {doc_text} vs. Mention: {mention_text}")
 
         return InternalBioNerDataset(documents=id_to_text, entities_per_document=id_to_entities)
@@ -5656,8 +5656,7 @@ class HUNER_GENE_TMVAR_V3(BIGBIO_NER_CORPUS):
     def __init__(self, *args, **kwargs):
         super(HUNER_GENE_TMVAR_V3, self).__init__(
             *args,
-            dataset_name="/vol/fob-wbib-vol3/wbi_stud/wangxida/Studienprojekt/biomedical/bigbio/hub/hub_repos/tmvar_v3/tmvar_v3.py",
-            # dataset_name="tmvar_v3",
+            dataset_name="tmvar_v3",
             **kwargs,
         )
@@ -5680,8 +5679,7 @@ class HUNER_SPECIES_TMVAR_V3(BIGBIO_NER_CORPUS):
     def __init__(self, *args, **kwargs):
         super(HUNER_SPECIES_TMVAR_V3, self).__init__(
             *args,
-            dataset_name="/vol/fob-wbib-vol3/wbi_stud/wangxida/Studienprojekt/biomedical/bigbio/hub/hub_repos/tmvar_v3/tmvar_v3.py",
-            # dataset_name="tmvar_v3",
+            dataset_name="tmvar_v3",
             **kwargs,
         )
@@ -5704,8 +5702,7 @@ class HUNER_CELL_LINE_TMVAR_V3(BIGBIO_NER_CORPUS):
     def __init__(self, *args, **kwargs):
         super(HUNER_CELL_LINE_TMVAR_V3, self).__init__(
             *args,
-            dataset_name="/vol/fob-wbib-vol3/wbi_stud/wangxida/Studienprojekt/biomedical/bigbio/hub/hub_repos/tmvar_v3/tmvar_v3.py",
-            # dataset_name="tmvar_v3",
+            dataset_name="tmvar_v3",
             **kwargs,
         )
@@ -5714,93 +5711,3 @@ def get_entity_type_mapping(self) -> Optional[Dict]:
         return {"['CellLine']": CELL_LINE_TAG}
 
     def build_corpus_directory_name(self, dataset_name: str) -> str:
         return self.__class__.__name__.lower()
-
-
-# Deprecated, is fixed in BigBio but useful code for debugging future issues
-# FIXME: Annotation mismatch from the source PubTator files
-# EXAMPLE: Annotation error (21904390) - Doc: p686k1684gene vs. Mention: DKFZ p686k1684
-def map_fn(example):
-    example["entities"] = [
-        repair_doc_offsets(passages, entities)
-        if passages[0]
-        == "Two novel mutations of the PAX6 gene causing different phenotype in a cohort of Chinese patients."
-        else entities
-        for passages, entities in zip(example["passages"], example["entities"])
-    ]
-    return example
-
-
-def repair_doc_offsets(passages, entities):
-    """
-    Some offsets are broken in tmvar_v3, we need to fix them. Replace doc in-place.
-    """
-
-    text = " ".join([passage["text"][0] for passage in passages])
-
-    sentences = text.split(". ")
-
-    sentence_indexes = [m.start() + 2 for m in re.finditer(r"\. ", text)]  # because the suffix is ". "
-    sentence_indexes = [0] + sentence_indexes
-
-    doc_entities = entities
-
-    if len(doc_entities) == 0:
-        return
-
-    # doc_entities = dataset[split].filter(lambda x: x["document_id"] == "21904390")[:]["entities"][0]
-
-    # print(sentence_indexes)
-    # print(len(sentences))
-    # print(text)
-    # print(doc_entities)
-
-    sentence_index = 0
-    entity_index = 0
-    current_offset = 0
-    next_sentence_offset = 0
-    next_entity_offset = text[current_offset:].find(doc_entities[entity_index]["text"][0])
-    while True:
-        if sentence_index >= len(sentence_indexes) and entity_index >= len(doc_entities):
-            break
-        if next_sentence_offset <= next_entity_offset:
-            sentence_end = sentence_indexes[sentence_index] + len(sentences[sentence_index]) + 2
-            # print(f"Sentence {sentence_index} @ offsets {sentence_indexes[sentence_index]} to {sentence_end}")
-            # print(sentences[sentence_index] + ". ")
-            sentence_index += 1
-            if sentence_index >= len(sentence_indexes):
-                next_sentence_offset = len(text)
-            else:
-                next_sentence_offset = sentence_indexes[sentence_index]
-            # print(f"DEBUG next_sentence_offset: {next_sentence_offset}")
-        else:  # next_entity_offset < next_sentence_offset
-            entity = doc_entities[entity_index]
-            entity_name = entity["text"][0]
-            given_offset_start = entity["offsets"][0][0]
-            given_offset_end = entity["offsets"][0][1]
-            # print(f"  {entity_name} @ offsets (real) {next_entity_offset} to {next_entity_offset + len(entity_name)}")
-            # print(f"  {text[given_offset_start:given_offset_end]} @ offset (given) {given_offset_start} to {given_offset_end}")
-            if given_offset_start != next_entity_offset:  # Mismatched entities
-                # print(doc_entities)
-                entity["offsets"][0][0] = next_entity_offset
-                entity["offsets"][0][1] = next_entity_offset + len(entity_name)
-                # print(doc_entities)
-            current_offset = next_entity_offset + len(entity_name)
-            entity_index += 1
-            if entity_index >= len(doc_entities):
-                next_entity_offset = len(text)
-            else:
-                next_entity_offset = current_offset + text[current_offset:].find(doc_entities[entity_index]["text"][0])
-
-    return doc_entities
-
-
-# Already implemented earlier
-# class HUNER_GENE_BIONLP_ST2013_CG(BIGBIO_NER_CORPUS):
-#     def __init__(self, *args, **kwargs):
-#         super(HUNER_GENE_BIONLP_ST2013_CG, self).__init__(*args, dataset_name="bionlp_st_2013_cg", **kwargs)

-#     def get_entity_type_mapping(self) -> Optional[Dict]:
-#         return {"Gene_or_gene_product": GENE_TAG}

-#     def build_corpus_directory_name(self, dataset_name: str) -> str:
-#         return self.__class__.__name__.lower()
(see BIGBIO_NER_CORPUS)") class JNLPBA(ColumnCorpus): """ Original corpus of the JNLPBA shared task. @@ -990,6 +991,7 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: return data +@deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") class MIRNA(ColumnCorpus): """ Original miRNA corpus. @@ -1617,6 +1619,7 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: return filter_and_map_entities(dataset, {"protein": GENE_TAG}) +@deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") class CHEMDNER(ColumnCorpus): """ Original corpus of the CHEMDNER shared task. @@ -1720,6 +1723,7 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: return all_data +@deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") class IEPA(ColumnCorpus): """ IEPA corpus as provided by http://corpora.informatik.hu-berlin.de/ @@ -1835,6 +1839,7 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: return merge_datasets([train_data, test_data]) +@deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") class LINNEAUS(ColumnCorpus): """ Original LINNEAUS corpus containing species annotations. @@ -1939,6 +1944,7 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: return LINNEAUS.download_and_parse_dataset(data_dir) +@deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") class CDR(ColumnCorpus): """ CDR corpus as provided by https://github.com/JHnlp/BioCreative-V-CDR-Corpus @@ -2054,6 +2060,7 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: return all_data +@deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") class VARIOME(ColumnCorpus): """ Variome corpus as provided by http://corpora.informatik.hu-berlin.de/corpora/brat2bioc/hvp_bioc.xml.zip @@ -2213,6 +2220,7 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: return all_data +@deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") class NCBI_DISEASE(ColumnCorpus): """ Original NCBI disease corpus containing disease annotations. @@ -2468,6 +2476,7 @@ def parse_input_file(input_file: Path): return InternalBioNerDataset(documents=documents, entities_per_document=entities_per_document) +@deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") class SCAI_CHEMICALS(ScaiCorpus): """ Original SCAI chemicals corpus containing chemical annotations. @@ -2496,6 +2505,7 @@ def perform_corpus_download(data_dir: Path) -> Path: return corpus_file +@deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") class SCAI_DISEASE(ScaiCorpus): """ Original SCAI disease corpus containing disease annotations. @@ -2575,6 +2585,7 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset: return filter_and_map_entities(corpus, entity_mapping) +@deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)") class OSIRIS(ColumnCorpus): """ Original OSIRIS corpus containing variation and gene annotations. 
@@ -2645,7 +2656,6 @@ def parse_dataset(cls, corpus_folder: Path, fix_annotation=True):
             file for file in os.listdir(str(corpus_folder)) if file.endswith(".txt") and not file.startswith("README")
         ]
         for text_file in input_files:
-
             with open(os.path.join(str(corpus_folder), text_file), encoding="utf8") as text_reader:
                 document_text = text_reader.read()
                 if not document_text:
@@ -2770,7 +2780,7 @@ def parse_dataset(data_dir: Path) -> InternalBioNerDataset:
                 start, end = int(fields[2]), int(fields[3])
 
                 if start == end:
-                    continue # Illegal annotation
+                    continue  # Illegal annotation
 
                 entities_per_document[fname].append(Entity((start, end), "Species"))
 
@@ -3449,6 +3459,7 @@ def parse_dataset(data_dir: Path) -> InternalBioNerDataset:
         return InternalBioNerDataset(documents=documents, entities_per_document=entities_per_document)
 
 
+@deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)")
 class BC2GM(ColumnCorpus):
     """
     Original BioCreative-II-GM corpus containing gene annotations.
@@ -3754,6 +3765,7 @@ def to_internal(self, data_dir: Path) -> InternalBioNerDataset:
         return filter_and_map_entities(dataset, entity_type_mapping)
 
 
+@deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)")
 class CHEBI(ColumnCorpus):
     """
     Original CHEBI corpus containing all annotations.
@@ -4018,6 +4030,7 @@ def parse_input_files(input_folder: Path) -> InternalBioNerDataset:
         return InternalBioNerDataset(documents=documents, entities_per_document=entities_per_document)
 
 
+@deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)")
 class BIONLP2013_PC(BioNLPCorpus):
     """
     Corpus of the BioNLP'2013 Pathway Curation shared task
@@ -4060,6 +4073,7 @@ def download_corpus(download_folder: Path) -> Tuple[Path, Path, Path]:
         return train_folder, dev_folder, test_folder
 
 
+@deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)")
 class BIONLP2013_CG(BioNLPCorpus):
     """
     Corpus of the BioNLP'2013 Cancer Genetics shared task
@@ -4088,6 +4102,7 @@ def download_corpus(download_folder: Path) -> Tuple[Path, Path, Path]:
         return train_folder, dev_folder, test_folder
 
 
+@deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)")
 class ANAT_EM(ColumnCorpus):
     """
     Corpus for anatomical named entity mention recognition.
@@ -4967,6 +4982,7 @@ def parse_corpus(input_file: Path) -> InternalBioNerDataset:
         return InternalBioNerDataset(documents=documents, entities_per_document=entities_per_document)
 
 
+@deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)")
 class PDR(ColumnCorpus):
     """
     Corpus of plant-disease relations from Kim et al., consisting of named entity annotations
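All tags added in the commit above use the third-party `deprecated` package, whose import is
added at the top of the diff and re-sorted in the next commit. A minimal, self-contained
sketch of what such a tag does at runtime (the class below is a hypothetical stand-in, not
taken from the patch):

    import warnings

    from deprecated import deprecated

    @deprecated(version="0.13", reason="Please use data set implementation from BigBio instead (see BIGBIO_NER_CORPUS)")
    class LegacyCorpus:  # hypothetical stand-in for the tagged flair corpora
        pass

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        LegacyCorpus()  # instantiating a tagged class emits a DeprecationWarning
        print(caught[0].message)  # message carries the version and reason above

Instantiation still works as before; users merely receive a warning pointing them to
BIGBIO_NER_CORPUS.
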
""" id_to_text = {} - id_to_entities = {} + id_to_entities: Dict[str, List] = {} for document in dataset[split]: document_id = document["document_id"] passage_offsets = [] From a503c5b0a7bc977374e6a9bfb72a327870e5feab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mario=20S=C3=A4nger?= Date: Wed, 12 Apr 2023 16:30:51 +0200 Subject: [PATCH 12/12] Revise BIGBIO_NER_CORPUS initialization: store conll files in separate directories per sentence splitter (configuration) --- flair/datasets/biomedical.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/flair/datasets/biomedical.py b/flair/datasets/biomedical.py index efd1608c51..aa0f0c9335 100644 --- a/flair/datasets/biomedical.py +++ b/flair/datasets/biomedical.py @@ -5177,7 +5177,7 @@ def __init__( dataset_name: str, base_path: Union[str, Path] = None, in_memory: bool = True, - sentence_splitter: SentenceSplitter = None, + sentence_splitter: Optional[SentenceSplitter] = None, train_split_name: Optional[str] = None, dev_split_name: Optional[str] = None, test_split_name: Optional[str] = None, @@ -5212,11 +5212,12 @@ def __init__( full_dataset_name = dataset_name dataset_name = dataset_name.replace("bigbio/", "") + self.sentence_splitter = sentence_splitter if sentence_splitter else SciSpacySentenceSplitter() + dataset_dir_name = self.build_corpus_directory_name(dataset_name) - data_folder = base_path / dataset_dir_name + data_folder = base_path / dataset_dir_name / self.sentence_splitter.name train_file = data_folder / "train.conll" - # test_file = data_folder / "test.conll" # Download data if necessary # Some datasets in BigBio only have train or test splits, not both @@ -5252,10 +5253,7 @@ def __init__( if type_mapping: splits = {split: filter_and_map_entities(dataset, type_mapping) for split, dataset in splits.items()} - if sentence_splitter is None: - sentence_splitter = SciSpacySentenceSplitter() - - conll_writer = CoNLLWriter(sentence_splitter=sentence_splitter) + conll_writer = CoNLLWriter(sentence_splitter=self.sentence_splitter) conll_writer.process_dataset(splits, data_folder) super(BIGBIO_NER_CORPUS, self).__init__(