diff --git a/backend/.dockerignore b/backend/.dockerignore
index 9c3825cc3..708d5096a 100644
--- a/backend/.dockerignore
+++ b/backend/.dockerignore
@@ -8,6 +8,8 @@
 src/dev_notebooks
 src/data
 src/experiments
+sample_data
+notebooks
 
 # ignore caches
 .pytest_cache
@@ -15,6 +17,8 @@ src/experiments
 # others
 .dockerignore
 .gitignore
+.idea
+.run
 Dockerfile
 Makefile
 README.md
diff --git a/backend/requirements.txt b/backend/requirements.txt
index 5e5c2fba0..5c292412d 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -6,3 +6,4 @@ scrapy-playwright==0.0.31
 scrapy-selenium==0.0.7
 weaviate-client==3.24.1
 webdriver-manager==4.0.1
+yake==0.4.8
diff --git a/backend/src/app/preprocessing/pipeline/__init__.py b/backend/src/app/preprocessing/pipeline/__init__.py
index d932a3b8b..f83368915 100644
--- a/backend/src/app/preprocessing/pipeline/__init__.py
+++ b/backend/src/app/preprocessing/pipeline/__init__.py
@@ -38,14 +38,17 @@ def build_text_pipeline(foo: str = "bar") -> PreprocessingPipeline:
     from app.preprocessing.pipeline.steps.text.extract_text_from_html_and_create_source_mapping import (
         extract_text_from_html_and_create_source_mapping,
     )
+    from app.preprocessing.pipeline.steps.text.generate_keywords import (
+        generate_keywords,
+    )
     from app.preprocessing.pipeline.steps.text.generate_named_entity_annotations import (
         generate_named_entity_annotations,
     )
     from app.preprocessing.pipeline.steps.text.generate_sentence_annotations import (
         generate_sentence_annotations,
     )
-    from app.preprocessing.pipeline.steps.text.generate_word_frequencies_and_keywords import (
-        generate_word_frequncies_and_keywords,
+    from app.preprocessing.pipeline.steps.text.generate_word_frequencies import (
+        generate_word_frequencies,
     )
     from app.preprocessing.pipeline.steps.text.index_text_document_for_simsearch import (
         index_text_document_for_simsearch,
     )
@@ -98,7 +101,12 @@ def build_text_pipeline(foo: str = "bar") -> PreprocessingPipeline:
     )
 
     pipeline.register_step(
-        func=generate_word_frequncies_and_keywords,
+        func=generate_word_frequencies,
+        required_data=["pptd"],
+    )
+
+    pipeline.register_step(
+        func=generate_keywords,
         required_data=["pptd"],
     )
 
@@ -199,11 +207,14 @@ def build_image_pipeline(foo: str = "bar") -> PreprocessingPipeline:
     from app.preprocessing.pipeline.steps.image.write_ppid_to_database import (
         write_ppid_to_database,
     )
+    from app.preprocessing.pipeline.steps.text.generate_keywords import (
+        generate_keywords,
+    )
     from app.preprocessing.pipeline.steps.text.generate_sentence_annotations import (
         generate_sentence_annotations,
     )
-    from app.preprocessing.pipeline.steps.text.generate_word_frequencies_and_keywords import (
-        generate_word_frequncies_and_keywords,
+    from app.preprocessing.pipeline.steps.text.generate_word_frequencies import (
+        generate_word_frequencies,
     )
     from app.preprocessing.pipeline.steps.text.run_spacy_pipeline import (
         run_spacy_pipeline,
     )
@@ -268,7 +279,12 @@ def build_image_pipeline(foo: str = "bar") -> PreprocessingPipeline:
     )
 
     pipeline.register_step(
-        func=generate_word_frequncies_and_keywords,
+        func=generate_word_frequencies,
+        required_data=["pptd"],
+    )
+
+    pipeline.register_step(
+        func=generate_keywords,
         required_data=["pptd"],
     )
 
diff --git a/backend/src/app/preprocessing/pipeline/steps/text/generate_keywords.py b/backend/src/app/preprocessing/pipeline/steps/text/generate_keywords.py
new file mode 100644
index 000000000..9123c5b43
--- /dev/null
+++ b/backend/src/app/preprocessing/pipeline/steps/text/generate_keywords.py
@@ -0,0 +1,72 @@
+import yake
+from loguru import logger
+
+from app.preprocessing.pipeline.model.pipeline_cargo import PipelineCargo
+from app.preprocessing.pipeline.model.text.preprotextdoc import PreProTextDoc
+from config import conf
+
+
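+# Extract keyword proposals with YAKE, then keep only those proposals
+# whose spaCy part-of-speech tags form a plausible keyword pattern
+# (nouns, proper nouns, and selected two-token combinations).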
+def generate_keywords(cargo: PipelineCargo) -> PipelineCargo:
+    pptd: PreProTextDoc = cargo.data["pptd"]
+    out = pptd.spacy_pipeline_output
+
+    if out is None:
+        logger.error(
+            f"spaCy PipelineOutput is None for {pptd.filename}! Please run the spaCy pipeline first!"
+        )
+        return cargo
+
+    language = pptd.metadata.get("language", "noLang")
+    if isinstance(language, list):
+        language = language[0]
+
+    kw_extractor = yake.KeywordExtractor(
+        lan=language,
+        n=conf.keyword_extraction.max_ngram_size,
+        dedupLim=conf.keyword_extraction.deduplication_threshold,
+        top=conf.keyword_extraction.keyword_proposals,
+    )
+    keyword_proposals = kw_extractor.extract_keywords(pptd.text)
+    keyword_proposals = [kw for kw, _ in keyword_proposals]
+
+    tok2pos = {tok.text: tok.pos for tok in out.tokens}
+
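+    # Single-token proposals must be tagged NOUN or PROPN; two-token
+    # proposals must match one of the POS patterns listed below.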
+    keep = [
+        "NOUN",
+        "PROPN",
+        # noun + noun compounds
+        ["NOUN", "NOUN"],
+        ["PROPN", "PROPN"],
+        ["PROPN", "NOUN"],
+        ["NOUN", "PROPN"],
+        # adjective + noun
+        ["ADJ", "NOUN"],
+        ["ADJ", "PROPN"],
+        # noun + verb
+        ["NOUN", "VERB"],
+        ["PROPN", "VERB"],
+        ["VERB", "NOUN"],
+        ["VERB", "PROPN"],
+    ]
+    keywords = []
+    for kp in keyword_proposals:
+        ws = kp.split()
+        # Look tags up with .get(): YAKE splits on whitespace, so a token
+        # may be missing from the spaCy-based tok2pos mapping.
+        if len(ws) == 1:
+            if tok2pos.get(ws[0]) in keep:
+                keywords.append(kp)
+        elif len(ws) == 2:
+            if [tok2pos.get(w) for w in ws] in keep:
+                keywords.append(kp)
+            elif tok2pos.get(ws[0]) in keep and tok2pos.get(ws[1]) == "ADJ":
+                keywords.append(ws[0])
+
+    pptd.keywords = keywords
+
+    return cargo
diff --git a/backend/src/app/preprocessing/pipeline/steps/text/generate_word_frequencies_and_keywords.py b/backend/src/app/preprocessing/pipeline/steps/text/generate_word_frequencies.py
similarity index 87%
rename from backend/src/app/preprocessing/pipeline/steps/text/generate_word_frequencies_and_keywords.py
rename to backend/src/app/preprocessing/pipeline/steps/text/generate_word_frequencies.py
index 26fba3772..72f07ad22 100644
--- a/backend/src/app/preprocessing/pipeline/steps/text/generate_word_frequencies_and_keywords.py
+++ b/backend/src/app/preprocessing/pipeline/steps/text/generate_word_frequencies.py
@@ -6,7 +6,7 @@
 from app.preprocessing.pipeline.model.text.preprotextdoc import PreProTextDoc
 
 
-def generate_word_frequncies_and_keywords(cargo: PipelineCargo) -> PipelineCargo:
+def generate_word_frequencies(cargo: PipelineCargo) -> PipelineCargo:
     pptd: PreProTextDoc = cargo.data["pptd"]
     out = pptd.spacy_pipeline_output
 
@@ -34,7 +34,5 @@ def generate_word_frequncies_and_keywords(cargo: PipelineCargo) -> PipelineCargo:
         k: v
         for (k, v) in sorted(pptd.word_freqs.items(), key=lambda i: i[1], reverse=True)
     }
-    # use top-5 as keywords
-    pptd.keywords = list(pptd.word_freqs.keys())[:5]
 
     return cargo
diff --git a/backend/src/configs/default.yaml b/backend/src/configs/default.yaml
index 2514e1d42..7b4256dab 100644
--- a/backend/src/configs/default.yaml
+++ b/backend/src/configs/default.yaml
@@ -513,3 +513,8 @@ system_codes:
     desc: Object Category from COCO 2017
   TRUCK:
     desc: Object Category from COCO 2017
+
+keyword_extraction:
+  max_ngram_size: 2
+  deduplication_threshold: 0.5
+  keyword_proposals: 20
diff --git a/backend/src/configs/default_localhost_dev.yaml b/backend/src/configs/default_localhost_dev.yaml
index 6acb3de47..a0064cb4c 100644
--- a/backend/src/configs/default_localhost_dev.yaml
+++ b/backend/src/configs/default_localhost_dev.yaml
@@ -513,3 +513,8 @@ system_codes:
     desc: Object Category from COCO 2017
   TRUCK:
     desc: Object Category from COCO 2017
+
+keyword_extraction:
+  max_ngram_size: 2
+  deduplication_threshold: 0.5
+  keyword_proposals: 20
diff --git a/docker/.env.example b/docker/.env.example
index fce6aaa3b..b465409d3 100644
--- a/docker/.env.example
+++ b/docker/.env.example
@@ -17,11 +17,11 @@ GID=126
 SHARED_REPO_ROOT=/tmp/dats
 
 # Docker tag to use for pulling the backend and celery containers
-DATS_BACKEND_DOCKER_VERSION=1.0.4
+DATS_BACKEND_DOCKER_VERSION=1.0.5
 # Docker tag to use for pulling the ray container
-DATS_RAY_DOCKER_VERSION=1.0.4
+DATS_RAY_DOCKER_VERSION=1.0.5
 # Docker tag to use for pulling the frontend
-DATS_FRONTEND_DOCKER_VERSION=1.0.4
+DATS_FRONTEND_DOCKER_VERSION=1.0.5
 
 # Which backend config file to use when running backend or celery
 DATS_BACKEND_CONFIG=configs/default.yaml
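
As a quick way to sanity-check the new step outside the pipeline, here is a
minimal sketch against the yake==0.4.8 API, using the keyword_extraction
values added above; the sample text and the "en" language code are
placeholders, since the real step reads both from the PreProTextDoc:

    import yake

    # Mirrors the keyword_extraction section of backend/src/configs/default.yaml.
    kw_extractor = yake.KeywordExtractor(
        lan="en",      # generate_keywords derives this from pptd.metadata
        n=2,           # max_ngram_size
        dedupLim=0.5,  # deduplication_threshold
        top=20,        # keyword_proposals
    )

    text = "Keyword extraction condenses long documents into a few descriptive terms."
    proposals = [kw for kw, _ in kw_extractor.extract_keywords(text)]
    print(proposals)  # generate_keywords then POS-filters these proposals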