Merge pull request #411 from uhh-lt/better-keywords

Better keywords

floschne authored Aug 23, 2024
2 parents f19c7ad + 0b07bd0 commit db584ef
Showing 8 changed files with 106 additions and 12 deletions.
4 changes: 4 additions & 0 deletions backend/.dockerignore
@@ -8,13 +8,17 @@
src/dev_notebooks
src/data
src/experiments
sample_data
notebooks

# ignore caches
.pytest_cache

# others
.dockerignore
.gitignore
.idea
.run
Dockerfile
Makefile
README.md
1 change: 1 addition & 0 deletions backend/requirements.txt
@@ -6,3 +6,4 @@ scrapy-playwright==0.0.31
scrapy-selenium==0.0.7
weaviate-client==3.24.1
webdriver-manager==4.0.1
yake==0.4.8
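
The new dependency powers the keyword-extraction step added in this PR. A minimal sketch of the yake API as pinned here (text and parameter values are illustrative; they mirror the keyword_extraction config added below):

import yake

# extract_keywords returns (keyword, score) pairs; lower scores are
# more relevant. lan/n/dedupLim/top correspond to the language,
# max_ngram_size, deduplication_threshold, and keyword_proposals
# settings used by the new pipeline step.
extractor = yake.KeywordExtractor(lan="en", n=2, dedupLim=0.5, top=20)
pairs = extractor.extract_keywords(
    "Keyword extraction condenses long documents into a short list of salient terms."
)
keywords = [kw for kw, _score in pairs]
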
28 changes: 22 additions & 6 deletions backend/src/app/preprocessing/pipeline/__init__.py
@@ -38,14 +38,17 @@ def build_text_pipeline(foo: str = "bar") -> PreprocessingPipeline:
from app.preprocessing.pipeline.steps.text.extract_text_from_html_and_create_source_mapping import (
extract_text_from_html_and_create_source_mapping,
)
from app.preprocessing.pipeline.steps.text.generate_keywords import (
generate_keywords,
)
from app.preprocessing.pipeline.steps.text.generate_named_entity_annotations import (
generate_named_entity_annotations,
)
from app.preprocessing.pipeline.steps.text.generate_sentence_annotations import (
generate_sentence_annotations,
)
from app.preprocessing.pipeline.steps.text.generate_word_frequencies_and_keywords import (
generate_word_frequncies_and_keywords,
from app.preprocessing.pipeline.steps.text.generate_word_frequencies import (
generate_word_frequncies,
)
from app.preprocessing.pipeline.steps.text.index_text_document_for_simsearch import (
index_text_document_for_simsearch,
@@ -98,7 +101,12 @@ def build_text_pipeline(foo: str = "bar") -> PreprocessingPipeline:
)

pipeline.register_step(
func=generate_word_frequncies_and_keywords,
func=generate_word_frequncies,
required_data=["pptd"],
)

pipeline.register_step(
func=generate_keywords,
required_data=["pptd"],
)
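
Both pipelines now register word-frequency computation and keyword extraction as separate steps. A minimal sketch of the step contract they share, inferred from the signatures in this diff (the body is illustrative):

from app.preprocessing.pipeline.model.pipeline_cargo import PipelineCargo

def my_step(cargo: PipelineCargo) -> PipelineCargo:
    # required_data=["pptd"] guarantees this key is present
    pptd = cargo.data["pptd"]
    # ... mutate pptd in place ...
    return cargo
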

@@ -199,11 +207,14 @@ def build_image_pipeline(foo: str = "bar") -> PreprocessingPipeline:
from app.preprocessing.pipeline.steps.image.write_ppid_to_database import (
write_ppid_to_database,
)
from app.preprocessing.pipeline.steps.text.generate_keywords import (
generate_keywords,
)
from app.preprocessing.pipeline.steps.text.generate_sentence_annotations import (
generate_sentence_annotations,
)
from app.preprocessing.pipeline.steps.text.generate_word_frequencies_and_keywords import (
generate_word_frequncies_and_keywords,
from app.preprocessing.pipeline.steps.text.generate_word_frequencies import (
generate_word_frequncies,
)
from app.preprocessing.pipeline.steps.text.run_spacy_pipeline import (
run_spacy_pipeline,
@@ -268,7 +279,12 @@ def build_image_pipeline(foo: str = "bar") -> PreprocessingPipeline:
)

pipeline.register_step(
func=generate_word_frequncies_and_keywords,
func=generate_word_frequncies,
required_data=["pptd"],
)

pipeline.register_step(
func=generate_keywords,
required_data=["pptd"],
)

65 changes: 65 additions & 0 deletions backend/src/app/preprocessing/pipeline/steps/text/generate_keywords.py
@@ -0,0 +1,65 @@
import yake
from loguru import logger

from app.preprocessing.pipeline.model.pipeline_cargo import PipelineCargo
from app.preprocessing.pipeline.model.text.preprotextdoc import PreProTextDoc
from config import conf


def generate_keywords(cargo: PipelineCargo) -> PipelineCargo:
pptd: PreProTextDoc = cargo.data["pptd"]
out = pptd.spacy_pipeline_output

if out is None:
logger.error(
f"spaCy PipelineOutput is None for {pptd.filename}! Please run the spaCy pipeline first!"
)
return cargo

language = pptd.metadata.get("language", "noLang")
if isinstance(language, list):
language = language[0]

kw_extractor = yake.KeywordExtractor(
lan=language,
n=conf.keyword_extraction.max_ngram_size,
dedupLim=conf.keyword_extraction.deduplication_threshold,
top=conf.keyword_extraction.keyword_proposals,
)
keyword_proposals = kw_extractor.extract_keywords(pptd.text)
keyword_proposals = [kw for kw, _ in keyword_proposals]

tok2pos = {tok.text: tok.pos for tok in out.tokens}

keep = [
"NOUN",
"PROPN",
#
["NOUN", "NOUN"],
["PROPN", "PROPN"],
["PROPN", "NOUN"],
["NOUN", "PROPN"],
#
["ADJ", "NOUN"],
["ADJ", "PROPN"],
#
["NOUN", "VERB"],
["PROPN", "VERB"],
["VERB", "NOUN"],
["VERB", "PROPN"],
]
keywords = []
for kp in keyword_proposals:
ws = kp.split()
if len(ws) == 1:
if tok2pos[ws[0]] in keep:
keywords.append(kp)
elif len(ws) == 2:
if [tok2pos[w] for w in ws] in keep:
keywords.append(kp)
elif tok2pos[ws[0]] in keep and tok2pos[ws[1]] == "ADJ":
keywords.append(ws[0])

pptd.keywords = keywords

return cargo
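
A self-contained sketch of the POS-pattern filter above, on hypothetical tokens. Unlike the step, it falls back to "X" for tokens missing from tok2pos; the step indexes tok2pos directly and assumes every proposal token appeared in the spaCy output:

# Toy POS table and the same keep-patterns as in generate_keywords.
tok2pos = {"climate": "NOUN", "Hamburg": "PROPN", "warming": "VERB",
           "rapid": "ADJ", "very": "ADV"}
keep = ["NOUN", "PROPN",
        ["NOUN", "NOUN"], ["PROPN", "PROPN"], ["PROPN", "NOUN"],
        ["NOUN", "PROPN"], ["ADJ", "NOUN"], ["ADJ", "PROPN"],
        ["NOUN", "VERB"], ["PROPN", "VERB"], ["VERB", "NOUN"],
        ["VERB", "PROPN"]]

def filter_proposals(proposals):
    keywords = []
    for kp in proposals:
        ws = kp.split()
        pos = [tok2pos.get(w, "X") for w in ws]  # .get avoids a KeyError
        if len(ws) == 1 and pos[0] in keep:
            keywords.append(kp)
        elif len(ws) == 2:
            if pos in keep:  # list membership matches the two-word patterns
                keywords.append(kp)
            elif pos[0] in keep and pos[1] == "ADJ":
                keywords.append(ws[0])  # keep only the head token
    return keywords

print(filter_proposals(["climate", "Hamburg warming", "very rapid"]))
# -> ['climate', 'Hamburg warming']
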
4 changes: 1 addition & 3 deletions backend/src/app/preprocessing/pipeline/steps/text/{generate_word_frequencies_and_keywords.py → generate_word_frequencies.py}
@@ -6,7 +6,7 @@
from app.preprocessing.pipeline.model.text.preprotextdoc import PreProTextDoc


def generate_word_frequncies_and_keywords(cargo: PipelineCargo) -> PipelineCargo:
def generate_word_frequncies(cargo: PipelineCargo) -> PipelineCargo:
pptd: PreProTextDoc = cargo.data["pptd"]
out = pptd.spacy_pipeline_output

@@ -34,7 +34,5 @@ def generate_word_frequncies_and_keywords(cargo: PipelineCargo) -> PipelineCargo
k: v
for (k, v) in sorted(pptd.word_freqs.items(), key=lambda i: i[1], reverse=True)
}
# use top-5 as keywords
pptd.keywords = list(pptd.word_freqs.keys())[:5]

return cargo
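
The step now only computes and ranks word frequencies; keyword selection moved to generate_keywords. The sorting idiom it keeps, shown on toy data:

# Rank a frequency dict by count, descending (toy data).
word_freqs = {"data": 3, "science": 5, "the": 9}
ranked = {k: v for k, v in sorted(word_freqs.items(),
                                  key=lambda i: i[1], reverse=True)}
# -> {'the': 9, 'science': 5, 'data': 3}
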
5 changes: 5 additions & 0 deletions backend/src/configs/default.yaml
@@ -513,3 +513,8 @@ system_codes:
desc: Object Category from COCO 2017
TRUCK:
desc: Object Category from COCO 2017

keyword_extraction:
max_ngram_size: 2
deduplication_threshold: 0.5
keyword_proposals: 20
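
The step reads these values through `from config import conf` with attribute access. A minimal sketch, assuming the YAML is loaded with OmegaConf (an assumption; the diff only shows dot-style access on conf):

from omegaconf import OmegaConf

conf = OmegaConf.load("backend/src/configs/default.yaml")
print(conf.keyword_extraction.max_ngram_size)           # 2
print(conf.keyword_extraction.deduplication_threshold)  # 0.5
print(conf.keyword_extraction.keyword_proposals)        # 20
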
5 changes: 5 additions & 0 deletions backend/src/configs/default_localhost_dev.yaml
@@ -513,3 +513,8 @@ system_codes:
desc: Object Category from COCO 2017
TRUCK:
desc: Object Category from COCO 2017

keyword_extraction:
max_ngram_size: 2
deduplication_threshold: 0.5
keyword_proposals: 20
6 changes: 3 additions & 3 deletions docker/.env.example
@@ -17,11 +17,11 @@ GID=126
SHARED_REPO_ROOT=/tmp/dats

# Docker tag to use for pulling the backend and celery containers
DATS_BACKEND_DOCKER_VERSION=1.0.4
DATS_BACKEND_DOCKER_VERSION=1.0.5
# Docker tag to use for pulling the ray container
DATS_RAY_DOCKER_VERSION=1.0.4
DATS_RAY_DOCKER_VERSION=1.0.5
# Docker tag to use for pulling the frontend
DATS_FRONTEND_DOCKER_VERSION=1.0.4
DATS_FRONTEND_DOCKER_VERSION=1.0.5

# Which backend config file to use when running backend or celery
DATS_BACKEND_CONFIG=configs/default.yaml