Merge pull request #411 from uhh-lt/better-keywords

Better keywords

floschne authored Aug 23, 2024
2 parents f19c7ad + 0b07bd0 commit db584ef
Showing 8 changed files with 106 additions and 12 deletions.
4 changes: 4 additions & 0 deletions backend/.dockerignore
@@ -8,13 +8,17 @@
src/dev_notebooks
src/data
src/experiments
sample_data
notebooks

# ignore caches
.pytest_cache

# others
.dockerignore
.gitignore
.idea
.run
Dockerfile
Makefile
README.md
1 change: 1 addition & 0 deletions backend/requirements.txt
@@ -6,3 +6,4 @@ scrapy-playwright==0.0.31
scrapy-selenium==0.0.7
weaviate-client==3.24.1
webdriver-manager==4.0.1
yake==0.4.8
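
The new dependency powers the keyword-extraction step added in this PR. A minimal sketch of the yake API as pinned here (text and parameter values are illustrative; they mirror the keyword_extraction config added below):

import yake

# extract_keywords returns (keyword, score) pairs; lower scores are
# more relevant. lan/n/dedupLim/top correspond to the language,
# max_ngram_size, deduplication_threshold, and keyword_proposals
# settings used by the new pipeline step.
extractor = yake.KeywordExtractor(lan="en", n=2, dedupLim=0.5, top=20)
pairs = extractor.extract_keywords(
    "Keyword extraction condenses long documents into a short list of salient terms."
)
keywords = [kw for kw, _score in pairs]
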
28 changes: 22 additions & 6 deletions backend/src/app/preprocessing/pipeline/__init__.py
@@ -38,14 +38,17 @@ def build_text_pipeline(foo: str = "bar") -> PreprocessingPipeline:
from app.preprocessing.pipeline.steps.text.extract_text_from_html_and_create_source_mapping import (
extract_text_from_html_and_create_source_mapping,
)
from app.preprocessing.pipeline.steps.text.generate_keywords import (
generate_keywords,
)
from app.preprocessing.pipeline.steps.text.generate_named_entity_annotations import (
generate_named_entity_annotations,
)
from app.preprocessing.pipeline.steps.text.generate_sentence_annotations import (
generate_sentence_annotations,
)
from app.preprocessing.pipeline.steps.text.generate_word_frequencies_and_keywords import (
generate_word_frequncies_and_keywords,
from app.preprocessing.pipeline.steps.text.generate_word_frequencies import (
generate_word_frequncies,
)
from app.preprocessing.pipeline.steps.text.index_text_document_for_simsearch import (
index_text_document_for_simsearch,
@@ -98,7 +101,12 @@ def build_text_pipeline(foo: str = "bar") -> PreprocessingPipeline:
)

pipeline.register_step(
func=generate_word_frequncies_and_keywords,
func=generate_word_frequncies,
required_data=["pptd"],
)

pipeline.register_step(
func=generate_keywords,
required_data=["pptd"],
)
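
Both pipelines now register word-frequency computation and keyword extraction as separate steps. A minimal sketch of the step contract they share, inferred from the signatures in this diff (the body is illustrative):

from app.preprocessing.pipeline.model.pipeline_cargo import PipelineCargo

def my_step(cargo: PipelineCargo) -> PipelineCargo:
    # required_data=["pptd"] guarantees this key is present
    pptd = cargo.data["pptd"]
    # ... mutate pptd in place ...
    return cargo
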

@@ -199,11 +207,14 @@ def build_image_pipeline(foo: str = "bar") -> PreprocessingPipeline:
from app.preprocessing.pipeline.steps.image.write_ppid_to_database import (
write_ppid_to_database,
)
from app.preprocessing.pipeline.steps.text.generate_keywords import (
generate_keywords,
)
from app.preprocessing.pipeline.steps.text.generate_sentence_annotations import (
generate_sentence_annotations,
)
from app.preprocessing.pipeline.steps.text.generate_word_frequencies_and_keywords import (
generate_word_frequncies_and_keywords,
from app.preprocessing.pipeline.steps.text.generate_word_frequencies import (
generate_word_frequncies,
)
from app.preprocessing.pipeline.steps.text.run_spacy_pipeline import (
run_spacy_pipeline,
@@ -268,7 +279,12 @@ def build_image_pipeline(foo: str = "bar") -> PreprocessingPipeline:
)

pipeline.register_step(
func=generate_word_frequncies_and_keywords,
func=generate_word_frequncies,
required_data=["pptd"],
)

pipeline.register_step(
func=generate_keywords,
required_data=["pptd"],
)

65 changes: 65 additions & 0 deletions backend/src/app/preprocessing/pipeline/steps/text/generate_keywords.py
@@ -0,0 +1,65 @@
import yake
from loguru import logger

from app.preprocessing.pipeline.model.pipeline_cargo import PipelineCargo
from app.preprocessing.pipeline.model.text.preprotextdoc import PreProTextDoc
from config import conf


def generate_keywords(cargo: PipelineCargo) -> PipelineCargo:
pptd: PreProTextDoc = cargo.data["pptd"]
out = pptd.spacy_pipeline_output

if out is None:
logger.error(
f"spaCy PipelineOutput is None for {pptd.filename}! Please run the spaCy pipeline first!"
)
return cargo

language = pptd.metadata.get("language", "noLang")
if isinstance(language, list):
language = language[0]

kw_extractor = yake.KeywordExtractor(
lan=language,
n=conf.keyword_extraction.max_ngram_size,
dedupLim=conf.keyword_extraction.deduplication_threshold,
top=conf.keyword_extraction.keyword_proposals,
)
keyword_proposals = kw_extractor.extract_keywords(pptd.text)
keyword_proposals = [kw for kw, _ in keyword_proposals]

tok2pos = {tok.text: tok.pos for tok in out.tokens}

keep = [
"NOUN",
"PROPN",
#
["NOUN", "NOUN"],
["PROPN", "PROPN"],
["PROPN", "NOUN"],
["NOUN", "PROPN"],
#
["ADJ", "NOUN"],
["ADJ", "PROPN"],
#
["NOUN", "VERB"],
["PROPN", "VERB"],
["VERB", "NOUN"],
["VERB", "PROPN"],
]
keywords = []
for kp in keyword_proposals:
ws = kp.split()
if len(ws) == 1:
if tok2pos[ws[0]] in keep:
keywords.append(kp)
elif len(ws) == 2:
if [tok2pos[w] for w in ws] in keep:
keywords.append(kp)
elif tok2pos[ws[0]] in keep and tok2pos[ws[1]] == "ADJ":
keywords.append(ws[0])

pptd.keywords = keywords

return cargo
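
A self-contained sketch of the POS-pattern filter above, on hypothetical tokens. Unlike the step, it falls back to "X" for tokens missing from tok2pos; the step indexes tok2pos directly and assumes every proposal token appeared in the spaCy output:

# Toy POS table and the same keep-patterns as in generate_keywords.
tok2pos = {"climate": "NOUN", "Hamburg": "PROPN", "warming": "VERB",
           "rapid": "ADJ", "very": "ADV"}
keep = ["NOUN", "PROPN",
        ["NOUN", "NOUN"], ["PROPN", "PROPN"], ["PROPN", "NOUN"],
        ["NOUN", "PROPN"], ["ADJ", "NOUN"], ["ADJ", "PROPN"],
        ["NOUN", "VERB"], ["PROPN", "VERB"], ["VERB", "NOUN"],
        ["VERB", "PROPN"]]

def filter_proposals(proposals):
    keywords = []
    for kp in proposals:
        ws = kp.split()
        pos = [tok2pos.get(w, "X") for w in ws]  # .get avoids a KeyError
        if len(ws) == 1 and pos[0] in keep:
            keywords.append(kp)
        elif len(ws) == 2:
            if pos in keep:  # list membership matches the two-word patterns
                keywords.append(kp)
            elif pos[0] in keep and pos[1] == "ADJ":
                keywords.append(ws[0])  # keep only the head token
    return keywords

print(filter_proposals(["climate", "Hamburg warming", "very rapid"]))
# -> ['climate', 'Hamburg warming']
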
4 changes: 1 addition & 3 deletions backend/src/app/preprocessing/pipeline/steps/text/{generate_word_frequencies_and_keywords.py → generate_word_frequencies.py}
@@ -6,7 +6,7 @@
from app.preprocessing.pipeline.model.text.preprotextdoc import PreProTextDoc


def generate_word_frequncies_and_keywords(cargo: PipelineCargo) -> PipelineCargo:
def generate_word_frequncies(cargo: PipelineCargo) -> PipelineCargo:
pptd: PreProTextDoc = cargo.data["pptd"]
out = pptd.spacy_pipeline_output

@@ -34,7 +34,5 @@ def generate_word_frequncies_and_keywords(cargo: PipelineCargo) -> PipelineCargo
k: v
for (k, v) in sorted(pptd.word_freqs.items(), key=lambda i: i[1], reverse=True)
}
# use top-5 as keywords
pptd.keywords = list(pptd.word_freqs.keys())[:5]

return cargo
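
The step now only computes and ranks word frequencies; keyword selection moved to generate_keywords. The sorting idiom it keeps, shown on toy data:

# Rank a frequency dict by count, descending (toy data).
word_freqs = {"data": 3, "science": 5, "the": 9}
ranked = {k: v for k, v in sorted(word_freqs.items(),
                                  key=lambda i: i[1], reverse=True)}
# -> {'the': 9, 'science': 5, 'data': 3}
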
5 changes: 5 additions & 0 deletions backend/src/configs/default.yaml
@@ -513,3 +513,8 @@ system_codes:
desc: Object Category from COCO 2017
TRUCK:
desc: Object Category from COCO 2017

keyword_extraction:
max_ngram_size: 2
deduplication_threshold: 0.5
keyword_proposals: 20
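
The step reads these values through `from config import conf` with attribute access. A minimal sketch, assuming the YAML is loaded with OmegaConf (an assumption; the diff only shows dot-style access on conf):

from omegaconf import OmegaConf

conf = OmegaConf.load("backend/src/configs/default.yaml")
print(conf.keyword_extraction.max_ngram_size)           # 2
print(conf.keyword_extraction.deduplication_threshold)  # 0.5
print(conf.keyword_extraction.keyword_proposals)        # 20
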
5 changes: 5 additions & 0 deletions backend/src/configs/default_localhost_dev.yaml
@@ -513,3 +513,8 @@ system_codes:
desc: Object Category from COCO 2017
TRUCK:
desc: Object Category from COCO 2017

keyword_extraction:
max_ngram_size: 2
deduplication_threshold: 0.5
keyword_proposals: 20
6 changes: 3 additions & 3 deletions docker/.env.example
@@ -17,11 +17,11 @@ GID=126
SHARED_REPO_ROOT=/tmp/dats

# Docker tag to use for pulling the backend and celery containers
DATS_BACKEND_DOCKER_VERSION=1.0.4
DATS_BACKEND_DOCKER_VERSION=1.0.5
# Docker tag to use for pulling the ray container
DATS_RAY_DOCKER_VERSION=1.0.4
DATS_RAY_DOCKER_VERSION=1.0.5
# Docker tag to use for pulling the frontend
DATS_FRONTEND_DOCKER_VERSION=1.0.4
DATS_FRONTEND_DOCKER_VERSION=1.0.5

# Which backend config file to use when running backend or celery
DATS_BACKEND_CONFIG=configs/default.yaml