-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #411 from uhh-lt/better-keywords
Better keywords
- Loading branch information
Showing 8 changed files with 106 additions and 12 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,3 +6,4 @@ scrapy-playwright==0.0.31 | |
scrapy-selenium==0.0.7 | ||
weaviate-client==3.24.1 | ||
webdriver-manager==4.0.1 | ||
yake==0.4.8 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
65 changes: 65 additions & 0 deletions
65
backend/src/app/preprocessing/pipeline/steps/text/generate_keywords.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
import yake | ||
from loguru import logger | ||
|
||
from app.preprocessing.pipeline.model.pipeline_cargo import PipelineCargo | ||
from app.preprocessing.pipeline.model.text.preprotextdoc import PreProTextDoc | ||
from config import conf | ||
|
||
|
||
def generate_keywords(cargo: PipelineCargo) -> PipelineCargo:
    """Propose keywords with YAKE and keep those matching a POS pattern.

    Reads the ``PreProTextDoc`` stored under ``cargo.data["pptd"]``, runs a
    YAKE ``KeywordExtractor`` over ``pptd.text`` and filters the proposals
    using the part-of-speech tags of the tokens in
    ``pptd.spacy_pipeline_output``. The surviving keywords are written to
    ``pptd.keywords``.

    Filtering rules:
      * one-word proposals are kept when tagged ``NOUN`` or ``PROPN``;
      * two-word proposals are kept when their tag pair is one of the
        accepted noun / adjective / verb combinations below;
      * a two-word proposal whose first word is ``NOUN``/``PROPN`` and whose
        second word is ``ADJ`` contributes only its first word;
      * proposals with any other number of words are dropped.

    Args:
        cargo: Pipeline cargo whose ``data["pptd"]`` entry holds the document.

    Returns:
        The same ``cargo`` object. If the spaCy output is missing, an error is
        logged and ``pptd.keywords`` is left untouched.
    """
    pptd: PreProTextDoc = cargo.data["pptd"]
    out = pptd.spacy_pipeline_output

    if out is None:
        logger.error(
            f"spaCy PipelineOutput is None for {pptd.filename}! Please run the spaCy pipeline first!"
        )
        return cargo

    language = pptd.metadata.get("language", "noLang")
    if isinstance(language, list):
        # Use the first entry; an empty list falls back to the same default
        # as a missing "language" key instead of raising IndexError.
        language = language[0] if language else "noLang"

    kw_extractor = yake.KeywordExtractor(
        lan=language,
        n=conf.keyword_extraction.max_ngram_size,
        dedupLim=conf.keyword_extraction.deduplication_threshold,
        top=conf.keyword_extraction.keyword_proposals,
    )
    keyword_proposals = [kw for kw, _ in kw_extractor.extract_keywords(pptd.text)]

    # Token text -> POS tag. If the same text occurs with several tags, the
    # last occurrence wins.
    tok2pos = {tok.text: tok.pos for tok in out.tokens}

    keep_single = {"NOUN", "PROPN"}
    keep_pairs = {
        ("NOUN", "NOUN"),
        ("PROPN", "PROPN"),
        ("PROPN", "NOUN"),
        ("NOUN", "PROPN"),
        #
        ("ADJ", "NOUN"),
        ("ADJ", "PROPN"),
        #
        ("NOUN", "VERB"),
        ("PROPN", "VERB"),
        ("VERB", "NOUN"),
        ("VERB", "PROPN"),
    }

    keywords = []
    for kp in keyword_proposals:
        ws = kp.split()
        # A proposal word may have no entry in tok2pos (presumably when YAKE
        # splits or normalises text differently from the spaCy tokens).
        # ``.get`` yields None for such words, so the proposal is filtered
        # out instead of crashing the step with a KeyError.
        tags = tuple(tok2pos.get(w) for w in ws)
        if len(ws) == 1:
            if tags[0] in keep_single:
                keywords.append(kp)
        elif len(ws) == 2:
            if tags in keep_pairs:
                keywords.append(kp)
            elif tags[0] in keep_single and tags[1] == "ADJ":
                # "<noun> <adjective>": keep only the noun.
                keywords.append(ws[0])

    pptd.keywords = keywords

    return cargo
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters