diff --git a/src/fundus/parser/utility.py b/src/fundus/parser/utility.py
index 805969bc8..7a9c1a335 100644
--- a/src/fundus/parser/utility.py
+++ b/src/fundus/parser/utility.py
@@ -132,10 +132,22 @@ def get_meta_content(tree: lxml.html.HtmlElement) -> Dict[str, str]:
     return meta
 
 
-def strip_nodes_to_text(text_nodes: List[lxml.html.HtmlElement]) -> Optional[str]:
+def strip_nodes_to_text(text_nodes: List[lxml.html.HtmlElement], join_on: str = "\n\n") -> Optional[str]:
+    """Concatenate the text content of <text_nodes> into a single string.
+
+    Runs of newlines inside a node's text are replaced by a single space, the
+    per-node texts are joined with <join_on> and the joined result is stripped.
+
+    Args:
+        text_nodes: Nodes whose text content should be extracted.
+        join_on: Separator placed between the texts of consecutive nodes.
+
+    Returns:
+        The joined text, or None if <text_nodes> is empty.
+    """
     if not text_nodes:
         return None
-    return "\n\n".join(([re.sub(r"\n+", " ", node.text_content()) for node in text_nodes])).strip()
+    return join_on.join([re.sub(r"\n+", " ", node.text_content()) for node in text_nodes]).strip()
 
 
 def apply_substitution_pattern_over_list(
diff --git a/src/fundus/publishers/de/dw.py b/src/fundus/publishers/de/dw.py
index 111620b7c..061b75a12 100644
--- a/src/fundus/publishers/de/dw.py
+++ b/src/fundus/publishers/de/dw.py
@@ -13,12 +13,19 @@
     generic_date_parsing,
     generic_text_extraction_with_css,
     generic_topic_parsing,
+    strip_nodes_to_text,
 )
 
 
 class DWParser(ParserProxy):
     class V2(BaseParser):
-        _paragraph_selector = CSSSelector("div.rich-text > p")
+        VALID_UNTIL = datetime.date(2024, 1, 18)
+        # adapted from https://regex101.com/r/Xsadk5/1; [A-Za-z] because [A-z] also matches [ \ ] ^ _ and backtick
+        _author_regex = r"^([A-Za-z]{2,3}\/)*([A-Za-z]{2,3})\s\([A-Za-z\s,\d]*\)$"
+        _paragraph_selector = XPath(
+            f"//div[contains(@class, 'rich-text')] /p[text() and not(re:test(text(), '{_author_regex}'))]",
+            namespaces={"re": "http://exslt.org/regular-expressions"},
+        )
         _summary_selector = CSSSelector("header > p")
         _subheadline_selector = CSSSelector("div.rich-text > h2")
 
@@ -53,6 +60,20 @@ def title(self) -> Optional[str]:
         def topics(self) -> List[str]:
             return [node.text_content().strip() for node in self._topic_selector(self.precomputed.doc)]
 
+    class V2_1(V2):
+        VALID_UNTIL = datetime.date.today()
+
+        _topic_selector = CSSSelector("header > div.kicker > span")
+
+        @attribute
+        def topics(self) -> List[str]:
+            # Join all kicker spans with ", " and split again, so that a span which itself holds several
+            # ", "-separated topics is flattened too (presumably the case on the new layout -- TODO confirm).
+            topic_nodes = self._topic_selector(self.precomputed.doc)
+            if (topic_string := strip_nodes_to_text(topic_nodes, join_on=", ")) is not None:
+                return topic_string.split(", ")
+            return []
+
     class V1(BaseParser):
         VALID_UNTIL = datetime.date(2023, 6, 12)
 
diff --git a/tests/resources/parser/test_data/de/DW.json b/tests/resources/parser/test_data/de/DW.json
index abbed534e..2e121628d 100644
--- a/tests/resources/parser/test_data/de/DW.json
+++ b/tests/resources/parser/test_data/de/DW.json
@@ -25,5 +25,16 @@
       "Long COVID",
       "Coronavirus"
     ]
+  },
+  "V2_1": {
+    "authors": [
+      "Jennifer Pahlke"
+    ],
+    "publishing_date": "2024-01-30 14:13:12.269000+00:00",
+    "title": "Russland-Wahl: Nadeschdin setzt auf ein Ende der Putin-Ära",
+    "topics": [
+      "Politik",
+      "Russische Föderation"
+    ]
   }
 }
diff --git a/tests/resources/parser/test_data/de/DW_2024_01_30.html.gz b/tests/resources/parser/test_data/de/DW_2024_01_30.html.gz
new file mode 100644
index 000000000..0d7653e83
Binary files /dev/null and b/tests/resources/parser/test_data/de/DW_2024_01_30.html.gz differ
diff --git a/tests/resources/parser/test_data/de/meta.info b/tests/resources/parser/test_data/de/meta.info
index b76dd9c3b..9009e9af5 100644
--- a/tests/resources/parser/test_data/de/meta.info
+++ b/tests/resources/parser/test_data/de/meta.info
@@ -15,6 +15,10 @@
     "url": "https://www.dw.com/de/post-vac-syndrom-nach-covid-19-impfung-was-wissen-wir/a-65897191?maca=de-rss-de-all-1119-xml-mrss",
     "crawl_date": "2023-06-13 16:57:08.558047"
   },
+  "DW_2024_01_30.html.gz": {
+    "url": "https://www.dw.com/de/russland-wahl-nadeschdin-setzt-auf-ein-ende-der-putin-%C3%A4ra/a-68117411?maca=de-rss-de-all-1119-xml-mrss",
+    "crawl_date": "2024-01-30 16:12:24.867249"
+  },
   "DieWelt_2023_04_28.html.gz": {
     "url": "https://www.welt.de/wirtschaft/article245055596/BIP-Diese-Grafiken-zeigen-wie-schlecht-es-um-Deutschlands-Wirtschaft-steht.html",
     "crawl_date": "2023-04-28 20:22:32.033988"