-
Notifications
You must be signed in to change notification settings - Fork 85
/
Copy pathdw.py
110 lines (88 loc) · 4 KB
/
dw.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import datetime
import re
from typing import List, Optional, Pattern
from lxml.cssselect import CSSSelector
from lxml.etree import XPath
from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute
from fundus.parser.utility import (
apply_substitution_pattern_over_list,
extract_article_body_with_selector,
generic_author_parsing,
generic_date_parsing,
generic_text_extraction_with_css,
generic_topic_parsing,
strip_nodes_to_text,
)
class DWParser(ParserProxy):
    """Parser versions for articles of the publisher handled by this module.

    Every nested class is one parser version. Each declares a ``VALID_UNTIL``
    date and exposes the ``body``, ``authors``, ``publishing_date``, ``title``
    and ``topics`` attributes.
    NOTE(review): how ``ParserProxy`` chooses between versions is not visible
    here; presumably it relies on ``VALID_UNTIL`` -- confirm.
    """

    class V2(BaseParser):
        """Version that reads metadata through ``precomputed.ld`` lookups."""

        VALID_UNTIL = datetime.date(2024, 1, 18)

        # CSS selectors evaluated against ``self.precomputed.doc``.
        _paragraph_selector = CSSSelector("div.rich-text > p")
        _summary_selector = CSSSelector("header > p")
        _subheadline_selector = CSSSelector("div.rich-text > h2")
        _topic_selector = CSSSelector("aside[data-tracking-name=related-topics] > a")

        # Matches the literal publisher name; handed to
        # ``apply_substitution_pattern_over_list`` together with the authors.
        _author_substitution_pattern: Pattern[str] = re.compile(r"Deutsche Welle")

        @attribute
        def body(self) -> ArticleBody:
            """Build the article body from the summary, subheadline and paragraph selectors."""
            return extract_article_body_with_selector(
                self.precomputed.doc,
                paragraph_selector=self._paragraph_selector,
                subheadline_selector=self._subheadline_selector,
                summary_selector=self._summary_selector,
            )

        @attribute
        def authors(self) -> List[str]:
            """Return the parsed ``"author"`` entry after applying the substitution pattern."""
            raw_authors = self.precomputed.ld.bf_search("author")
            parsed_authors = generic_author_parsing(raw_authors)
            return apply_substitution_pattern_over_list(parsed_authors, self._author_substitution_pattern)

        @attribute
        def publishing_date(self) -> Optional[datetime.datetime]:
            """Return the parsed ``"datePublished"`` entry."""
            raw_date = self.precomputed.ld.bf_search("datePublished")
            return generic_date_parsing(raw_date)

        @attribute
        def title(self) -> Optional[str]:
            """Return the ``"headline"`` entry as found, without further processing."""
            headline = self.precomputed.ld.bf_search("headline")
            return headline

        @attribute
        def topics(self) -> List[str]:
            """Return the stripped text content of every node matched by the topic selector."""
            matched_nodes = self._topic_selector(self.precomputed.doc)
            return [element.text_content().strip() for element in matched_nodes]

    class V2_1(V2):
        """Variant of ``V2`` that only changes where topics are read from."""

        # Evaluated once, when the class body is executed.
        VALID_UNTIL = datetime.date.today()

        _topic_selector = CSSSelector("header > div.kicker > span")

        @attribute
        def topics(self) -> List[str]:
            """Return the kicker text split on ``", "``, or an empty list when there is none."""
            kicker_nodes = self._topic_selector(self.precomputed.doc)
            joined_topics = strip_nodes_to_text(kicker_nodes, join_on=", ")
            return [] if joined_topics is None else joined_topics.split(", ")

    class V1(BaseParser):
        """Version that reads authors, date and title from the document markup."""

        VALID_UNTIL = datetime.date(2023, 6, 12)

        # CSS selectors evaluated against ``self.precomputed.doc``.
        _paragraph_selector = CSSSelector("div.longText > p")
        _summary_selector = CSSSelector("p.intro")
        _subheadline_selector = CSSSelector("div.longText > h2")
        _title_selector = CSSSelector(".col3 h1")

        # Whitespace-normalised last text node of the "smallList" entry whose
        # <strong> label contains "Auto".
        _author_selector = XPath(
            "normalize-space("
            '//ul[@class="smallList"]'
            '/li[strong[contains(text(), "Auto")]]'
            "/text()[last()]"
            ")"
        )

        # Whitespace-normalised text of the "smallList" entry whose <strong>
        # label contains "Datum".
        _date_selector = XPath(
            "normalize-space("
            '//ul[@class="smallList"]'
            '/li[strong[contains(text(), "Datum")]]'
            "/text())"
        )

        @attribute
        def body(self) -> ArticleBody:
            """Build the article body from the summary, subheadline and paragraph selectors."""
            return extract_article_body_with_selector(
                self.precomputed.doc,
                paragraph_selector=self._paragraph_selector,
                subheadline_selector=self._subheadline_selector,
                summary_selector=self._summary_selector,
            )

        @attribute
        def authors(self) -> List[str]:
            """Return the authors parsed from the string produced by the author XPath."""
            return generic_author_parsing(self._author_selector(self.precomputed.doc))

        @attribute
        def publishing_date(self) -> Optional[datetime.datetime]:
            """Return the date parsed from the string produced by the date XPath."""
            return generic_date_parsing(self._date_selector(self.precomputed.doc))

        @attribute
        def title(self) -> Optional[str]:
            """Return the text extracted with the title selector."""
            document = self.precomputed.doc
            return generic_text_extraction_with_css(document, self._title_selector)

        @attribute
        def topics(self) -> List[str]:
            """Return the topics parsed from the ``"keywords"`` entry of ``precomputed.meta``."""
            keywords = self.precomputed.meta.get("keywords")
            return generic_topic_parsing(keywords)