Skip to content

Commit bcc79b8

Browse files
committed
filter topics
1 parent 57676a3 commit bcc79b8

File tree

2 files changed

+7
-2
lines changed

2 files changed

+7
-2
lines changed

src/fundus/publishers/jp/mainichi_shimbun.py

+7 -1
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@
77

88
from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute
99
from fundus.parser.utility import (
10+
apply_substitution_pattern_over_list,
1011
extract_article_body_with_selector,
1112
generic_author_parsing,
1213
generic_date_parsing,
@@ -20,6 +21,8 @@ class MainichiShimbunParser(ParserProxy):
2021
class V1(BaseParser):
2122
_paragraph_selector = CSSSelector("#articledetail-body > p")
2223

24+
_topic_bloat_pattern = re.compile("速報")
25+
2326
@attribute
2427
def body(self) -> Optional[ArticleBody]:
2528
return extract_article_body_with_selector(
@@ -43,7 +46,10 @@ def authors(self) -> List[str]:
4346

4447
@attribute
4548
def topics(self) -> List[str]:
46-
return generic_topic_parsing(self.precomputed.meta.get("keywords"), delimiter=[",", "・"])
49+
return apply_substitution_pattern_over_list(
50+
generic_topic_parsing(self.precomputed.meta.get("keywords"), delimiter=[",", "・"]),
51+
self._topic_bloat_pattern,
52+
)
4753

4854
@attribute
4955
def images(self) -> List[Image]:

tests/resources/parser/test_data/jp/MainichiShimbun.json

-1
Original file line number | Diff line number | Diff line change
@@ -50,7 +50,6 @@
5050
"title": "イスラエルとハマスの停戦「最も合意に近い」 最終案に双方同意か",
5151
"topics": [
5252
"国際",
53-
"速報",
5453
"中東",
5554
"緊迫する中東情勢",
5655
"松岡大地",

0 commit comments

Comments
 (0)