Skip to content

Commit 0853045

Browse files
authored
Merge pull request #682 from flairNLP/add-asahi-shimbun
Add `AsahiShimbun`
2 parents dd094f1 + c408544 commit 0853045

File tree

6 files changed

+139
-0
lines changed

6 files changed

+139
-0
lines changed

docs/supported_publishers.md

+15
Original file line number | Diff line number | Diff line change
@@ -1254,6 +1254,21 @@
12541254
</tr>
12551255
</thead>
12561256
<tbody>
1257+
<tr>
1258+
<td>
1259+
<code>AsahiShimbun</code>
1260+
</td>
1261+
<td>
1262+
<div>Asahi Shimbun</div>
1263+
</td>
1264+
<td>
1265+
<a href="https://www.asahi.com/">
1266+
<span>www.asahi.com</span>
1267+
</a>
1268+
</td>
1269+
<td>&#160;</td>
1270+
<td>&#160;</td>
1271+
</tr>
12571272
<tr>
12581273
<td>
12591274
<code>TheJapanNews</code>

src/fundus/publishers/jp/__init__.py

+8
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
from fundus.publishers.base_objects import Publisher, PublisherGroup
2+
from fundus.publishers.jp.asahi_shimbun import AsahiShimbunParser
23
from fundus.publishers.jp.the_japan_news import TheJapanNewsParser
34
from fundus.publishers.jp.yomiuri_shimbun import YomiuriShimbunParser
45
from fundus.scraping.filter import regex_filter
@@ -28,3 +29,10 @@ class JP(metaclass=PublisherGroup):
2829
NewsMap("https://www.yomiuri.co.jp/sitemap-news-latest.xml"),
2930
],
3031
)
32+
33+
# Publisher entry for Asahi Shimbun: articles are parsed with
# AsahiShimbunParser and discovered through the single NewsMap source below.
AsahiShimbun = Publisher(
    name="Asahi Shimbun",
    domain="https://www.asahi.com/",
    sources=[
        NewsMap("https://www.asahi.com/sitemap.xml"),
    ],
    parser=AsahiShimbunParser,
)
+60
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,60 @@
1+
import datetime
2+
import re
3+
from typing import List, Optional
4+
5+
from lxml.cssselect import CSSSelector
6+
7+
from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute
8+
from fundus.parser.utility import (
9+
apply_substitution_pattern_over_list,
10+
extract_article_body_with_selector,
11+
generic_author_parsing,
12+
generic_date_parsing,
13+
generic_topic_parsing,
14+
image_extraction,
15+
)
16+
17+
18+
class AsahiShimbunParser(ParserProxy):
    """Parser proxy for Asahi Shimbun articles."""

    class V1(BaseParser):
        """Parser version V1, driven by CSS selectors and precomputed page metadata."""

        # All content selectors hang off the ``div.nfyQp`` container.
        # NOTE(review): these class names look auto-generated; presumably they
        # can change when the site's front end is rebuilt -- confirm.
        _paragraph_selector = CSSSelector("div.nfyQp > p")
        _subtitle_selector = CSSSelector("div.nfyQp > h2")
        _summary_selector = CSSSelector("div.nfyQp > div.bv2Sj > p")

        # Matches the publisher's own name and generic section labels; it is
        # handed to apply_substitution_pattern_over_list in ``topics``.
        topic_bloat_pattern = re.compile(r"朝日新聞デジタル|朝日新聞|ニュース|新聞|その他・話題")

        @attribute
        def body(self) -> Optional[ArticleBody]:
            """Extract the article body using the summary, subheadline and paragraph selectors."""
            document = self.precomputed.doc
            article_body = extract_article_body_with_selector(
                document,
                summary_selector=self._summary_selector,
                subheadline_selector=self._subtitle_selector,
                paragraph_selector=self._paragraph_selector,
            )
            return article_body

        @attribute
        def title(self) -> Optional[str]:
            """Return the value stored under the ``TITLE`` meta key, if any."""
            meta = self.precomputed.meta
            return meta.get("TITLE")

        @attribute
        def publishing_date(self) -> Optional[datetime.datetime]:
            """Parse the ``article:published_time`` meta value into a datetime."""
            published = self.precomputed.meta.get("article:published_time")
            return generic_date_parsing(published)

        @attribute
        def authors(self) -> List[str]:
            """Parse authors from the ``author`` entry found in ``precomputed.ld``."""
            author_entries = self.precomputed.ld.bf_search("author")
            return generic_author_parsing(author_entries)

        @attribute
        def topics(self) -> List[str]:
            """Parse the ``keywords`` meta value and apply ``topic_bloat_pattern`` to the result."""
            raw_topics = generic_topic_parsing(self.precomputed.meta.get("keywords"))
            return apply_substitution_pattern_over_list(raw_topics, self.topic_bloat_pattern)

        @attribute
        def images(self) -> List[Image]:
            """Extract the article's images via ``image_extraction``."""
            # The ``credits`` group captures the text between the last "、"
            # and a following "撮影".
            credit_pattern = re.compile(r"、(?P<credits>[^、]*?)撮影")
            return image_extraction(
                doc=self.precomputed.doc,
                paragraph_selector=self._paragraph_selector,
                author_selector=credit_pattern,
                relative_urls=True,
            )
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,52 @@
1+
{
2+
"V1": {
3+
"authors": [
4+
"朝日新聞デジタル"
5+
],
6+
"body": {
7+
"summary": [],
8+
"sections": [
9+
{
10+
"headline": [],
11+
"paragraphs": [
12+
"気象庁は13日、午後9時19分ごろ、日向灘(北緯31.8度、東経131.6度)で震度5弱の地震があったと発表した。震源の深さは約30キロ、地震の規模(マグニチュード)は6.9と推定される。この地震で、気象庁は高知県と宮崎県に1メートルの津波注意報を出した。",
13+
"各地の震度は次のとおり。",
14+
"<震度5弱>",
15+
"宮崎県:高鍋町、新富町、宮崎市",
16+
"<震度4>",
17+
"宮崎県:延岡市、西都市、木城町、川南町、都農町、門川町、日南市*、串間市、国富町、綾町、美郷町、高千穂町、都城市、小林市、えびの市、三股町、高原町",
18+
"福岡県:久留米市",
19+
"佐賀県:神埼市、白石町",
20+
"熊本県:阿蘇市、産山村、高森町、南阿蘇村、熊本市南区、熊本市北区、八代市、菊池市、宇土市、宇城市、合志市、美里町、西原村、氷川町、人吉市、多良木町、あさぎり町、芦北町",
21+
"大分県:大分市、臼杵市、佐伯市、竹田市",
22+
"鹿児島県:鹿児島市、霧島市、いちき串木野市、南さつま市、伊佐市、姶良市、鹿屋市、垂水市、曽於市、大崎町、東串良町、肝付町"
23+
]
24+
}
25+
]
26+
},
27+
"images": [
28+
{
29+
"versions": [
30+
{
31+
"url": "https://www.asahicom.jp/imgopt/img/4ff96428f2/comm_L/AS20250113003419.jpg",
32+
"query_width": null,
33+
"size": null,
34+
"type": "image/jpeg"
35+
}
36+
],
37+
"is_cover": true,
38+
"description": "写真・図版",
39+
"caption": null,
40+
"authors": [],
41+
"position": 737
42+
}
43+
],
44+
"publishing_date": "2025-01-13 21:37:00+09:00",
45+
"title": "宮崎県で震度5弱、高知と宮崎に1メートルの津波注意報 気象庁",
46+
"topics": [
47+
"社会",
48+
"災害・気象",
49+
"宮崎県"
50+
]
51+
}
52+
}
Binary file not shown.

tests/resources/parser/test_data/jp/meta.info

+4
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,8 @@
11
{
2+
"AsahiShimbun_2025_01_13.html.gz": {
3+
"url": "https://www.asahi.com/articles/AST1F4445T1FUTIL02SM.html",
4+
"crawl_date": "2025-01-13 14:12:17.527262"
5+
},
26
"TheJapanNews_2024_10_13.html.gz": {
37
"url": "https://japannews.yomiuri.co.jp/politics/politics-government/20241013-216478/",
48
"crawl_date": "2024-10-13 16:27:01.520980"

0 commit comments

Comments
 (0)