Skip to content

Commit 7c5d78c

Browse files
authored
Merge pull request #689 from flairNLP/add-pr-times
Add `NikkanGeadai`
2 parents b037610 + fa39f5a commit 7c5d78c

File tree

7 files changed

+241
-0
lines changed

7 files changed

+241
-0
lines changed

docs/supported_publishers.md

+15
Original file line numberDiff line numberDiff line change
@@ -1299,6 +1299,21 @@
12991299
<td>&#160;</td>
13001300
<td>&#160;</td>
13011301
</tr>
1302+
<tr>
1303+
<td>
1304+
<code>NikkanGeadai</code>
1305+
</td>
1306+
<td>
1307+
<div>Nikkan Geadai</div>
1308+
</td>
1309+
<td>
1310+
<a href="https://www.nikkan-gendai.com/">
1311+
<span>www.nikkan-gendai.com</span>
1312+
</a>
1313+
</td>
1314+
<td>&#160;</td>
1315+
<td>&#160;</td>
1316+
</tr>
13021317
<tr>
13031318
<td>
13041319
<code>SankeiShimbun</code>

src/fundus/parser/utility.py

+49
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,55 @@ def get_meta_content(root: lxml.html.HtmlElement) -> Dict[str, str]:
271271
return metadata
272272

273273

274+
def transform_breaks_to_paragraphs(element: lxml.html.HtmlElement, **attribs: str) -> lxml.html.HtmlElement:
    """Split the content of <element> on <br> tags and wrap every resulting chunk in a <p> element.

    The element is modified in place: its text and children are replaced by the generated
    <p> elements. Text that follows the element in its parent (the element's tail) is left
    untouched. If the content yields no non-empty chunk, the element is returned unchanged.

    Args:
        element: The element on which to perform the transformation.
        **attribs: The attributes of the wrapping paragraphs as keyword arguments. If omitted,
            the default {"class": "br-wrap"} is used, which produces <p class='br-wrap'> elements.
            To use Python keywords as attribute names, wrap them in double underscores,
            e.g. __class__ for class.

    Returns:
        The transformed element (the same object that was passed in).
    """
    # function-scope import so the block does not depend on the module's import section
    from html import escape

    if not attribs:
        attribs = {"class": "br-wrap"}
    else:
        # strip the dunder wrapping that allows Python keywords as attribute names
        attribs = {re.sub(r"^__(.*?)__$", r"\1", key): value for key, value in attribs.items()}

    def get_paragraphs() -> List[str]:
        # Serialize without the tail: trailing text after the closing tag is not part of the
        # element's content and would keep the outer-tag regex below from matching.
        raw_html = lxml.etree.tostring(element, method="html", encoding="unicode", with_tail=False)
        if match := re.match(r"^<[^>]*?>\s*(?P<content>.*?)\s*<[^>]*?>\s*$", raw_html, re.S):
            # also split on <br> tags that carry attributes, e.g. <br clear="all">
            chunks = re.split(r"<br\b[^>]*>", match.group("content"))
            return [stripped for chunk in chunks if (stripped := chunk.strip())]
        return []

    def generate_attrs() -> str:
        # escape the values so quotes or angle brackets cannot break the generated markup
        return " ".join(f"{attribute}='{escape(str(value), quote=True)}'" for attribute, value in attribs.items())

    def clear_element() -> None:
        # iterate over a snapshot so that removing children cannot disturb the iteration;
        # lxml's remove() takes the child's tail text along with the child
        for child in list(element):
            element.remove(child)
        element.text = None

    # split content on <br> tags
    if not (paragraphs := get_paragraphs()):
        return element

    # remove children and text from element; the element's own tail belongs to the parent
    clear_element()

    # add paragraphs to cleared element
    attrs = generate_attrs()
    for paragraph in paragraphs:
        element.append(lxml.html.fromstring(f"<p {attrs}>{paragraph}</p>"))

    return element
321+
322+
274323
def strip_nodes_to_text(text_nodes: List[lxml.html.HtmlElement], join_on: str = "\n\n") -> Optional[str]:
275324
if not text_nodes:
276325
return None

src/fundus/publishers/jp/__init__.py

+14
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
from .asahi_shimbun import AsahiShimbunParser
66
from .mainichi_shimbun import MainichiShimbunParser
7+
from .nikkan_geadai import NikkanGeadaiParser
78
from .nikkei import NikkeiParser
89
from .sankei_shimbun import SankeiShimbunParser
910
from .the_japan_news import TheJapanNewsParser
@@ -86,3 +87,16 @@ class JP(metaclass=PublisherGroup):
8687
NewsMap("https://www.sankei.com/feeds/sitemapindex-category/?outputType=xml"),
8788
],
8889
)
90+
91+
# Publisher entry for www.nikkan-gendai.com, parsed with NikkanGeadaiParser.
# NOTE(review): the display name below reads "Geadai" while the domain spells "gendai";
# this looks like a typo. Renaming it presumably also requires regenerating the
# supported-publishers docs table, so confirm before changing.
NikkanGeadai = Publisher(
92+
name="Nikkan Geadai",
93+
domain="https://www.nikkan-gendai.com/",
94+
parser=NikkanGeadaiParser,
95+
sources=[
96+
Sitemap(
97+
"https://www.nikkan-gendai.com/sitemap.xml",
98+
reverse=True,
99+
# presumably restricts crawling to sub-sitemaps whose URL contains "type=articles"
# (inverse of a filter on that pattern) -- TODO confirm against the filter semantics
sitemap_filter=inverse(regex_filter(r"type=articles")),
100+
)
101+
],
102+
)
+77
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
import datetime
2+
import re
3+
from typing import List, Optional
4+
5+
from lxml.cssselect import CSSSelector
6+
from lxml.etree import XPath
7+
8+
from fundus.parser import (
9+
ArticleBody,
10+
BaseParser,
11+
Image,
12+
ParserProxy,
13+
attribute,
14+
function,
15+
)
16+
from fundus.parser.utility import (
17+
extract_article_body_with_selector,
18+
generic_author_parsing,
19+
generic_date_parsing,
20+
generic_topic_parsing,
21+
image_extraction,
22+
transform_breaks_to_paragraphs,
23+
)
24+
25+
26+
class NikkanGeadaiParser(ParserProxy):
    """Parser proxy for articles of the publisher served from www.nikkan-gendai.com."""

    class V1(BaseParser):
        """First parser version.

        The article text is expected in a single <p class='full-text'> node whose content is
        separated by <br> tags; that node is rewritten into <p class='br-wrap'> paragraphs
        which the paragraph selector below then picks up.
        """

        # matches the <p class='br-wrap'> children created by _transform_br_element;
        # paragraphs without direct text are skipped
        _paragraph_selector = XPath(
            "//div[@class='article-wrap'] //p[@class='full-text'] /p[@class='br-wrap' and text()]"
        )

        # the node holding the <br>-separated article text
        _full_text_selector = CSSSelector("div.article-wrap p.full-text")

        _topic_selector = XPath("//main //div[contains(@class, 'm-keyword-list')] /ul /li //text()")

        # presumably priority=0 schedules this ahead of the attributes that rely on the
        # rewritten document -- TODO confirm against the parser's execution order
        @function(priority=0)
        def _transform_br_element(self) -> None:
            """Rewrite the full-text node so every <br>-separated chunk becomes its own paragraph.

            Does nothing when no full-text node is present.

            Raises:
                ValueError: If more than one full-text node is found.
            """
            if nodes := self._full_text_selector(self.precomputed.doc):
                if len(nodes) != 1:
                    raise ValueError(f"Expected exactly one node, got {len(nodes)}")
                transform_breaks_to_paragraphs(nodes[0], __class__="br-wrap")

        @attribute
        def body(self) -> Optional[ArticleBody]:
            """Article body built from the paragraphs matched by the paragraph selector."""
            return extract_article_body_with_selector(
                self.precomputed.doc,
                paragraph_selector=self._paragraph_selector,
            )

        @attribute
        def title(self) -> Optional[str]:
            """Headline taken from the 'headline' entry of the precomputed linked data."""
            return self.precomputed.ld.bf_search("headline")

        @attribute
        def authors(self) -> List[str]:
            """Authors parsed from the 'author' entry of the precomputed linked data."""
            return generic_author_parsing(self.precomputed.ld.bf_search("author"))

        @attribute
        def publishing_date(self) -> Optional[datetime.datetime]:
            """Publishing date parsed from the 'datePublished' entry of the precomputed linked data."""
            return generic_date_parsing(self.precomputed.ld.bf_search("datePublished"))

        @attribute
        def topics(self) -> List[str]:
            """Topics parsed from the keyword list of the page; empty if the list is missing."""
            if topics := self._topic_selector(self.precomputed.doc):
                return generic_topic_parsing(topics)
            return []

        @attribute
        def images(self) -> List[Image]:
            """Images extracted from within the article wrapper, with credits taken from the caption."""
            return image_extraction(
                doc=self.precomputed.doc,
                paragraph_selector=self._paragraph_selector,
                upper_boundary_selector=CSSSelector("div.article-wrap"),
                # https://regex101.com/r/uY6o2z/1
                # NOTE(review): "(C)" here is a capturing group matching a literal "C", not the
                # text "(C)" -- confirm against the linked example that this is intended
                author_selector=re.compile(r"(C)(?P<credits>.*?)\s*$"),
            )
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
{
2+
"V1": {
3+
"authors": [
4+
"日刊現代"
5+
],
6+
"body": {
7+
"summary": [],
8+
"sections": [
9+
{
10+
"headline": [],
11+
"paragraphs": [
12+
"元タレントの中居正広氏(52)の女性トラブルを発端にしたフジテレビ問題が、スポーツ界に飛び火している。",
13+
"今回の一件でフジテレビのガバナンスを問題視するスポンサー企業が続出。",
14+
"75社以上がCM差し止めに踏み切る中、フジテレビと放映権契約を結ぶなど、密接な関係を築いてきたスポーツの各競技団体もテンヤワンヤになっているのだ。",
15+
"サッカーのJリーグは、フジと放映権契約を結ぶ「ルヴァン杯」の開幕が3月20日に控える。フジはCMやリーグの関連番組にも関わっており、1月28日のJリーグ理事会で今後の対応について協議した。",
16+
"日本バレーボール協会(JVA)もフジとの関係は深い。昨年発足したSVリーグの今季開幕戦を地上波で生中継。今季の今後のリーグ戦の中継も、フジのCS局が行う予定。春高バレーは先日終わったものの、JVAは「本件に関して、現在、対応を検討中です。これ以上、お答えできることはございません」と回答したが、世界バレーやネーションズリーグはTBSと放映権契約を結んでおり、「鞍替え」が検討されても不思議ではない。",
17+
"ゴルフ界も対岸の火事ではない。",
18+
"国内ツアー「フジサンケイレディス」(4月)、「フジサンケイクラシック」(9月)はフジが主催。今季3戦目の「アクサレディス」(3月)はフジが後援を務める。",
19+
"日本女子プロゴルフ協会は現在、対応を検討中だが、現場ではフジサンケイレディスの開催を危ぶむ声が少なくないという。",
20+
"フィギュアは、かねて世界フィギュア選手権(3月)などをフジテレビが中継している。",
21+
"日本スケート連盟は「世界フィギュア選手権の(放映権)契約は国際スケート連盟とフジテレビとなっておりますので、本連盟はお答えする立場にはございません」と回答した。",
22+
"フジテレビからの放映権料は、各競技団体の収入源になっている。しかし、番組スポンサーが撤退すれば、放送すらおぼつかなくなっても不思議ではない。"
23+
]
24+
}
25+
]
26+
},
27+
"images": [
28+
{
29+
"versions": [
30+
{
31+
"url": "https://c799eb2b0cad47596bf7b1e050e83426.cdnext.stream.ne.jp/img/article/000/367/061/8fc28b260fd17152bf603b00ed91013420250131105559706_262_262.jpg",
32+
"query_width": null,
33+
"size": {
34+
"width": 262,
35+
"height": 262
36+
},
37+
"type": "image/jpeg"
38+
},
39+
{
40+
"url": "https://c799eb2b0cad47596bf7b1e050e83426.cdnext.stream.ne.jp/img/article/000/367/061/8fc28b260fd17152bf603b00ed91013420250131105559706_262_262.webp",
41+
"query_width": null,
42+
"size": {
43+
"width": 262,
44+
"height": 262
45+
},
46+
"type": "image/webp"
47+
},
48+
{
49+
"url": "https://c799eb2b0cad47596bf7b1e050e83426.cdnext.stream.ne.jp/img/article/000/367/061/8fc28b260fd17152bf603b00ed91013420250131105559706.jpg",
50+
"query_width": null,
51+
"size": {
52+
"width": 600,
53+
"height": 600
54+
},
55+
"type": "image/jpeg"
56+
},
57+
{
58+
"url": "https://c799eb2b0cad47596bf7b1e050e83426.cdnext.stream.ne.jp/img/article/000/367/061/8fc28b260fd17152bf603b00ed91013420250131105559706_600_resize.webp",
59+
"query_width": null,
60+
"size": {
61+
"width": 600,
62+
"height": 600
63+
},
64+
"type": "image/webp"
65+
}
66+
],
67+
"is_cover": true,
68+
"description": null,
69+
"caption": "バレー男子ネーションズリーグ、日本代表の(左から)西田、小野寺、高橋藍、石川",
70+
"authors": [
71+
"共同通信社"
72+
],
73+
"position": 228
74+
}
75+
],
76+
"publishing_date": "2025-01-31 11:30:00+09:00",
77+
"title": "フジテレビ問題でスポーツ界にも大激震!協賛企業&quot;総スカン&quot;で各競技団体のビジネスモデル完全崩壊へ",
78+
"topics": [
79+
"フジテレビ"
80+
]
81+
}
82+
}
Binary file not shown.

tests/resources/parser/test_data/jp/meta.info

+4
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,10 @@
1111
"url": "https://mainichi.jp/articles/20250114/k00/00m/030/335000c",
1212
"crawl_date": "2025-01-14 14:55:19.277555"
1313
},
14+
"NikkanGeadai_2025_01_31.html.gz": {
15+
"url": "https://www.nikkan-gendai.com/articles/view/sports/367061",
16+
"crawl_date": "2025-01-31 13:40:41.093585"
17+
},
1418
"Nikkei_2025_01_27.html.gz": {
1519
"url": "https://www.nikkei.com/article/DGXZQOUB148MY0U5A110C2000000/",
1620
"crawl_date": "2025-01-27 16:41:04.576095"

0 commit comments

Comments
 (0)