Skip to content

Commit 6afac24

Browse files
committed
add Business Insider
1 parent 76b8e8b commit 6afac24

File tree

6 files changed

+95
-0
lines changed

6 files changed

+95
-0
lines changed

docs/supported_publishers.md

+15
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,21 @@
7878
<td>&#160;</td>
7979
<td>&#160;</td>
8080
</tr>
81+
<tr>
82+
<td>
83+
<code>BusinessInsider</code>
84+
</td>
85+
<td>
86+
<div>Business Insider</div>
87+
</td>
88+
<td>
89+
<a href="https://www.businessinsider.de/">
90+
<span>www.businessinsider.de</span>
91+
</a>
92+
</td>
93+
<td>&#160;</td>
94+
<td>&#160;</td>
95+
</tr>
8196
<tr>
8297
<td>
8398
<code>DW</code>

src/fundus/publishers/de/__init__.py

+11
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
from .berliner_zeitung import BerlinerZeitungParser
88
from .bild import BildParser
9+
from .business_insider import BusinessInsiderParser
910
from .die_welt import DieWeltParser
1011
from .die_zeit import DieZeitParser
1112
from .dw import DWParser
@@ -190,3 +191,13 @@ class DE(PublisherEnum):
190191
sources=[NewsMap("https://www.waz.de/sitemaps/news.xml")],
191192
parser=WAZParser,
192193
)
194+
195+
BusinessInsider = PublisherSpec(
196+
name="Business Insider",
197+
domain="https://www.businessinsider.de/",
198+
sources=[
199+
NewsMap("https://www.businessinsider.de/news-sitemap.xml"),
200+
Sitemap("https://www.businessinsider.de/sitemap_index.xml"),
201+
],
202+
parser=BusinessInsiderParser,
203+
)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
import datetime
2+
from typing import List, Optional
3+
4+
from lxml.cssselect import CSSSelector
5+
from lxml.etree import XPath
6+
7+
from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute
8+
from fundus.parser.utility import (
9+
extract_article_body_with_selector,
10+
generic_author_parsing,
11+
generic_date_parsing,
12+
generic_topic_parsing,
13+
)
14+
15+
16+
class BusinessInsiderParser(ParserProxy):
    # Parser proxy for Business Insider articles; V1 is the only parser
    # version defined in this block.
    class V1(BaseParser):
        # Bullet-point summary paragraphs shown inside the article.
        _summary_selector = CSSSelector("article div.bi-bulletpoints > p")
        # Every <h2> inside the article is treated as a subheadline.
        _subheadline_selector = CSSSelector("article h2")

        # Body paragraphs are the direct <p> children of the article-body div.
        # Paragraphs that contain the grey inline <mark> are excluded: that
        # mark carries the prepended notice about machine translation.
        _paragraph_selector = XPath(
            "//article //div[contains(@class, 'article-body')] "
            "/p[not(mark[@class='has-inline-color has-cyan-bluish-gray-color'])]"
        )

        @attribute
        def body(self) -> ArticleBody:
            # Build the structured body from the parsed document using the
            # three class-level selectors.
            document = self.precomputed.doc
            return extract_article_body_with_selector(
                document,
                paragraph_selector=self._paragraph_selector,
                summary_selector=self._summary_selector,
                subheadline_selector=self._subheadline_selector,
            )

        @attribute
        def authors(self) -> List[str]:
            # The raw "author" entry of the precomputed ld data is normalised
            # by the shared author helper.
            raw_authors = self.precomputed.ld.bf_search("author")
            return generic_author_parsing(raw_authors)

        @attribute
        def publishing_date(self) -> Optional[datetime.datetime]:
            # "datePublished" from the precomputed ld data, converted by the
            # shared date helper.
            raw_date = self.precomputed.ld.bf_search("datePublished")
            return generic_date_parsing(raw_date)

        @attribute
        def title(self) -> Optional[str]:
            # The headline is returned exactly as found in the ld data.
            headline = self.precomputed.ld.bf_search("headline")
            return headline

        @attribute
        def topics(self) -> List[str]:
            # Topics come from the "keywords" meta entry; .get() passes None
            # to the helper when the entry is absent.
            keywords = self.precomputed.meta.get("keywords")
            return generic_topic_parsing(keywords)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
{
2+
"V1": {
3+
"authors": [
4+
"Matthew Loh"
5+
],
6+
"publishing_date": "2024-01-29 19:00:45+00:00",
7+
"title": "Wie groß ist Evergrande, Chinas strauchelnder Immobilienriese?",
8+
"topics": [
9+
"China",
10+
"Immobilien",
11+
"Pleite",
12+
"Schulden"
13+
]
14+
}
15+
}
Binary file not shown.

tests/resources/parser/test_data/de/meta.info

+4
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@
77
"url": "https://www.bild.de/sport/fussball/fortuna-duesseldorf/bubi-bomber-wieder-da-thioune-fordert-geduld-mit-niemiec-83936220.bild.html",
88
"crawl_date": "2023-05-15 13:55:04.823203"
99
},
10+
"BusinessInsider_2024_01_29.html.gz": {
11+
"url": "https://www.businessinsider.de/wirtschaft/international-business/wie-gross-ist-evergrande-chinas-sterbender-immobilienriese/",
12+
"crawl_date": "2024-01-29 22:53:02.986279"
13+
},
1014
"DW_2023_04_28.html.gz": {
1115
"url": "https://www.dw.com/de/ukrainische-gegenoffensive-ziele-chancen-risiken/a-65464327?maca=de-rss-de-all-1119-xml-mrss",
1216
"crawl_date": "2023-04-28 20:25:18.143350"

0 commit comments

Comments
 (0)