Skip to content

Commit 005b328

Browse files
committed
Update documentation and initialization for Italian newspapers
1 parent d8809b1 commit 005b328

File tree

2 files changed

+119
-0
lines changed

2 files changed

+119
-0
lines changed

docs/supported_publishers.md

+30
Original file line numberDiff line numberDiff line change
@@ -1138,6 +1138,36 @@
11381138
</tr>
11391139
</thead>
11401140
<tbody>
1141+
<tr>
1142+
<td>
1143+
<code>CorriereDellaSera</code>
1144+
</td>
1145+
<td>
1146+
<div>Corriere Della Sera</div>
1147+
</td>
1148+
<td>
1149+
<a href="https://www.corriere.it">
1150+
<span>www.corriere.it</span>
1151+
</a>
1152+
</td>
1153+
<td>&#160;</td>
1154+
<td>&#160;</td>
1155+
</tr>
1156+
<tr>
1157+
<td>
1158+
<code>IlGiornale</code>
1159+
</td>
1160+
<td>
1161+
<div>Il Giornale</div>
1162+
</td>
1163+
<td>
1164+
<a href="https://www.ilgiornale.it">
1165+
<span>www.ilgiornale.it</span>
1166+
</a>
1167+
</td>
1168+
<td>&#160;</td>
1169+
<td>&#160;</td>
1170+
</tr>
11411171
<tr>
11421172
<td>
11431173
<code>LaRepubblica</code>

src/fundus/publishers/it/__init__.py

+89
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
from dateutil.rrule import MONTHLY, rrule
44

55
from fundus.publishers.base_objects import Publisher, PublisherGroup
6+
from fundus.publishers.it.corriere_della_sera import CorriereDellaSeraParser
7+
from fundus.publishers.it.il_giornale import IlGiornaleParser
68
from fundus.publishers.it.la_repubblica import LaRepubblicaParser
79
from fundus.scraping.url import RSSFeed, Sitemap
810

@@ -22,3 +24,90 @@ class IT(metaclass=PublisherGroup):
2224
)
2325
],
2426
)
27+
28+
CorriereDellaSera = Publisher(
29+
name="Corriere Della Sera",
30+
domain="https://www.corriere.it",
31+
parser=CorriereDellaSeraParser,
32+
sources=[
33+
# Main RSS feeds
34+
RSSFeed("https://www.corriere.it/feed-hp/homepage.xml"),
35+
RSSFeed("https://www.corriere.it/rss/ultimora.xml"),
36+
RSSFeed("https://www.corriere.it/dynamic-feed/rss/section/Dataroom.xml"),
37+
RSSFeed("https://www.corriere.it/dynamic-feed/rss/section/lettere-al-direttore.xml"),
38+
RSSFeed("https://www.corriere.it/dynamic-feed/rss/section/lo-dico-al-corriere.xml"),
39+
RSSFeed("https://www.corriere.it/dynamic-feed/rss/section/frammenti-di-ferruccio-de-bortoli.xml"),
40+
# Main sitemaps
41+
Sitemap("https://www.corriere.it/rss/sitemap_v2.xml"),
42+
Sitemap("https://www.corriere.it/salute/sitemap-dizionario-corriere-salute.xml"),
43+
# Dynamic sitemaps - Last 100 articles
44+
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/video/Corriere.xml"),
45+
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Economia.xml"),
46+
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Salute.xml"),
47+
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Scienze.xml"),
48+
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Interni.xml"),
49+
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Esteri.xml"),
50+
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Sport.xml"),
51+
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Politica.xml"),
52+
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Salute__Figli__e__Genitori.xml"),
53+
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Salute__Sportello__Cancro.xml"),
54+
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Elezioni.xml"),
55+
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Tecnologia.xml"),
56+
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Offerte__recensioni.xml"),
57+
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Lotterie.xml"),
58+
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Spettacoli.xml"),
59+
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Scuola.xml"),
60+
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Animali.xml"),
61+
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Opinioni.xml"),
62+
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Caffe-gramellini.xml"),
63+
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Ultimo-banco.xml"),
64+
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Letti-da-rifarei.xml"),
65+
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Piccole-dosi.xml"),
66+
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/L-angolo.xml"),
67+
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Padiglione-italia.xml"),
68+
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Facce-nuove.xml"),
69+
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Ritorno-in-solferino.xml"),
70+
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Oriente-occidente.xml"),
71+
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Sette.xml"),
72+
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Moda.xml"),
73+
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/BuoneNotizie.xml"),
74+
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/lettere__al__direttore.xml"),
75+
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/lo__dico__al__corriere.xml"),
76+
Sitemap("https://www.corriere.it/dynamic-sitemap/sitemap-last-100/Frammenti-ferruccio-de-bortoli.xml"),
77+
# Section sitemaps
78+
Sitemap("https://www.corriere.it/rss/sitemap/Motori.xml"),
79+
Sitemap("https://www.corriere.it/rss/sitemap/Cultura.xml"),
80+
Sitemap("https://vivimilano.corriere.it/sitemap_index.xml"),
81+
Sitemap("https://www.corriere.it/cook/sitemap-index.xml"),
82+
Sitemap("https://www.corriere.it/oroscopo/sitemap.xml"),
83+
Sitemap("https://www.corriere.it/elezioni/sitemap/sitemap.xml"),
84+
Sitemap("https://www.corriere.it/sport/risultati-live/sitemap.xml"),
85+
Sitemap("https://www.corriere.it/salute/il-medico-risponde/sitemap.xml"),
86+
Sitemap("https://www.corriere.it/rss/sitemap/lettere-al-direttore.xml"),
87+
Sitemap("https://www.corriere.it/rss/sitemap/lo-dico-al-corriere.xml"),
88+
Sitemap("https://www.corriere.it/rss/sitemap/Cook-Last.xml"),
89+
Sitemap("https://www.corriere.it/economia/chiedi-esperto/sitemap.xml"),
90+
Sitemap("https://www.corriere.it/economia/chiedi-esperto/news/sitemap.xml"),
91+
Sitemap("https://www.corriere.it/studio/sitemap-studio.xml"),
92+
],
93+
)
94+
95+
IlGiornale = Publisher(
96+
name="Il Giornale",
97+
domain="https://www.ilgiornale.it",
98+
parser=IlGiornaleParser,
99+
request_header={
100+
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
101+
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
102+
"Accept-Language": "en-US,en;q=0.9",
103+
"Accept-Encoding": "gzip, deflate, br",
104+
"Connection": "keep-alive",
105+
"Upgrade-Insecure-Requests": "1",
106+
},
107+
sources=[
108+
RSSFeed("https://www.ilgiornale.it/feed.xml"),
109+
RSSFeed("https://www.ilgiornale.it/feed/rss.xml"),
110+
Sitemap("https://www.ilgiornale.it/sitemap/google-news.xml"),
111+
Sitemap("https://www.ilgiornale.it/sitemap/indice.xml"),
112+
],
113+
)

0 commit comments

Comments
 (0)