Skip to content
This repository was archived by the owner on Jul 1, 2022. It is now read-only.

Commit

Permalink
Add first version of Archivio La Stampa downloader
Browse files Browse the repository at this point in the history
The archive is under cc-by-nc-nd and about to be shut down.
  • Loading branch information
nemobis committed Dec 4, 2020
1 parent 4fc8732 commit da6b2d1
Showing 1 changed file with 131 additions and 0 deletions.
131 changes: 131 additions & 0 deletions ia-archivio-lastampa.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
""" Bot to download archiviolastampa.it """
#
# (C) Federico Leva, 2020
#
# Distributed under the terms of the MIT license.
#
__version__ = '0.1.0'

import datetime
import json
import os
import re
import requests
from time import sleep

def getDayId(day, headboard='01'):
"""
Get the magic ID of the day from the index of the next day.
Takes day in datetime format and issue type (01 or 02) as string.
"""

next_day = (day + datetime.timedelta(days=1)).strftime('%Y-%m-%d')
url = "http://www.archiviolastampa.it/index2.php?option=com_lastampa&task=issue&no_html=1&type=neighbors&headboard={}&date={}%2000:00:00".format(headboard, next_day)
index = requests.get(url)

# Expected output is something like:
# {"previousIssueId":"1319_02_1989_0242_0001","nextIssueId":"1319_02_1989_0244_0001"}
if index.status_code < 300:
return index.json()['previousIssueId']

def getIssueMetadata(identifier):
""" Issue metadata from the identifier """

url = "http://www.archiviolastampa.it/index2.php?option=com_lastampa&task=issue&no_html=1&type=info&issueid={}".format(identifier)
info = requests.get(url)

# Expected output is something like:
# {"id_testata":"02","uscita":"243","data_uscita":"1989-09-13 00:00:00","nome_testata":"Europa"}
if info.status_code < 300:
return info.json()

def makeDay(day, metadata):
""" Prepare the download of this day, if appropriate """

day_ymd = day.strftime('%Y-%m-%d')
if day_ymd in metadata['data_uscita']:
try:
os.mkdir(day_ymd)
except FileExistsError:
print("INFO: Day {} was already done".format(day_ymd))
return False

with open('{}/issue_metadata.json'.format(day_ymd), 'w') as jsonout:
jsonout.write(json.dumps(metadata))
return True
else:
# We got a different day, probably there's a gap for festivities.
return False

def downloadDay(day):
""" Retrieve data for issue, prepare files and download images """

day_ymd = day.strftime('%Y-%m-%d')
identifier = getDayId(day)
print("Found {} for {}".format(identifier, day.strftime('%Y-%m-%d')))
metadata = getIssueMetadata(identifier)
sleep(0.1)
if not metadata:
print("WARNING: could not download metadata for {}".format(day_ymd))
return False

if not makeDay(day, metadata):
# We got a different day, probably there's a gap for festivities.
return False

# Prepare a session for this issue
s = requests.Session()
# We need the parameter from the hidden input
# <input type="hidden" name="t" value="a2016dedff5843c652d2fdf4f87055cc" />
home = s.get('http://www.archiviolastampa.it/')
t = re.findall('<input type="hidden" name="t" value="([a-z0-9]+)"', home.text)[0]
sleep(0.1)

# List pages in the issue
pages = s.get('http://www.archiviolastampa.it/load.php?url=/item/getPagesInfo.do?id={}&s={}'.format(identifier, t))
with open('{}/{}_pages.json'.format(day_ymd, identifier), 'w') as pages_out:
pages_out.write(pages.text)
sleep(0.1)

for page in pages.json()['pageList']:
page_id = page['thumbnailId']
page_image = s.get('http://www.archiviolastampa.it/load.php?url=/downloadContent.do?id={}_19344595&s={}'.format(page_id, t))
sleep(0.2)
if not 'image/jpeg' in page_image.headers['Content-Type']:
print("WARNING: could not download image for {}".format(page_id))
sleep(5)
continue
with open('{}/{}.jpg'.format(day_ymd, page_id), 'wb') as page_out:
page_out.write(page_image.content)

page_data = s.get('http://www.archiviolastampa.it/load.php?url=/search/select/?wt=json&q=pageID:{}&s={}&s={}'.format(page_id, t, t))
with open('{}/{}_pagedata.json'.format(day_ymd, page_id), 'w') as page_meta:
page_meta.write(page_data.text)
sleep(0.1)

def listDates(start, end):
""" Return list of days between two dates """

first_day = datetime.datetime.strptime(start, '%Y-%m-%d')
last_day = datetime.datetime.strptime(end, '%Y-%m-%d')
return [first_day + datetime.timedelta(days=x) for x in range(0, (last_day-first_day).days)]

def main():
retry = open('retry.log', 'a')
for day in listDates('1868-01-01', '2005-12-31'):
try:
download = downloadDay(day)
sleep(2)
except Exception as e:
print(e)
download = False
if download is False:
print("ERROR: Something went wrong with {}, please retry. Sleeping now.".format(day))
retry.write("{}\n".format(day))
sleep(30)

retry.close()
if __name__ == "__main__":
main()

0 comments on commit da6b2d1

Please sign in to comment.