deredactie_spider.py
import scrapy
from scrapy.contrib.spiders import CrawlSpider
from scrapy.exceptions import CloseSpider
from epu_scrapy.items import Article
from datetime import datetime, timedelta
from time import strptime
import re
import json
import os

def set_start_urls(settings):
    """
    Based on the dates given in the settings file, construct the start urls for the spider.
    """
    term = settings['term']
    if type(settings['period']) is not dict:
        today = datetime.today()
        if settings['period'] != 'yesterday':
            raise CloseSpider('unknown period setting. See the scrapers README for more information.')
        search_day = today - timedelta(days=1)  # search for articles of yesterday
        search_day_str = '{0}/{1}/{2}'.format(search_day.day, search_day.month, search_day.year % 100)
        start_urls = ['http://deredactie.be/cm/vrtnieuws/1.516538?text={0}&type=text&range=atdate&isdate={1}&sort=date&action=submit&advancedsearch=on'.format(term, search_day_str)]
    else:
        # awkward syntax to convert a struct_time to a datetime
        # (see: http://stackoverflow.com/questions/1697815/how-do-you-convert-a-python-time-struct-time-object-into-a-datetime-object)
        start = datetime(*strptime(settings['period']['start'], '%Y-%m-%d')[:6])
        start_str = '{0}/{1}/{2}'.format(start.day, start.month, start.year % 100)
        end = datetime(*strptime(settings['period']['end'], '%Y-%m-%d')[:6])
        end_str = '{0}/{1}/{2}'.format(end.day, end.month, end.year % 100)
        start_urls = ['http://deredactie.be/cm/vrtnieuws/1.516538?text={0}&type=text&range=betweendate&startdate={1}&enddate={2}&sort=date&action=submit&advancedsearch=on'.format(term, start_str, end_str)]
    return start_urls
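
# For reference, a minimal sketch of what crawling_settings.json is assumed to look like,
# inferred from how set_start_urls() reads it (the exact file is not part of this snippet,
# and the search term shown is purely illustrative):
#
#     {
#         "term": "economie",
#         "period": "yesterday"
#     }
#
# or, for an explicit date range:
#
#     {
#         "term": "economie",
#         "period": {"start": "2015-01-01", "end": "2015-01-31"}
#     }
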
class DeredactieSpider(CrawlSpider):
    name = 'deredactie'  # name of the spider, to be used when running from command line
    allowed_domains = ['deredactie.be']
    settings = json.load(open(os.path.join(os.path.dirname(os.path.dirname(__file__)), 'crawling_settings.json')))
    start_urls = set_start_urls(settings)

    def parse(self, response):
        """
        Parse the first search page to determine the number of articles returned. Use the url's offset
        parameter to iterate over all result pages and yield scrapy.Request objects that will be parsed
        with the parse_list_page method.
        """
        nr_of_articles_element = response.xpath('//li[contains(concat(" ", normalize-space(@class), " "), " searchcounter ")]')
        if len(nr_of_articles_element) == 2:
            # the number of articles is mentioned both above and below the list of articles,
            # so the xpath selector should match exactly two elements
            nr_of_articles_text = ''.join(nr_of_articles_element[0].xpath('descendant-or-self::*/text()').extract())
            # Explaining the regular expression below:
            #     (?P<offset>\d+)         => matches a number (\d+) and assigns it to group "offset"
            #     (?P<pagesize>\d+)       => matches a number (\d+) and assigns it to group "pagesize"
            #     \s+van\s+               => matches the word "van" surrounded by whitespace (spaces, tabs etc.)
            #     (?P<nr_of_articles>\d+) => matches a number (\d+) and assigns it to group "nr_of_articles"
            m = re.search(r'(?P<offset>\d+)-(?P<pagesize>\d+)\s+van\s+(?P<nr_of_articles>\d+)', nr_of_articles_text)
            if m:
                pagesize = int(m.group('pagesize')) - int(m.group('offset')) + 1
                nr_of_articles = int(m.group('nr_of_articles'))
                for i in range(0, nr_of_articles, pagesize):
                    # note that the offset parameter starts at 0
                    yield scrapy.Request(self.start_urls[0] + '&offset={0}'.format(i), callback=self.parse_list_page)
            else:
                raise CloseSpider('Could not parse number of articles from {0}'.format(response.url))
        else:
            raise CloseSpider('Element containing the number of articles was not found at {0}'.format(response.url))
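
    # Worked example (illustrative numbers, not taken from a real response): if the searchcounter
    # element reads "1-10 van 243", the regular expression above yields offset=1, pagesize=10 and
    # nr_of_articles=243, so the computed page size becomes 10 - 1 + 1 = 10 and parse() requests
    # the result pages with &offset=0, &offset=10, ..., &offset=240.
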
    def parse_published_datetime(self, datetime_element_parts):
        """
        Helper method to parse a datetime from an HTML element and return it as an ISO 8601 string.
        """
        datetime_str_parts = [x.encode('utf-8') for x in datetime_element_parts]
        datetime_str = ' '.join(datetime_str_parts).strip()
        # keep only the "dd/mm/yyyy - hh:mm" part of the string
        datetime_str_stripped = re.findall('[0-9]+/[0-9]+/[0-9]+[^0-9]+[0-9]+:[0-9]+', datetime_str)[0]
        dt = datetime(*strptime(datetime_str_stripped, '%d/%m/%Y - %H:%M')[0:6])
        return dt.isoformat()
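
    # For example (hypothetical input, shown only to illustrate the expected format): a pubdate
    # element containing "woensdag 14/01/2015 - 14:23" would be parsed to the ISO string
    # "2015-01-14T14:23:00".
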
    def parse_list_page(self, response):
        """
        Parse a single page returned by the search query. Find all links referring to articles and yield
        scrapy.Request objects for every link found. The parsing of these links is done by the
        parse_article method.
        """
        self.log(response.url)
        links = response.xpath('//div[contains(concat(" ", normalize-space(@class), " "), " searchresults ")]/descendant::a/@href').extract()
        link_set = set([x.encode('utf-8') for x in links])
        for l in link_set:
            if l != '#':
                # An article link can point to a single article page, or to a storyline page, which includes
                # several articles. In both cases, the id of the actual article that is pointed to can be
                # found in the url. In the case of a storyline, the url looks like
                # /cm/vrtnieuws/buitenland/<storylineid>?eid=<articleid>, while for a single article page the
                # url is /cm/vrtnieuws/binnenland/<articleid>. Both a storylineid and an articleid look
                # something like 1.193019, which is matched by the regular expression pattern [0-9.]+
                article_id = re.findall('[0-9.]+', l)[-1]  # the last match in the url is the article id
                l = 'http://deredactie.be/cm/' + article_id
                yield scrapy.Request(l, callback=self.parse_article)
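
    # For example (the ids below are made up for illustration): a storyline link such as
    # /cm/vrtnieuws/buitenland/1.193019?eid=1.193020 is rewritten to
    # http://deredactie.be/cm/1.193020, because 1.193020 is the last id-like substring in the
    # url and therefore identifies the actual article.
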
    def parse_article(self, response):
        """
        Parse an article content page and return an Article item.
        """
        # search for the article title
        title_parts = response.xpath('//div[@id="articlehead"]/h1/text()').extract()
        if len(title_parts) > 0:
            title = ' '.join(set(title_parts)).encode('utf-8').strip()
        else:
            title = ''
        # search for the article published date
        datetime_element_parts = response.xpath('//small[@id="pubdate"]/strong/text()').extract()
        if len(datetime_element_parts) > 0:
            datetime_iso_str = self.parse_published_datetime(datetime_element_parts)
        else:
            datetime_iso_str = ''
        # search for the article intro text
        article_intro_parts = response.xpath('//div[@id="intro"]/strong/text()').extract()
        article_intro = ' '.join([x.strip() for x in article_intro_parts]).strip()
        # search for the article full text
        article_full_text_fragments = response.xpath('//div[@id="articlebody"]/descendant::p/descendant-or-self::*/text()').extract()
        article_full_text = ' '.join([x.strip() for x in article_full_text_fragments]).strip()
        # reconstruct the url of the nicely rendered article page, i.e. http://deredactie.be/cm/vrtnieuws/<articleid>
        url_parts = response.url.split('/')
        article_id = url_parts.pop()
        url_parts.append('vrtnieuws')
        url_parts.append(article_id)
        url = '/'.join(url_parts)
        # create an Article item and return it; all Articles created during scraping can be written
        # to an output file when the -o option is given on the command line
        article = Article()
        article['url'] = url
        article['intro'] = article_intro
        article['title'] = title
        article['published_at'] = datetime_iso_str
        article['text'] = article_full_text
        return article
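
# Usage sketch (an assumption, not documented in this snippet: it presumes a standard Scrapy
# project layout for epu_scrapy, with crawling_settings.json sitting next to the spiders/
# directory as the settings path above implies). From the project root, running
#
#     scrapy crawl deredactie -o articles.json
#
# crawls the configured search term and period and writes the yielded Article items to
# articles.json via the -o option mentioned above.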