-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbookstoscrape.py
34 lines (28 loc) · 1.39 KB
/
bookstoscrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# -*- coding: utf-8 -*-
import scrapy
class BooksToScrapeSpider(scrapy.Spider):
    """Spider that collects book listings from the category pages.

    Starting from ``start_urls``, it follows every link found in the sidebar
    category list and yields one item per book on each category page,
    following the "next" pagination link whenever one is present.
    """

    name = 'books-toscraps'
    allowed_domains = ['toscrape.com']
    start_urls = [
        'http://books.toscrape.com',
    ]
    # Settings applied to this spider only.
    custom_settings = {
        'AUTOTHROTTLE_ENABLED': True,
        'HTTPCACHE_ENABLED': True,
    }

    @staticmethod
    def _absolute_url(response, url):
        """Return ``url`` resolved against ``response``, or None if it is None."""
        if url is None:
            return None
        return response.urljoin(url)

    def parse(self, response):
        """Schedule one request per category linked from the sidebar.

        Args:
            response: Response for one of the ``start_urls``.

        Yields:
            Requests for each category page, handled by
            ``parse_category_books``.
        """
        for category in response.css('div.side_categories > ul > li > ul > li'):
            category_href = category.css('a::attr(href)').extract_first()
            if category_href is None:
                # This entry has no link, so there is nothing to follow.
                continue
            yield response.follow(category_href, self.parse_category_books)

    def parse_category_books(self, response):
        """Extract every book on a category page and follow its pagination.

        Args:
            response: Response for a category listing page.

        Yields:
            One dict per book with the keys ``'book title'``,
            ``'book price'``, ``'book image URL'`` and
            ``'book details page URL'``; a value is None when the matching
            element or attribute is not found. After the items, a request
            for each "next" link, handled by this same callback.
        """
        for book in response.css('article.product_pod'):
            image_src = book.css(
                'div.image_container > a > img::attr(src)'
            ).extract_first()
            details_href = book.css(
                'div.image_container > a::attr(href)'
            ).extract_first()
            yield {
                # The title attribute is used instead of the link text
                # because the text of some books is cut off with an ellipsis.
                'book title': book.css('h3 > a::attr(title)').extract_first(),
                'book price': book.css('p.price_color::text').extract_first(),
                # Page links are made absolute; a missing attribute stays
                # None instead of being handed to urljoin.
                'book image URL': self._absolute_url(response, image_src),
                'book details page URL': self._absolute_url(response, details_href),
            }

        # The matched <a> selector is passed to follow() as-is.
        for next_link in response.xpath('//li/a[text()="next"]'):
            yield response.follow(next_link, self.parse_category_books)