scrape.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
import re
from time import sleep
import urllib
import urllib2

from bs4 import BeautifulSoup
class Scrape:
    """
    Scraper for HiRISE index pages, detail pages, and wallpaper images.
    """

    def __init__(self, **kwargs):
        self.base_url = kwargs['base_url']
        self.local_img_dir = kwargs['local_img_dir']
        self.base_url_wallpapers = kwargs['base_url_wallpapers']
    def grab_all_page_urls(self, page_min, page_max):
        """
        Grab the urls of all detail pages found on a range of index pages,
        counting backward from page_max to page_min.
        Returns (all_links, urls_by_page), where urls_by_page maps each
        detail url to the index page number it was found on.
        """
        print self.base_url
        all_links = []
        urls_by_page = {}
        # construct each index page url from self.base_url and the page number
        for i in range(page_max, page_min - 1, -1):
            url = self.base_url + '?page=%s' % str(i)
            try:
                index_page = urllib2.urlopen(url).read()
            except urllib2.HTTPError:
                print 'urlopen fail ' + url
                continue
            index_soup = BeautifulSoup(index_page)
            all_cells = index_soup.findAll('td')  # each listing is in a table cell
            for cell in all_cells:
                try:
                    detail_url = '/'.join(self.base_url.split('/')[:-2]) + '/' + cell.a.get('href').split('/')[1:][0]
                except (AttributeError, IndexError):
                    continue  # malformed cell, e.g. no anchor tag
                if detail_url == 'http://hirise.lpl.arizona.edu/releases':
                    continue  # we don't want this one ever
                all_links.append(detail_url)
                urls_by_page[detail_url] = i
        return all_links, urls_by_page
    def grab_content_from_detail_page(self, detail_url):
        """
        Return (title, content, detail_url) scraped from a detail page,
        or False if the page, its title, or its caption cannot be found.
        """
        try:
            index_page = urllib2.urlopen(detail_url).read()
        except urllib2.HTTPError:
            print 'FAIL could not urlopen ' + detail_url
            return False
        soup = BeautifulSoup(index_page)
        try:
            title = soup.findAll('a', {'id': 'example1'})[0].get('title')
        except IndexError:
            print 'could not find title'
            return False  # no title, no post; move along
        # scrape the caption text and clean it up a bit
        try:
            content = re.findall(r'<div class="caption-text">\s*(.*?)\s*<div class="social">',
                                 ' '.join(str(soup).splitlines()))[0]
        except IndexError:
            print 'could not find caption text on ' + detail_url
            return False
        content = BeautifulSoup(content).prettify()
        content = self.prepare_content(content, detail_url)
        # prepend a marker recording where this post came from
        linkback = '<div class = "linkback" data-linkback="%s"></div>' % detail_url
        content = linkback + content
        return (title, content, detail_url)
    def prepare_content(self, content, detail_url):
        """
        A place to put hacks on the text content, like rewriting their
        internal links to display as external links on our site.
        """
        if not content:
            print 'empty content passed to scrape.prepare_content'
        content = content.replace('\n', ' ')
        # rewrite some inline relative links as absolute links that open in a new tab
        for prefix in ('images/', 'E', 'T', 'r', 'j', 'p', 'd', 'e', 'P'):
            content = content.replace('href="%s' % prefix,
                                      'target = "_blank" href="http://hirise.lpl.arizona.edu/%s' % prefix)
        # add credit
        html_more_info = '<p>More info and image formats at <a target = "_blank" href = "%s">%s</a></p>'
        content += html_more_info % (detail_url, detail_url)
        content += '<p>Image: NASA/JPL/University of Arizona </p>'
        return content
    def fetch_remote_file(self, url, repeat):
        """
        Download url into self.local_img_dir, retrying once after a short
        sleep if repeat is True. Returns the local path, or False on failure.
        """
        local_file = self.local_img_dir + url.split('/')[-1]
        # check that the remote file exists before downloading it
        try:
            urllib2.urlopen(url)
        except urllib2.HTTPError:
            return False  # url does not exist
        try:
            urllib.urlretrieve(url, local_file)
            with open(local_file):  # confirm the file landed on disk
                print "got image " + url + " at " + local_file
            return local_file
        except IOError:
            if repeat:  # try again once
                sleep(3)
                return self.fetch_remote_file(url, False)
            else:
                print "can't fetch remote file " + url
                return False
    def fetch_featured_image(self, detail_url):
        """
        Try to download a wallpaper image for the given detail page.
        Returns (local_file, url) for the first size that exists, or False.
        """
        img_id = detail_url.split('/')[-1]
        if not img_id:
            print "could not find img_id from " + detail_url
            return False
        # browsing base_url_wallpapers shows all the sizes they have available;
        # we go looking for these sizes, in order of our preference
        size_list = ['1280', '1440', '1600', '1920', '2048', '2560', '2880',
                     '1152', '1024', '800']
        for sz in size_list:
            url = '%s%s/%s.jpg' % (self.base_url_wallpapers, sz, img_id)
            local_file = self.fetch_remote_file(url, True)
            if local_file:
                return local_file, url
        return False
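

# A minimal usage sketch, not part of the original script. The url and
# directory values below are assumptions for illustration: base_url must
# carry two path segments after the host, since grab_all_page_urls strips
# the last two segments when building detail-page urls, and
# base_url_wallpapers must be a prefix that '<size>/<img_id>.jpg' can be
# appended to.
if __name__ == '__main__':
    scraper = Scrape(base_url='http://hirise.lpl.arizona.edu/releases/all',
                     local_img_dir='/tmp/hirise/',
                     base_url_wallpapers='http://hirise.lpl.arizona.edu/images/wallpaper/')
    # walk index pages 3 down to 1, then scrape each detail page found
    links, urls_by_page = scraper.grab_all_page_urls(1, 3)
    for link in links:
        result = scraper.grab_content_from_detail_page(link)
        if not result:
            continue  # page missing, or it had no title or caption
        title, content, detail_url = result
        featured = scraper.fetch_featured_image(detail_url)
        print title, featured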