web_crawler.py
#!/usr/bin/env python
""" WebCrawler: fetch images from various search engines and download them"""
from __future__ import print_function
from math import ceil
import sys
import os
import json
import images_downloader
__author__ = "Amine BENDAHMANE (@AmineHorseman)"
__email__ = "[email protected]"
__license__ = "GPL"
__date__ = "May 2nd, 2016"
class WebCrawler(object):
""" Fetch images from various search engines and download them"""
search_engines = ['google', 'flickr']
api_keys = []
images_links = {}
keywords = ''
def __init__(self, api_keys):
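        # api_keys maps an engine name to a credentials pair:
        #   'google' -> (API key, custom search engine id)
        #   'flickr' -> (API key, API secret)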
for engine, _ in api_keys.items():
if engine not in self.search_engines:
error_msg = "Search engine " + engine + \
" not supported. Please use only the following engines: "
error_msg += ', '.join(self.search_engines)
self.error(error_msg)
self.api_keys = api_keys
@staticmethod
def error(msg):
"""Display an error message and exit"""
print("Error: ", msg)
exit()
def collect_links_from_web(self, keywords, number_links_per_engine,
remove_duplicated_links=False):
""" Crawl search engines to collect images links matching a list of keywords.
        Each keyword is processed separately; keywords may be a single string or a list of strings."""
# validate params:
number_links = int(number_links_per_engine)
if number_links <= 0:
print("Warning: number_links_per_engine must be positive, \
value changed to default (100 links)")
number_links = 100
if not keywords:
self.error("Error: No keyword")
        elif isinstance(keywords, str):
keywords = [keywords]
        self.keywords = keywords
# call methods for fetching image links in the selected search engines:
print("Start fetching...")
for keyword in keywords:
print("\nFetching for '", keyword, "' images...")
extracted_links = []
for engine, keys in self.api_keys.items():
try:
method = getattr(self, 'fetch_from_' + engine)
except AttributeError:
                    self.error('function fetch_from_' + engine + '() not defined')
temporary_links = method(keyword, keys[0], keys[1], number_links)
extracted_links += temporary_links
print("\r >> ", len(temporary_links), " links extracted", end="\n")
# remove duplicated links:
if remove_duplicated_links:
links_count = len(extracted_links)
extracted_links = list(set(extracted_links))
print(" >> ", links_count - len(extracted_links),
" duplicated links removed, ", len(extracted_links), " kept")
# store the links into the global list:
self.images_links[keyword] = extracted_links
def fetch_from_google(self, keyword, api_key, engine_id, number_links=50):
"""Fetch images from google using API Key"""
items_per_page = 10 # max 10 for google api
if number_links < 10:
items_per_page = number_links
        pages_nbr = int(ceil(float(number_links) / items_per_page))
        from googleapiclient.discovery import build  # imported lazily so the dependency is only required when Google is selected
links = []
# get first page and store:
print("Crawling Google Search Engine...")
service = build("customsearch", "v1", developerKey=api_key)
response = service.cse().list(q=keyword,
cx=engine_id,
searchType='image',
num=items_per_page,
fileType='jpg&img;png&img;bmp&img;gif&img',
fields='items/link, queries',
).execute()
items = response['items']
for image in items:
links.append(image['link'])
# get next pages:
for i in range(1, pages_nbr):
sys.stdout.flush()
response = service.cse().list(q=keyword,
cx=engine_id,
searchType='image',
num=items_per_page,
fileType='jpg&img;png&img;bmp&img;gif&img',
fields='items/link, queries',
start=response['queries']['nextPage'][0]['startIndex'],
).execute()
items = response['items']
for image in items:
links.append(image['link'])
print("\r >> ", len(links), " links extracted...", end="")
        # return the collected links:
        return links
def fetch_from_flickr(self, keyword, api_key, api_secret, number_links=50):
""" Fetch images from Flikr """
        from flickrapi import FlickrAPI  # imported lazily so the dependency is only required when Flickr is selected
# calculate number of pages:
if number_links < 200:
items_per_page = number_links
else:
            items_per_page = 200  # max 200 for Flickr
        pages_nbr = int(ceil(float(number_links) / items_per_page))
links = []
# get links from the first page:
print("Carwling Flickr Search...")
flickr = FlickrAPI(api_key, api_secret)
response = flickr.photos_search(api_key=api_key,
text=keyword,
per_page=items_per_page,
media='photos',
sort='relevance')
images = [im for im in list(response.iter()) if im.tag == 'photo']
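        # build each photo's static URL from its farm/server/id/secret fields
        # (Flickr's standard source-URL scheme)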
for photo in images:
photo_url = "https://farm{0}.staticflickr.com/{1}/{2}_{3}.jpg". format(
photo.get('farm'), photo.get('server'), photo.get('id'), photo.get('secret'))
links.append(photo_url)
print(" >> ", len(links), " links extracted...", end="")
# get next pages:
for i in range(1, pages_nbr):
response = flickr.photos_search(api_key=api_key,
text=keyword,
per_page=items_per_page,
media='photos',
                                            page=i + 1,
sort='relevance')
images = [im for im in list(response.iter()) if im.tag == 'photo']
for photo in images:
link = "https://farm{0}.staticflickr.com/{1}/{2}_{3}.jpg". format(
photo.get('farm'), photo.get('server'), photo.get('id'), photo.get('secret'))
links.append(link)
print("\r >> ", len(links), " links extracted...", end="")
        # return the collected links:
        return links
def save_urls_to_json(self, filename):
""" Save links to disk """
folder, _ = os.path.split(filename)
        if folder and not os.path.exists(folder):
os.makedirs(folder)
with open(filename, 'w') as links_file:
json.dump(self.images_links, links_file)
print("\nLinks saved to '", filename, "'")
def save_urls(self, filename):
""" Save links to disk """
folder, _ = os.path.split(filename)
        if folder and not os.path.exists(folder):
os.makedirs(folder)
with open(filename, 'w') as links_file:
for keyword in self.images_links:
for link in self.images_links[keyword]:
                    links_file.write(link + "\n")
print("\nLinks saved to '", filename, "'")
def load_urls(self, filename):
""" Load links from a file"""
if not os.path.isfile(filename):
self.error("Failed to load URLs, file '" + filename + "' does not exist")
with open(filename) as links_file:
self.images_links['images'] = []
for link in links_file:
                self.images_links['images'].append(link.strip())
print("\nLinks loaded from ", filename)
def load_urls_from_json(self, filename):
""" Load links from a json file"""
if not os.path.isfile(filename):
self.error("Failed to load URLs, file '" + filename + "' does not exist")
with open(filename) as links_file:
self.images_links = json.load(links_file)
print("\nLinks loaded from ", filename)
def download_images(self, target_folder='./data'):
""" Download images and store them in the specified folder """
print(" ")
downloader = images_downloader.ImagesDownloader()
if not target_folder:
target_folder = './data'
downloader.download(self.images_links, target_folder)
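

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module). The API keys
# below are placeholders and the keywords/target folder are arbitrary
# examples; replace them with real credentials before running.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    # each engine maps to a credentials pair: (API key, engine id) for Google,
    # (API key, API secret) for Flickr
    keys = {
        'google': ('YOUR_GOOGLE_API_KEY', 'YOUR_CUSTOM_SEARCH_ENGINE_ID'),
        'flickr': ('YOUR_FLICKR_API_KEY', 'YOUR_FLICKR_API_SECRET'),
    }
    crawler = WebCrawler(keys)
    crawler.collect_links_from_web(['cats', 'dogs'], 50, remove_duplicated_links=True)
    crawler.save_urls_to_json('./data/links.json')
    crawler.download_images('./data')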