"""
GoogleScraper is a simple image scraper, used to scrape google by a specific query tag.
For any further changes, please let me know what you think about this piece of code.
This code is one of a project solutions in the computer science department at INSA Rouen.
This is made to give (only) an illustration of Web Scraping concept.
This is not a full complete solution to scrape huge quantity of data.
Keep tracking this repository to get the latest updates.
"""
import os
import shutil
import requests
import time
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
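
# Dependency sketch (an assumption, based on the calls below): this module
# expects the Selenium 4 API (`selenium>=4`), the `requests` package, and a
# `chromedriver` binary matching the installed Chrome version, placed in the
# current working directory (see get_urls).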


class GoogleScraper:
    """
    # Parameters :
    ------------------------------------------------------------------------------------
    query : (or tag) a string which is used to search for the images
    name : the scraper name
    url_source : the main url of the Google Images results page that is scraped
    google_urls : list of urls which are taken from the results page
    images_urls : list of direct image urls (parsed from google_urls)
    # Functions :
    ------------------------------------------------------------------------------------
    get_urls() : scrapes google_urls from the results page at url_source
    parse_urls() : pre-processes the previously scraped google_urls into images_urls
    load_images() : downloads the images behind images_urls
    """
    def __init__(self, query, size):
        """
        GoogleScraper's constructor
        :param query: the tag that is used to search for the images
        :param size: the image size: 'l' for large, 'm' for medium and 's' for small
        """
        if size in ['l', 'm', 's']:
            self.url_source = 'https://www.google.com/search?q={}&tbm=isch&tbs=isz:{}'.format(query, size)
            self.query = query
            self.size = size
            self.name = 'google_scraper'
            # instance-level lists: class-level mutable defaults would be
            # shared between all GoogleScraper instances
            self.google_urls = []
            self.images_urls = []
        else:
            raise ValueError("The given size is not correct. Please make sure it's 'l', 'm' or 's'.")
    def get_urls(self):
        """
        This function scrapes the google image urls from the results page at url_source
        """
        # urls is the list of result urls that we will scrape
        urls = []
        # verify that the tag is provided
        if not self.query:
            print("The current object doesn't have a tag or the current tag is empty. Please provide one.")
        else:
            # instantiate a chrome options object to set the headless and window-size preferences
            chrome_options = Options()
            chrome_options.add_argument("--headless")
            chrome_options.add_argument("--window-size=1920x1080")
            # load chromedriver from the current directory (Selenium 4 API)
            chrome_driver = os.path.join(os.getcwd(), 'chromedriver')
            driver = webdriver.Chrome(service=Service(chrome_driver), options=chrome_options)
            driver.get(self.url_source)
            # NOTE : the class names below match Google's markup at the time of
            # writing and may break whenever Google changes its results page
            hyperlink_tags = driver.find_elements(By.XPATH, "//div[@class='rg_bx rg_di rg_el ivg-i']/a")
            for tag in hyperlink_tags:
                tmp = tag.get_attribute("href")
                # skip the self-referencing anchor of the results page
                to_eliminate = self.url_source + '#'
                if tmp != to_eliminate:
                    urls.append(tmp)
            driver.quit()
            self.google_urls = urls
    def parse_urls(self):
        """
        This function takes the google urls and pre-processes them to get the direct image urls
        """
        if self.google_urls and len(self.google_urls) == len(self.images_urls):
            print("The urls have already been parsed. You may now use load_images to download.")
        else:
            # hosts that require authentication and should be skipped
            auth_urls_avoided = ['photobucket']
            urls = []
            for url in self.google_urls:
                if url != self.url_source + '#' and not any(host in url for host in auth_urls_avoided):
                    # remove the google image response signature
                    # ([-1] keeps the url intact when the signature is absent)
                    url_google = url.split('https://www.google.com/imgres?imgurl=')[-1]
                    # drop the query suffix and decode the percent-encoding to get the right image url
                    url_image = url_google.split('&')[0].replace('%2F', '/').replace('%3A', ':')
                    urls.append(url_image)
            self.images_urls = urls
            print("The urls have been parsed successfully.")
    def load_images(self):
        """
        This function takes the parsed url strings 'images_urls' and downloads the associated images
        """
        if not self.images_urls:
            print("The scraping object doesn't have a list of urls to download."
                  + "\nPlease use .get_urls() then .parse_urls() to load the image urls.")
        else:
            number_error = 0
            i = 1
            for url in self.images_urls:
                url = verify_url(url)
                if url != '':
                    try:
                        response = requests.get(url, stream=True)
                    except requests.exceptions.ConnectionError:
                        number_error += 1
                        continue
                    directory = 'images/' + self.query
                    if not os.path.exists(directory):
                        os.makedirs(directory)
                    if response.status_code == 200:
                        tmp_time_str = str(datetime.now()).split('.')[0].replace(' ', '_')
                        with open(directory + '/google_{}.jpg'.format(str(i) + '_' + tmp_time_str), 'wb') as f:
                            response.raw.decode_content = True
                            shutil.copyfileobj(response.raw, f)
                        i = i + 1
                    else:
                        number_error += 1
                        print("Error [{}] : Cannot get the url! :(".format(str(response.status_code)))
                    # be polite to the image hosts between requests
                    time.sleep(0.5)
                else:
                    number_error += 1
                    print("OhOh ! : An invalid link :(")
            if number_error == 0:
                print("{} images have been downloaded successfully.".format(str(i - 1)))
            else:
                print("{} images have been downloaded; ".format(str(i - 1))
                      + "{} failed with an error.".format(str(number_error)))
            print('Google scraping has finished.')


def verify_url(url):
    """
    Verify the given url and return a valid url
    :param url: image url
    :return: the validated url, or an empty string if the url is not accepted
    """
    # NOTE : For now, we accept the .jpg format only
    valid_url = ''
    # 'https' contains 'http', so checking the prefix once is enough
    if url.startswith('http'):
        if '.jpg' in url:
            # truncate everything after the last occurrence of '.jpg'
            valid_url = '.jpg'.join(url.split('.jpg')[0:-1]) + '.jpg'
    return valid_url
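

# A minimal usage sketch (an illustration, not part of the original file):
# scrape the results page, parse the urls, then download the images.
# 'kittens' is just an example tag.
if __name__ == '__main__':
    scraper = GoogleScraper('kittens', 'm')
    scraper.get_urls()
    scraper.parse_urls()
    scraper.load_images()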