-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathInstaScraper.py
98 lines (84 loc) · 3.93 KB
/
InstaScraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
"""
InstaScraper is an images scraping spider class, used to scrape instagram by a given tag.
The url we are using is helping us to get a full dictionary of an instagram page. In this case,
we don't need to scrape the urls in the html DOM, a simple request retrieves the dictionary as
mentioned above.
For any further changes, let us know what you think about this program.
This program is created at the end of a project in the INSA Rouen engineering school.
This is a simple Web Scraping example or illustration and not a complete solution for images scraping.
"""
import shutil
import requests
import os
import time
from random import randint
class InstaScraper:
"""
# Parameters :
------------------------------------------------------------------------------------
tag : a string which is used in order to search for the images
name : the scraper name
url_source : the main url string, it is used to get the page's data dictionary
urls : list of urls which are taken from the page's data dictionary
# Functions :
------------------------------------------------------------------------------------
get_urls() : the function which is used to get the images urls from the url_source dictionary
load_images() : the function which is used to download the images by using self.urls
"""
tag = ''
name = 'insta_scraper'
url_source = ''
urls = ''
def __init__(self, tag):
"""
:param tag
"""
self.tag = tag
self.url_source = 'https://www.instagram.com/explore/tags/' + tag + '/?__a=1'
def get_urls(self):
"""
This function get the images urls from the url_source dictionary
"""
# urls_list is the urls list that we will scrape
urls_list = []
# verify if the tag is not provided
if self.tag == '':
print('The current object doesn\'t have a tag or the current tag is empty. Please provide one.')
else:
# request the url_source and get its content
response = requests.get(self.url_source)
# get the response's content as a dictionary
data_dict = response.json()
# urls_number is the number of available urls
urls_number = len(data_dict['graphql']['hashtag']['edge_hashtag_to_media']['edges'])
for i in range(0, urls_number):
is_video = data_dict['graphql']['hashtag']['edge_hashtag_to_media']['edges'][i]['node']['is_video']
# verify if the current element is a video, in case it is, we ignore it
if not is_video:
display_url = data_dict['graphql']['hashtag']['edge_hashtag_to_media']['edges'][i]['node'][
'display_url']
urls_list.append(display_url)
self.urls = urls_list
def load_images(self):
"""
This function takes the urls strings 'urls' and download the associated images
"""
# verify if self.urls is empty. in case it is not, we download the images from the urls list
if self.urls == '':
print('The scraping object doesn\'t have a dictionary of urls to scrape.'
+ '\n Please use .get_urls() to load the urls.')
else:
i = 1
for url in self.urls:
response = requests.get(url, stream = True)
directory = 'images/' + self.tag
if not os.path.exists(directory):
os.makedirs(directory)
if response.status_code == 200:
with open('./images/{}/insta_{}_direct.jpg'.format(self.tag, str(i)), 'wb') as f:
response.raw.decode_content = True
shutil.copyfileobj(response.raw, f)
i = i+1
else:
print("Error [{}] : Cannot get the url! :(".format(str(response.status_code)))
time.sleep(randint(1, 3))