-
Notifications
You must be signed in to change notification settings - Fork 18
/
Copy pathamazonScrapper.py
134 lines (100 loc) · 4.27 KB
/
amazonScrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import csv
import re # import Regular expression operations module
from time import gmtime, strftime
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup
# Build the Amazon search URL from the category typed by the user.
url_str = input("Enter the product category you want to search for: ")
# split() with no argument collapses runs of whitespace, so stray spaces in
# the input never produce empty keywords.
words = url_str.split()
if not words:
    # Nothing to search for: stop with a clear message instead of sending a
    # malformed request further down.
    raise SystemExit("No product category entered.")
# Works for any number of keywords: each one is percent-encoded (so characters
# such as "&" or "+" survive as literal text) and they are joined with "+",
# which the query string treats as a space.
url = (
    "https://www.amazon.com/s/ref=nb_sb_noss_1?url=search-alias%3Daps&field-keywords="
    + "+".join(quote_plus(word) for word in words)
)
# Add header: send a desktop-browser User-Agent with the request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36'}
# Without a timeout requests waits indefinitely for a server that never
# answers; 30 seconds bounds both the connect and the read phase.
r = requests.get(url, headers=headers, timeout=30)
# Debugging aid: print(r) shows the HTTP status of the response.
# Parse the raw response bytes with the standard-library HTML parser.
soup = BeautifulSoup(r.content, "html.parser")
# CSS class strings used to locate fields inside one search-result <li>.
TITLE_LINK_CLASS = "a-link-normal s-access-detail-page s-color-twister-title-link a-text-normal"
PRICE_SPAN_CLASS = "a-offscreen"
REVIEW_LINK_CLASS = "a-size-small a-link-normal a-text-normal"


def _extract_row(container, price_index):
    """Return the CSV fields for one search-result container.

    Args:
        container: A BeautifulSoup ``<li>`` tag for a single search result.
        price_index: Position of the price among the container's
            ``a-offscreen`` spans.

    Returns:
        A list ``[asin, name, price, num_reviews]`` of strings. A missing
        name or price becomes ``"N/A"``; a missing review count becomes
        ``"0"``.

    Raises:
        KeyError: If the container has no ``data-asin`` attribute.
    """
    # Product Asin
    asin = container["data-asin"]

    # Product Name: taken from the title attribute of the first title link.
    title_links = container.findAll("a", {"class": TITLE_LINK_CLASS})
    try:
        name = title_links[0]["title"]
    except (IndexError, KeyError):
        # No title link, or the link carries no title attribute.
        name = "N/A"

    # Product Price # span class="a-offscreen"
    price_spans = container.findAll("span", {"class": PRICE_SPAN_CLASS})
    try:
        price = price_spans[price_index].text
    except IndexError:
        # Fewer price spans than expected; record a placeholder instead of
        # aborting the whole export.
        price = "N/A"

    # Number of reviews: use the second matching link when there is more
    # than one, otherwise the only one.
    review_links = container.findAll("a", {"class": REVIEW_LINK_CLASS})
    if not review_links:
        num_reviews = "0"
    elif len(review_links) > 1:
        num_reviews = review_links[1].text
    else:
        num_reviews = review_links[0].text

    # Same field clean-up the file has always had: commas in the name become
    # "|", the dollar sign is dropped from the price, and thousands
    # separators are removed from the review count.
    return [
        asin,
        name.replace(",", "|"),
        price.replace("$", ""),
        num_reviews.replace(",", ""),
    ]


# Used this to beautify and inspect the html/xml data ----> http://jsbeautifier.org/
# The result page can be rendered in several layouts. Every layout is counted
# for diagnostics; only the sponsored and common ones are exported below.
# Style 1 of Amazon's product display
containers1 = soup.findAll("li", {"class":"s-result-item s-result-card-for-container a-declarative celwidget "})
print("containers style 1: ", len(containers1))
# page could be styled different, invoke query second style
containers2 = soup.findAll("li", {"class":"s-result-item s-result-card-for-container s-carded-grid celwidget "})
print("containers style 2: ", len(containers2))
# check for sponsored containers
sponsored_containers = soup.findAll("li", {"class":"s-result-item celwidget AdHolder"})
print("containers style 3 sponsored: ", len(sponsored_containers))
# check for the most common style
common_containers = soup.findAll("li", {"class":"s-result-item celwidget "})
print("containers style 4 common: ", len(common_containers))
# check for special style
containers3 = soup.findAll("li", {"class":"s-result-item s-col-span-12 celwidget "})
print("containers style 5 special", len(containers3))

# Csv writing setup
filename = "products.csv"
# Grab time and format string (UTC); it labels the price column.
timestamp = strftime("%Y-%m-%d %H:%M:%S", gmtime())
# The with-block closes the file even if a row fails to extract.
with open(filename, "w", encoding='utf-8') as f:
    # csv.writer quotes any field that still contains a comma (for example a
    # price such as "1,299.99"), so every row keeps exactly four columns.
    # lineterminator="\n" together with text-mode newline handling keeps the
    # row endings the same as a plain f.write(... + "\n").
    writer = csv.writer(f, lineterminator="\n")
    writer.writerow(["Asin", " Name", "Price : " + timestamp, " Number of Reviews"])
    # Sponsored results take the second a-offscreen span as the price
    # (presumably the first one holds a sponsorship label -- verify against
    # live markup).
    for container in sponsored_containers:
        writer.writerow(_extract_row(container, price_index=1))
    # Common results take the first a-offscreen span as the price.
    for container in common_containers:
        writer.writerow(_extract_row(container, price_index=0))