-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_collector.py
95 lines (82 loc) · 3.09 KB
/
data_collector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
from datetime import datetime

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
def scrapeData(url):
    """Scrape one product-detail page and return a timestamped snapshot.

    The selectors and the ``asin`` key target Amazon's product-page markup.
    Title, image and availability are required: if any of them cannot be
    located the Selenium exception propagates to the caller. Rating, the
    "About this item" bullets and the two detail tables are best-effort and
    fall back to placeholder values when the lookup fails.

    Args:
        url: Product URL containing a ``dp/`` segment; everything after the
            first ``dp/`` is stored as the ``asin`` value.

    Returns:
        A dict with the keys ``"url"``, ``"asin"`` and one further key that is
        the capture time formatted as ``%d/%m/%y %H:%M:%S``. That key maps to
        a dict holding ``title``, ``availability``, ``price``, ``image``,
        ``rating`` and ``about``, merged with the technical and additional
        detail tables (later entries override earlier ones on a name clash).

    Raises:
        IndexError: If ``url`` has no ``dp/`` segment.
        selenium.common.exceptions.WebDriverException: If a required element
            (title, image, availability, or the price of an in-stock item) is
            missing, or the browser session fails.
    """
    # Parse the URL first so a malformed one fails before Chrome is launched.
    asin = url.split('dp/')[1]

    DRIVER_PATH = '/path/to/chromedriver'
    # Headless mode
    options = Options()
    # NOTE(review): `options.headless` and the `executable_path` argument are
    # deprecated in Selenium 4 and dropped in later 4.x releases; confirm the
    # pinned Selenium version before migrating to `Service`/`--headless`.
    options.headless = True
    options.add_argument("--window-size=1920,1200")
    driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)

    # Everything that touches the session lives inside try/finally so the
    # browser process is always shut down, including on a failed lookup.
    try:
        driver.get(url)

        title = driver.find_element(
            By.XPATH,
            "//span[@id='productTitle'][@class='a-size-large product-title-word-break']",
        ).text
        img_link = driver.find_element(
            By.XPATH,
            "//span[@class='a-declarative']/div[@id='imgTagWrapperId']/img",
        ).get_attribute('src')
        availability = driver.find_element(
            By.XPATH, "//div[@id='availability']/span"
        ).text

        # The price is only looked up when the page reports the item in stock.
        price = "-"
        if "In stock" in availability:
            price = driver.find_element(
                By.XPATH, "//span[@class='a-price-whole']"
            ).text

        rating = "-"
        try:
            rating = driver.find_element(
                By.XPATH, "//th[text()='Customer Reviews']/../td"
            ).text
        except WebDriverException:
            pass  # best-effort field: keep the placeholder

        # NOTE(review): the parsed ranks are not included in the returned
        # payload (the original code never used them either); confirm whether
        # they should be added to the snapshot or this lookup removed.
        best_seller_data = {"-": "-"}
        try:
            rank_text = driver.find_element(
                By.XPATH, "//th[text()=' Best Sellers Rank ']/../td"
            ).text
            best_seller_data = _parse_best_seller_ranks(rank_text)
        except (WebDriverException, IndexError, ValueError):
            pass  # row missing or in an unexpected format

        about_item = "-"
        try:
            bullets = driver.find_elements(
                By.XPATH,
                "//div[@id='feature-bullets']/h1[text()=' About this item ']/../ul/li",
            )
            about_item = "\n".join(bullet.text for bullet in bullets)
        except WebDriverException:
            pass  # best-effort field: keep the placeholder

        technical_details = _read_details_table(
            driver, "productDetails_techSpec_section_1"
        )
        additional_details = _read_details_table(
            driver, "productDetails_detailBullets_sections1"
        )
    finally:
        driver.quit()

    curr_time = datetime.now().strftime('%d/%m/%y %H:%M:%S')
    return {
        "url": url,
        "asin": asin,
        curr_time: {
            "title": title,
            "availability": availability,
            "price": price,
            "image": img_link,
            "rating": rating,
            "about": about_item,
            **technical_details,
            **additional_details,
        },
    }


def _parse_best_seller_ranks(rank_text):
    """Turn lines shaped like ``#1,234 in Category (...)`` into {category: rank}."""
    ranks = {}
    for line in rank_text.split("\n"):
        parts = line.split(" in ")
        rank = int(parts[0].split("#")[1].replace(',', ''))
        category = parts[1].split(" (")[0]
        ranks[category] = rank
    return ranks


def _read_details_table(driver, table_id):
    """Read the th/td rows of the ``prodDetTable`` with the given id into a dict.

    Returns ``{"-": "-"}`` when the row lookup itself fails; if a row fails
    part-way through, the rows collected so far are returned.
    """
    details = {"-": "-"}
    try:
        rows = driver.find_elements(
            By.XPATH,
            f"//table[@class='a-keyvalue prodDetTable'][@id='{table_id}']/tbody/tr",
        )
        details = {}
        for row in rows:
            name = row.find_element(By.XPATH, ".//th").text
            details[name] = row.find_element(By.XPATH, ".//td").text
    except WebDriverException:
        pass  # best-effort: keep whatever was collected
    return details