-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape_finviz_article.py
101 lines (86 loc) · 3.76 KB
/
scrape_finviz_article.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
"""
This module provides a base class with the general utilities for any Selenium-based web scraping needed by the simplifeed project.
It is not intended to be used directly; instead it should be inherited by child classes that account for each major publisher's differences in website formatting.
Selenium is used because many of the websites require JavaScript to run before they display properly.
"""
class SeleniumScraper:
    """Base class for Selenium-driven article scrapers.

    Creating an instance starts a Chrome session and loads the article URL,
    so that JavaScript on the page runs as it would for a real user.
    Subclasses normally override ``_getBody`` to match one publisher's page
    layout.

    The browser session stays open until ``close()`` is called. Instances
    can also be used as context managers, which closes the session on exit::

        with SeleniumScraper(url) as scraper:
            text = scraper.getBody()
    """

    @staticmethod
    def get_driver(url: str):
        """Start Chrome, load ``url`` and return the live driver.

        The chromedriver executable is expected to be named ``chromedriver``
        and to sit in the current working directory.

        Any exception raised while loading the page is re-raised after the
        browser has been shut down.
        """
        chrome_options = Options()
        for flag in (
            '--no-sandbox',
            '--disable-dev-shm-usage',
            '--log-level=3',
            "--enable-javascript",
            'disable-infobars',
            '--disable-extensions',
            '--disable-gpu',
            '--disable-software-rasterizer',
        ):
            chrome_options.add_argument(flag)
        serv = Service(os.path.join(os.getcwd(), 'chromedriver'))
        driver = webdriver.Chrome(options=chrome_options, service=serv)
        try:
            driver.get(url)
        except BaseException:
            # Do not leave an orphaned browser process behind when the page
            # load fails or is interrupted; the caller never sees the driver.
            driver.quit()
            raise
        return driver

    def __init__(self, url, title=None, body=None, date=None):
        """Open ``url`` in a new browser session and store article metadata.

        Args:
            url: Address of the article to scrape.
            title: Optional article title, stored as ``article_title``.
            body: Optional pre-set body text, stored as ``article_body``.
            date: Optional article date, stored as ``article_date``.
        """
        self.article_url = url
        self.driver = self.get_driver(self.article_url)
        self.article_body = body
        self.article_title = title
        self.article_date = date

    def close(self) -> None:
        """Shut down the browser session; safe to call more than once."""
        # getattr guards against instances whose __init__ never got as far as
        # assigning the driver.
        driver = getattr(self, "driver", None)
        if driver is not None:
            # Drop the reference first so a second call is a no-op even if
            # quit() itself raises.
            self.driver = None
            driver.quit()

    def __enter__(self):
        """Return the scraper itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Close the browser session when the ``with`` block ends."""
        self.close()

    def _getBody(self) -> None:
        """Store the text of every ``<p>`` element as ``article_body``.

        This default should be overridden in most subclasses to match the
        respective publisher's page format. Each paragraph's text is followed
        by a newline.
        """
        ps = self.driver.find_elements(By.TAG_NAME, value='p')
        self.article_body = "".join(p.text + "\n" for p in ps)

    def getBody(self):
        """Scrape the article body and return it.

        This is the public method the main scraping script is meant to call;
        it delegates the extraction to ``_getBody``.
        """
        self._getBody()
        return self.article_body

    def check_exists_by_xpath(self, xpath: str) -> bool:
        """Return True if ``xpath`` matches an element on the loaded page.

        XPath uses "path like" syntax to identify and navigate nodes in a
        website's HTML (or more generally in any XML document).
        For more info refer to https://www.w3schools.com/xml/xpath_intro.asp
        """
        try:
            self.driver.find_element(By.XPATH, value=xpath)
        except NoSuchElementException:
            return False
        return True
class ScraperReuters(SeleniumScraper):
    """Scrape news articles from Reuters."""

    def _getBody(self) -> None:
        """Store the article text as ``article_body``.

        Paragraphs are located by their ``data-testid`` attribute
        (``paragraph-0``, ``paragraph-1``, ...). Collection stops at the
        first index that has no matching ``<p>`` element. Each paragraph's
        text is followed by a newline.
        """
        paragraphs = []
        para_no = 0
        while True:
            xpath = f"//p[@data-testid='paragraph-{para_no}']"
            # One lookup per paragraph: a missing element signals the end of
            # the article, so no separate existence check is needed.
            try:
                reuters_para = self.driver.find_element(By.XPATH, value=xpath)
            except NoSuchElementException:
                break
            paragraphs.append(reuters_para.text + "\n")
            para_no += 1
        self.article_body = "".join(paragraphs)
class ScraperMarketWatch(SeleniumScraper):
    """Scrape news articles from MarketWatch."""

    def _getBody(self) -> None:
        """Store the article text as ``article_body``.

        Collects every ``<p>`` element nested under the element with id
        ``js-article__body``. Each paragraph's text is followed by a
        newline; the body is an empty string when nothing matches.
        """
        xpath = '//div[@id="js-article__body"]/descendant::p'
        # find_elements returns an empty list when there is no match, so a
        # separate existence check would only add a second browser query.
        marketwatch_paras = self.driver.find_elements(By.XPATH, value=xpath)
        self.article_body = "".join(
            marketwatch_para.text + "\n" for marketwatch_para in marketwatch_paras
        )