-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathlist_info.py
54 lines (50 loc) · 2.03 KB
/
list_info.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
from selenium import webdriver
from selenium.webdriver.common.by import By
from time import sleep
from bs4 import BeautifulSoup
import re
import requests
def get_html(url, s, email, password):
    """Open ``url`` in Chrome, fill in the login form, and return the page HTML.

    Args:
        url: Address loaded in the browser before logging in.
        s: Object passed to ``webdriver.Chrome`` as its ``service`` argument.
        email: Text typed into the first input of the login form.
        password: Text typed into the second input of the login form.

    Returns:
        A ``(html, driver)`` tuple: the page source captured after the login
        click, and the still-open driver. The caller owns the driver and must
        call ``driver.quit()`` when finished with it.

    Raises:
        Whatever Selenium raises when navigation or an element lookup fails.
        The browser is shut down before the exception propagates, so a failed
        login does not leave a stray Chrome process behind.
    """
    # Absolute XPaths tied to the current page layout.
    # NOTE(review): presumably the header "login" link, the e-mail/password
    # inputs, and the sign-in button -- confirm against the live page.
    login_link = '/html/body/header/div[1]/div/div[2]/ul/li[6]/a[2]/strong'
    email_input = '/html/body/div[4]/form/div[1]/div[1]/input'
    password_input = '/html/body/div[4]/form/div[1]/div[2]/input'
    submit_button = '/html/body/div[4]/form/div[2]/button[3]'

    driver = webdriver.Chrome(service=s)
    try:
        driver.get(url)
        driver.find_element(By.XPATH, login_link).click()
        sleep(3)  # fixed pause before the form inputs are looked up
        driver.find_element(By.XPATH, email_input).send_keys(email)
        driver.find_element(By.XPATH, password_input).send_keys(password)
        driver.find_element(By.XPATH, submit_button).click()
        sleep(3)  # fixed pause before the page source is captured
        html = driver.page_source
    except BaseException:
        # The caller never receives the driver on failure, so nobody else
        # could close it; quit here and let the original error propagate.
        driver.quit()
        raise
    return html, driver
def get_total_page_number():
    """Return how many listing pages the filings index spans.

    Downloads ``http://securities.stanford.edu/filings.html``, reads the
    first parenthesised integer inside the ``div.span6`` element, and divides
    it by 30 (the page size this code assumes), rounding up.

    Returns:
        The page count as an ``int``; never less than 1.

    Raises:
        requests.exceptions.RequestException: If the download fails or does
            not complete within the timeout.
        ValueError: If no ``(<digits>)`` count is present in the page.
    """
    # A timeout keeps an unresponsive server from blocking the scraper forever.
    req = requests.get("http://securities.stanford.edu/filings.html", timeout=30)
    soup = BeautifulSoup(req.text, 'html.parser')
    header = str(soup.find("div", class_="span6"))
    match = re.search(r"\((?P<number>\d+)\)", header)
    if match is None:
        raise ValueError(
            "could not find the total filing count on the filings page "
            f"(HTTP {req.status_code})"
        )
    total = int(match.group("number"))
    # Ceiling division: an exact multiple of 30 must not add an empty
    # trailing page. Keep at least one page even when the count is zero.
    return max(1, -(-total // 30))
def get_all_cases_in_one_page(url_, pn, s, email, password):
    """Fetch one listing page through the browser and return its case rows.

    Args:
        url_: URL prefix; ``str(pn)`` is appended to form the page address.
        pn: Page number to load.
        s: Forwarded to ``get_html`` (used there as the Chrome service).
        email: Login e-mail forwarded to ``get_html``.
        password: Login password forwarded to ``get_html``.

    Returns:
        A ``(rows, driver)`` tuple: every ``<tr class="table-link"
        page="filings">`` element found in the page, and the driver that
        ``get_html`` returned.
    """
    page_source, browser = get_html(url_ + str(pn), s, email, password)
    # Rows of interest look like:
    # <tr class="table-link" page="filings" onclick="window.location='filings-case.html?id=107058'">
    rows = BeautifulSoup(page_source, 'html.parser').find_all(
        "tr", class_="table-link", page="filings"
    )
    return rows, browser
def get_basic(one):
    """Extract the summary fields and detail link from one listing row.

    Args:
        one: A parsed table row. Its string form must contain ``id=<digits>``
            and it must expose ``find_all`` returning at least five cells
            that support ``get_text(strip=True)``.

    Returns:
        A tuple ``(name, date, court, exchange, ticker, link)`` where the
        first five items are the stripped text of the row's first five
        class-less ``<td>`` cells and ``link`` is the case detail URL built
        from the numeric id.

    Raises:
        ValueError: If the row markup contains no ``id=<digits>``.
        IndexError: If the row has fewer than five matching cells.
    """
    # Require at least one digit so an unrelated ``id="..."`` HTML attribute
    # earlier in the markup cannot match as an empty case id.
    match = re.search(r"id=(?P<id>\d+)", str(one))
    if match is None:
        raise ValueError("row does not contain a numeric case id")
    link = "http://securities.stanford.edu/filings-case.html?id=" + match.group("id")

    cells = one.find_all("td", class_="")
    if len(cells) < 5:
        raise IndexError(
            f"expected at least 5 cells in the row, found {len(cells)}"
        )
    name, date, court, exchange, ticker = (
        cell.get_text(strip=True) for cell in cells[:5]
    )
    return name, date, court, exchange, ticker, link