-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrapebook.py
22 lines (21 loc) · 869 Bytes
/
scrapebook.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
import requests
import os,sys
from bs4 import BeautifulSoup
def scrape(num_books=375, out_dir="book_data", timeout=30):
    """Download the paragraph text of every URL listed in the book index files.

    For each book number ``i`` in ``1..num_books`` the file ``book<i>.txt`` in
    the current working directory is read line by line; every non-blank line
    is treated as a URL. The text of all ``<p>`` elements of the fetched page
    is concatenated and stored in ``<out_dir>/00<iii><nnn>.utf8``, where
    ``iii`` is the zero-padded book number and ``nnn`` the zero-padded,
    zero-based position of the URL among the non-blank lines of that book.

    Pages whose output file already exists are skipped, so an interrupted
    crawl can simply be started again.

    Args:
        num_books: Highest book number to process (index files are 1-based).
        out_dir: Directory that receives the ``.utf8`` files; created if
            it does not exist yet.
        timeout: Seconds to wait for each HTTP request before giving up.

    Raises:
        OSError: If an index file ``book<i>.txt`` cannot be opened (for
            example ``FileNotFoundError``) or the output cannot be written.
    """
    # exist_ok keeps a re-run from failing on the directory left by the
    # previous run; without it the "skip existing file" check is unreachable.
    os.makedirs(out_dir, exist_ok=True)

    for book in range(1, num_books + 1):
        page_no = -1
        with open("book" + str(book) + ".txt", "r") as url_list:
            for line in url_list:
                url = line.strip()
                if not url:
                    continue

                # Blank lines do not consume a page number.
                page_no += 1
                filename = "00{:03d}{:03d}.utf8".format(book, page_no)
                target = os.path.join(out_dir, filename)
                if os.path.isfile(target):
                    continue

                print(str(book) + "-->" + str(page_no))
                try:
                    resp = requests.get(url, timeout=timeout)
                except requests.RequestException as err:
                    # One unreachable page must not abort the whole crawl;
                    # it has no output file, so the next run retries it.
                    print("failed: " + url + " (" + str(err) + ")")
                    continue
                if resp.status_code != 200:
                    continue

                soup = BeautifulSoup(resp.text, "html.parser")
                text = "".join(p.get_text() for p in soup.find_all("p"))

                # Build the text first and write it in one go so that a
                # parse error never leaves an empty file behind that a
                # later run would mistake for a finished download.
                with open(target, "w", encoding="utf_8") as out:
                    out.write(text)
# Start the crawl unconditionally: this runs whenever the file is executed
# and also whenever it is imported, because the call is not guarded.
scrape()