bilibili.py
# -*- encoding:utf8 -*-
from lxml import etree
import requests
from multiprocessing.dummy import Pool
import re
import time
myheader = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
urls = []
html_str = ''
result_name_list = []
# result_describ_list = []
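# Fetch one page with the browser-like User-Agent and pause a second between requests.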
def get_sourse(url, header=myheader):
    html = requests.get(url, headers=header)
    time.sleep(1)
    return html.text
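# Probe candidate bangumi IDs and drop any URL whose page does not respond with an OK status.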
for i in range(1, 10000):
    newpage = 'http://bangumi.bilibili.com/anime/' + str(i)
    urls.append(newpage)
    if not requests.get(newpage):
        urls.remove(newpage)
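# Download all surviving pages concurrently with a pool of 4 worker threads.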
pool = Pool(4)
result_html = pool.map(get_sourse, urls)
pool.close()
pool.join()
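# Concatenate every downloaded page into one string and parse the combined markup once.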
for i in result_html:
    html_str += str(i)
html = etree.HTML(html_str)
result_name = html.xpath('//title')
# result_describ = html.xpath('//meta/@content')
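# Keep only the anime name that precedes the "_番剧" suffix in each <title>.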
for i in result_name:
    result_name_list += re.findall(r'(.*?)_番剧.*?', i.text)
# for i in result_describ_list:
#     result_describ_list += i
with open('spider.txt', 'w') as f:
    f.writelines([line + '\n' for line in result_name_list])
#     f.writelines([line + '\n' for line in result_describ_list])