# -*- coding: utf-8 -*-
# gepuwang.py -- scrapes the sheet-music list pages at gepuwang.net, visiting
# each detail page; also runs a small Selenium smoke test against Baidu.
import re

import requests
from selenium import webdriver
from selenium.webdriver.common.by import By

session = requests.session()
gDoMain = "http://www.gepuwang.net"
def selenium_smoke_test():
    """Open headless Chrome, load Baidu, type a query, then quit."""
    option = webdriver.ChromeOptions()
    option.add_argument('--headless')
    # Selenium 4 API: pass options= (the old chrome_options= keyword is gone).
    driver = webdriver.Chrome(options=option)
    driver.get('https://www.baidu.com/')
    print('Browser opened')
    print(driver.title)
    # find_element_by_id was removed in Selenium 4; use find_element(By.ID, ...).
    driver.find_element(By.ID, 'kw').send_keys('test')
    print('Closing')
    driver.quit()
    print('Test finished')
def fetch_url(url):
    # Pages on gepuwang.net are served GBK-encoded, so decode accordingly.
    return session.get(url).content.decode('gbk')
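# A slightly more defensive variant, sketched here as an optional alternative
# (not used by the flow below): replace undecodable bytes instead of raising.
def fetch_url_lenient(url):
    return session.get(url).content.decode('gbk', errors='replace')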
def getPageNum(content):
    # Total page count, read from markup like "共 <strong>12</strong>页"
    # ("12 pages in total"); the lookbehind/lookahead keep only the digits.
    result = re.findall(r'(?<=共 <strong>)\d*(?=</strong>页)', content)[0]
    iPageNum = int(result)
    return iPageNum
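# Quick illustration of the page-count regex against a made-up fragment (the
# real list-page markup is assumed to match this shape):
#   >>> re.findall(r'(?<=共 <strong>)\d*(?=</strong>页)', '共 <strong>12</strong>页')
#   ['12']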
def analyzeList(listContent):
    # Parse the list page: collect every /hexianpu/<id>.html detail link.
    listResult = re.findall(r'(?<=<h3><a href=")/hexianpu/\d*\.html(?=" target="_blank">)', listContent)
    for slink in listResult:
        analyzeDetail(gDoMain + slink)

def analyzeDetail(detailUrl):
    # Parse the detail page; currently it only fetches the HTML and logs the URL.
    sDetailContent = fetch_url(detailUrl)
    print(detailUrl)
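# A minimal sketch of pulling something useful out of the detail HTML. The
# <title> tag is standard HTML, but what gepuwang.net puts in it is an
# assumption; this helper is illustrative only and is not called by main():
def parse_detail_title(sDetailContent):
    match = re.search(r'<title>(.*?)</title>', sDetailContent)
    return match.group(1) if match else None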
def main():
    url = 'http://www.gepuwang.net/hexianpu/list_62_1.html'  # first list page
    content = fetch_url(url)
    iPageNum = getPageNum(content)  # total page count
    iPageNum = 1  # override: crawl only the first page while testing
    for i in range(iPageNum):
        sUrl = 'http://www.gepuwang.net/hexianpu/list_62_' + str(i + 1) + '.html'
        sContent = fetch_url(sUrl)
        analyzeList(sContent)

if __name__ == "__main__":
    selenium_smoke_test()
    main()
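# Usage: run directly with Python 3. This assumes Chrome plus a matching
# chromedriver on PATH (for the smoke test) and that gepuwang.net is reachable:
#   python gepuwang.py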