Skip to content

Commit

Permalink
import client
Browse files Browse the repository at this point in the history
  • Loading branch information
lili19 committed May 17, 2021
1 parent 8a82a8d commit 698b05b
Show file tree
Hide file tree
Showing 6 changed files with 640 additions and 0 deletions.
44 changes: 44 additions & 0 deletions client/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import ctypes
import configparser
from crawler.__main__ import main
import os

def has_admin():
    """Return True when the current process has administrator/root rights.

    Where ``os.getuid`` exists the check is "uid == 0"; otherwise the
    Windows shell API ``IsUserAnAdmin`` is queried through ctypes.
    """
    get_uid = getattr(os, "getuid", None)
    if get_uid is not None:
        return get_uid() == 0
    # os.getuid is not available: fall back to the Windows shell32 check.
    return ctypes.windll.shell32.IsUserAnAdmin() != 0

if __name__ == '__main__':
    # Script entry point: read config.ini from the current directory and run
    # the crawler. When `lock_input` is "true" the Win32 BlockInput API is
    # switched on for the duration of the crawl, which requires the process
    # to run as administrator.
    import traceback

    if not os.path.isfile("config.ini"):
        ctypes.windll.user32.MessageBoxW(0, "当前目录下没有config.ini", "配置文件不存在", 0)
        # The bare `exit` name is injected by the `site` module and is not
        # guaranteed to exist; raising SystemExit always works.
        raise SystemExit(-1)

    parser = configparser.ConfigParser()
    parser.read('config.ini', encoding="UTF-8")
    lock_input = parser.get('basic', 'lock_input', fallback='False')
    print("lock input {}".format(lock_input))
    is_admin = has_admin()
    print("is_admin {}".format(is_admin))

    if lock_input.lower() != 'true':
        main(parser)
    elif not is_admin:
        ctypes.windll.user32.MessageBoxW(0, "请以管理员运行程序或者关掉lock_input", "没有权限", 0)
    else:
        # `ctypes.wintypes` is a submodule and is only reachable as an
        # attribute after it has been imported explicitly.
        from ctypes import wintypes

        BlockInput = ctypes.windll.user32.BlockInput
        BlockInput.argtypes = [wintypes.BOOL]
        BlockInput.restype = wintypes.BOOL

        if BlockInput(True):
            try:
                main(parser)
            except Exception:
                print("except")
                # Show what actually went wrong instead of only "except".
                traceback.print_exc()
            finally:
                BlockInput(False)  # unblock in any case
        else:
            ctypes.windll.user32.MessageBoxW(0, "请联系开发人员处理", "锁定输入失败", 0)

12 changes: 12 additions & 0 deletions client/config.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
[basic]
#wechat_path =
java_server = http://172.23.115.78:4567
#java_server = http://127.0.0.1:4567
max_crawl_pages = 3
win_width = 1000
win_height = 600
lock_input = False
first_pages = 200
latest_date = 2021-05-13
first_max_crawl_time = 3600
switch_gongzhonghao = 北京动物园
Empty file added client/crawler/__init__.py
Empty file.
241 changes: 241 additions & 0 deletions client/crawler/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,241 @@
import os
import configparser
from pathlib import Path
import time
from datetime import datetime
import requests
import json
import traceback
import re
import ctypes

from .wechatautomator import WechatAutomator

def url_in_states(url, states):
    """Return True if some entry of *states* has a ``'url'`` equal to *url*.

    *states* is an iterable of mappings that each carry a ``'url'`` key.
    """
    return any(url == entry['url'] for entry in states)


def send_heart_beat(wechat_id, type, java_server, timeout=10):
    """Report an activity heartbeat to the Java server (best effort).

    Posts ``{"wechatId": wechat_id, "activityType": type}`` as JSON to
    ``java_server + "/heartbeat"``. Failures are printed and never raised,
    so callers can fire-and-forget.

    Args:
        wechat_id: Value sent as ``wechatId``.
        type: Value sent as ``activityType``.
        java_server: Base URL of the Java server, without a trailing path.
        timeout: Seconds to wait for the server; without a timeout
            ``requests`` would wait indefinitely on an unresponsive host.
    """
    try:
        page = {"wechatId": wechat_id,
                "activityType": type}
        print("heart-beat: {}".format(page))
        response = requests.post(java_server + "/heartbeat", json=page,
                                 timeout=timeout)
        res = json.loads(response.text)
        if not res["success"]:
            print("heartbeat失败:{}".format(res))
    except Exception:
        # Best effort: keep running, but show why the heartbeat failed.
        print("heartbeat失败")
        traceback.print_exc()

def send_debug_info(wechat_id, debug_info, java_server, timeout=10):
    """Upload the collected debug information to the Java server (best effort).

    Posts *debug_info* as JSON to ``java_server + "/debuginfo"`` with
    *wechat_id* passed as the ``wechatId`` query parameter. Failures are
    printed and never raised.

    Args:
        wechat_id: Value sent as the ``wechatId`` query parameter.
        debug_info: JSON-serialisable object sent as the request body.
        java_server: Base URL of the Java server, without a trailing path.
        timeout: Seconds to wait for the server; without a timeout
            ``requests`` would wait indefinitely on an unresponsive host.
    """
    try:
        print("debug-info: {}".format(debug_info))
        # Passing the id through `params` lets requests URL-encode it.
        response = requests.post(java_server + "/debuginfo",
                                 params={"wechatId": wechat_id},
                                 json=debug_info, timeout=timeout)
        res = json.loads(response.text)
        if not res["success"]:
            print("send-debug-info失败:{}".format(res))
    except Exception:
        # Best effort: keep running, but show why the upload failed.
        print("send-debug-info失败")
        traceback.print_exc()

def add_to_detail(s, detail):
    """Append ``str(s)`` to the list *detail*, prefixed with a local timestamp.

    The entry has the form ``"YYYY-MM-DD HH:MM:SS <text>"``.
    """
    stamp = time.strftime("%Y-%m-%d %H:%M:%S")
    detail.append("{} {!s}".format(stamp, s))

# Seconds to wait for the Java server on one request; without a timeout
# `requests` would block the crawl loop forever on an unresponsive server.
_HTTP_TIMEOUT = 60


def _post_json(java_server, path, payload):
    """POST *payload* as JSON to ``java_server + path`` and return the decoded JSON reply."""
    response = requests.post(java_server + path, json=payload,
                             timeout=_HTTP_TIMEOUT)
    return json.loads(response.text)


def _log(message, detail):
    """Print *message* and record it (timestamped) in the *detail* list."""
    print(message)
    add_to_detail(message, detail)


def _load_settings(parser):
    """Read the crawler options from the ``[basic]`` section, applying defaults."""
    wechat_path = parser.get('basic', 'wechat_path', fallback=None)
    if wechat_path is not None and wechat_path.strip() == '':
        wechat_path = None
    java_server = parser.get('basic', 'java_server', fallback=None)
    if java_server is None or java_server.strip() == '':
        java_server = "http://localhost:4567"
    return {
        "max_crawl_pages": parser.getint('basic', 'max_crawl_pages', fallback=6),
        "crawl_pages": parser.getint('basic', 'crawl_pages', fallback=3),
        "win_width": parser.getint('basic', 'win_width', fallback=1000),
        "win_height": parser.getint('basic', 'win_height', fallback=600),
        "java_server": java_server,
        "wechat_path": wechat_path,
        "crawl_interval": parser.getint('basic', 'crawl_interval', fallback=1),
        "debug_count": parser.getint('basic', 'debug_count', fallback=10),
        "latest_date": parser.get('basic', 'latest_date', fallback=None),
        "first_max_crawl_time": parser.getint('basic', 'first_max_crawl_time',
                                              fallback=86400),
        "switch_gongzhonghao": parser.get('basic', 'switch_gongzhonghao',
                                          fallback=None),
        "first_pages": parser.getint('basic', 'first_pages', fallback=1),
    }


def _crawl_account(automator, name, settings, wechat_id, start_time, detail):
    """Crawl one account and upload every article the server does not know yet.

    Exceptions propagate to the caller, which logs them and always uploads
    the debug info afterwards.
    """
    java_server = settings["java_server"]
    _log("开始抓取: {}".format(name), detail)
    send_heart_beat(wechat_id, "start-" + name, java_server)

    # Ask the Java server which articles of this account it already has.
    rsp = _post_json(java_server, "/getstate", {"pubName": name})
    if not rsp["success"]:
        _log("获取states失败:{}".format(rsp.get("msg")), detail)
        return
    states = rsp["data"]
    for index, state in enumerate(states, start=1):
        # Only the first 49 states are recorded in the debug detail.
        if index < 50:
            add_to_detail("state: {}".format(state), detail)
        # NOTE(review): assumed to print every state, not only the first 49 —
        # confirm against the original nesting.
        print(state)

    max_pages = settings["max_crawl_pages"]
    latest_date = None
    if len(states) == 0:
        # The server has nothing for this account yet: first crawl.
        _log("首次抓取 {}".format(name), detail)
        now = int(time.time())
        if now - start_time >= settings["first_max_crawl_time"]:
            # This round already ran too long; skip the first crawl.
            _log("时间太长,跳过首次抓取 {}-{}".format(start_time, now), detail)
            try:
                automator.locate_user(name)
            except Exception:
                # Best effort, but no longer silent.
                traceback.print_exc()
            return
        max_pages = max(settings["max_crawl_pages"], settings["first_pages"])
        latest_date = settings["latest_date"]

    _log("curr_max: {}, curr_pages: {}".format(max_pages,
                                               settings["crawl_pages"]), detail)
    articles = []
    result = automator.crawl_gongzhonghao(name, articles,
                                          states=states, max_pages=max_pages,
                                          detail=detail, latest_date=latest_date)
    _log("抓取 {} 成功: {}".format(name, result), detail)
    if not result:
        return

    for url, _, title, html in articles:
        if url_in_states(url, states):
            continue
        _log("addurl: {}".format(url), detail)
        res = _post_json(java_server, "/addurl", {"url": url,
                                                  "crawlWechatId": wechat_id,
                                                  "title": title,
                                                  "pubName": name,
                                                  "html": html})
        if not res["success"]:
            _log("addurl失败:{}".format(res), detail)


def _wait_for_next_round(automator, wechat_id, java_server, start_time,
                         crawl_interval):
    """Sleep until ``crawl_interval`` hours after *start_time*, sending heartbeats.

    Wakes at least every five minutes, re-reads the WeChat id from the
    automator and reports whether it still equals *wechat_id*.
    """
    while True:
        remaining = 3600 * crawl_interval + start_time - int(time.time())
        if remaining <= 0:
            break
        time.sleep(min(5 * 60, remaining))
        same_id = automator.get_wechat_id() == wechat_id
        send_heart_beat(wechat_id, "heart-beat-{}".format(same_id), java_server)


def main(parser):
    """Run the crawl loop until interrupted.

    Each round re-reads ``gongzhonghao.txt`` (one account name per line,
    blank lines ignored), crawls every account through ``WechatAutomator``,
    uploads new articles to the Java server, and then waits until
    ``crawl_interval`` hours have passed since the round started.

    Args:
        parser: A ``configparser.ConfigParser`` whose ``[basic]`` section
            holds the crawler options; missing options fall back to defaults.

    Returns:
        None. Returns early when the automator window cannot be initialised,
        or when only one account is configured and ``switch_gongzhonghao``
        is not set.
    """
    settings = _load_settings(parser)
    java_server = settings["java_server"]
    debug_count = settings["debug_count"]
    switch_gongzhonghao = settings["switch_gongzhonghao"]

    print("max_crawl_pages: {}, crawl_pages: {}".format(
        settings["max_crawl_pages"], settings["crawl_pages"]))
    print("width: {}, height: {}".format(settings["win_width"],
                                         settings["win_height"]))
    print("java_server: {}".format(java_server))
    print("wechat_path: {}".format(settings["wechat_path"]))
    print("crawl_interval: {} hours".format(settings["crawl_interval"]))
    print("latest_date: {}".format(settings["latest_date"]))
    print("first_max_crawl_time: {}".format(settings["first_max_crawl_time"]))
    print("switch_gongzhonghao: {}".format(switch_gongzhonghao))
    print("first_pages: {}".format(settings["first_pages"]))

    cwd = os.getcwd()
    print("current directory {}".format(cwd))
    debug_info = dict(settings, cwd=cwd)

    automator = WechatAutomator()
    try:
        automator.init_window()
    except Exception:
        print("微信未启动或未登陆,请启动微信并扫码登陆后再运行本程序。")
        return
    wechat_id = automator.get_wechat_id()
    print("wechat id {}".format(wechat_id))
    debug_info["wechat_id"] = wechat_id
    debug_info["details"] = []

    while True:
        automator.move_window()
        start_time = int(time.time())
        if not Path("gongzhonghao.txt").is_file():
            message = "gongzhonghao.txt文件不存在,请创建后再运行"
            debug_info["error_msg"] = message
            print(message)
            time.sleep(60)
            continue
        with open('gongzhonghao.txt', encoding="UTF-8") as f:
            lines = f.read().splitlines()
        print("抓取的公众号列表:")
        for line in lines:
            print("\t{}".format(line))

        # Count only real names so a trailing blank line cannot hide the
        # single-account case.
        names = [line.strip() for line in lines if line.strip() != '']
        if len(names) == 1:
            if switch_gongzhonghao is None:
                print("只有一个公众号要抓取,需要配置 switch_gongzhonghao")
                ctypes.windll.user32.MessageBoxW(0, "请在config.ini配置switch_gongzhonghao或者增加公众号数量", "没有switch_gongzhonghao", 0)
                return
            try:
                automator.locate_user(switch_gongzhonghao)
            except Exception:
                # Best effort, but no longer silent.
                traceback.print_exc()

        for name in names:
            details = debug_info["details"]
            detail = []
            details.append(detail)
            # Keep only the most recent `debug_count` per-account logs.
            if len(details) > debug_count:
                debug_info["details"] = details[-debug_count:]

            try:
                _crawl_account(automator, name, settings, wechat_id,
                               start_time, detail)
            except Exception:
                traceback.print_exc()
            finally:
                send_debug_info(wechat_id, debug_info, java_server)

        _wait_for_next_round(automator, wechat_id, java_server, start_time,
                             settings["crawl_interval"])




Loading

0 comments on commit 698b05b

Please sign in to comment.