From c37ec0ba1944dc4066a603021b7cea514cc5c237 Mon Sep 17 00:00:00 2001 From: rootphantomer Date: Wed, 17 Jul 2024 17:10:49 +0800 Subject: [PATCH] =?UTF-8?q?style(auto=5Fcomment=5Fplus,jdspider):=20?= =?UTF-8?q?=E4=BC=98=E5=8C=96=E4=BB=A3=E7=A0=81=E6=A0=BC=E5=BC=8F=E9=97=AE?= =?UTF-8?q?=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 优化代码格式,更符合 python 最新代码格式标准 --- auto_comment_plus.py | 26 +++++++++++++++----------- jdspider.py | 41 +++++++++++++++++++++++------------------ 2 files changed, 38 insertions(+), 29 deletions(-) diff --git a/auto_comment_plus.py b/auto_comment_plus.py index 6495419..46ad1b0 100644 --- a/auto_comment_plus.py +++ b/auto_comment_plus.py @@ -5,12 +5,12 @@ import argparse import copy -import urllib import logging import os import random import sys import time +import urllib import jieba # just for linting import jieba.analyse @@ -20,7 +20,6 @@ import jdspider - # constants CONFIG_PATH = "./config.yml" USER_CONFIG_PATH = "./config.user.yml" @@ -90,6 +89,7 @@ def format(self, record): # 评价生成 def generation(pname, _class: int = 0, _type: int = 1, opts: object = None): + result = [] opts = opts or {} items = ["商品名"] items.clear() @@ -135,6 +135,7 @@ def generation(pname, _class: int = 0, _type: int = 1, opts: object = None): opts["logger"].debug("_class is 1. Directly return name") return name else: + num = 0 if _type == 1: num = 6 elif _type == 0: @@ -318,11 +319,11 @@ def ordinary(N, opts=None): if not opts.get("dry_run"): opts["logger"].debug("Sending comment request") pj2 = requests.post(url2, headers=headers2, data=data2) + opts["logger"].info( + "发送请求后的状态码:{},text:{}".format(pj2.status_code, pj2.text) + ) else: opts["logger"].debug("Skipped sending comment request in dry run") - opts["logger"].info( - "发送请求后的状态码:{},text:{}".format(pj2.status_code, pj2.text) - ) if pj2.status_code == 200 and pj2.json()["success"]: # 当发送后的状态码 200,并且返回值里的 success 是 true 才是晒图成功,此外所有状态均为晒图失败 opts["logger"].info(f"\t{i}.评价订单\t{oname}[{oid}]并晒图成功") @@ -513,11 +514,11 @@ def review(N, opts=None): if not opts.get("dry_run"): opts["logger"].debug("Sending comment request") pj1 = requests.post(url1, headers=headers2, data=data1) + opts["logger"].debug( + "发送请求后的状态码:{},text:{}".format(pj1.status_code, pj1.text) + ) else: opts["logger"].debug("Skipped sending comment request in dry run") - opts["logger"].debug( - "发送请求后的状态码:{},text:{}".format(pj1.status_code, pj1.text) - ) opts["logger"].info("完成") opts["logger"].debug("Sleep time (s): %.1f", REVIEW_SLEEP_SEC) time.sleep(REVIEW_SLEEP_SEC) @@ -766,7 +767,8 @@ def main(opts=None): headers2 = { "cookie": ck.encode("utf-8"), - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.5735.110 Safari/537.36", + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/114.0.5735.110 Safari/537.36", "Connection": "keep-alive", "Cache-Control": "max-age=0", "X-Requested-With": "XMLHttpRequest", @@ -787,7 +789,8 @@ def main(opts=None): } headers = { "cookie": ck.encode("utf-8"), - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36", + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/98.0.4758.82 Safari/537.36", "Connection": "keep-alive", "Cache-Control": "max-age=0", "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="98", "Google Chrome";v="98"', @@ -795,7 +798,8 @@ def main(opts=None): "sec-ch-ua-platform": '"Windows"', "DNT": "1", "Upgrade-Insecure-Requests": "1", - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8," + "application/signed-exchange;v=b3;q=0.9", "Sec-Fetch-Site": "same-site", "Sec-Fetch-Mode": "navigate", "Sec-Fetch-User": "?1", diff --git a/jdspider.py b/jdspider.py index 103fdad..513f645 100644 --- a/jdspider.py +++ b/jdspider.py @@ -7,14 +7,14 @@ import random import re import sys -import time, yaml +import time from urllib.parse import quote, urlencode import requests +import yaml import zhon.hanzi from lxml import etree - # Reference: https://github.com/fxsjy/jieba/blob/1e20c89b66f56c9301b0feed211733ffaa1bd72a/jieba/__init__.py#L27 with open("./config.yml", "r", encoding="utf-8") as f: cfg = yaml.safe_load(f) @@ -35,7 +35,8 @@ def __init__(self, categlory): ) self.commentBaseUrl = "https://sclub.jd.com/comment/productPageComments.action?" self.headers = { - "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", + "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng," + "*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", "accept-encoding": "gzip, deflate, br", "accept-language": "zh-CN,zh;q=0.9", "cache-control": "max-age=0", @@ -47,10 +48,12 @@ def __init__(self, categlory): "sec-fetch-site": "none", "sec-fetch-user": "?1", "upgrade-insecure-requests": "1", - "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36", + "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/98.0.4758.82 Safari/537.36", } self.headers2 = { - "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", + "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng," + "*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", "accept-encoding": "gzip, deflate, br", "accept-language": "zh-CN,zh;q=0.9", "cache-control": "max-age=0", @@ -62,7 +65,8 @@ def __init__(self, categlory): "sec-fetch-site": "none", "sec-fetch-user": "?1", "upgrade-insecure-requests": "1", - "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36", + "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/98.0.4758.82 Safari/537.36", "cookie": cookie, } self.productsId = self.getId() @@ -72,10 +76,10 @@ def __init__(self, categlory): def getParamUrl(self, productid: str, page: str, score: str) -> tuple[dict, str]: params = { # 用于控制页数,页面信息数的数据,非常重要,必不可少,要不然会被JD识别出来,爬不出相应的数据。 - "productId": "%s" % (productid), - "score": "%s" % (score), # 1表示差评,2表示中评,3表示好评 + "productId": "%s" % productid, + "score": "%s" % score, # 1表示差评,2表示中评,3表示好评 "sortType": "5", - "page": "%s" % (page), + "page": "%s" % page, "pageSize": "10", "isShadowSku": "0", "rid": "0", @@ -90,8 +94,9 @@ def getHeaders( dict ): # 和初始的self.header不同,这是爬取某个商品的header,加入了商品id,我也不知道去掉了会怎样。 header = { - "Referer": "https://item.jd.com/%s.html" % (productid), - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36", + "Referer": "https://item.jd.com/%s.html" % productid, + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/75.0.3770.142 Safari/537.36", # "cookie": cookie, } return header @@ -110,7 +115,7 @@ def getId( def getData( self, maxPage: int, - score: str, + score: int, ): # maxPage是爬取评论的最大页数,每页10条数据。差评和好评的最大一般页码不相同,一般情况下:好评>>差评>中评 # maxPage遇到超出的页码会自动跳出,所以设大点也没有关系。 # score是指那种评价类型,好评3、中评2、差评1。 @@ -121,14 +126,14 @@ def getData( "爬取商品数量最多为8个,请耐心等待,也可以自行修改jdspider文件" ) if len(self.productsId) < 8: # limit the sum of products - sum = len(self.productsId) + sum_ = len(self.productsId) else: - sum: int = 3 - for j in range(sum): - id: str = self.productsId[j] + sum_: int = 3 + for j in range(sum_): + id_: str = self.productsId[j] # header = self.getHeaders(id) for i in range(1, maxPage): - param, url = self.getParamUrl(id, i, score) + param, url = self.getParamUrl(id_, str(i), str(score)) default_logger.info( f"正在爬取当前商品的评论信息>>>>>>>>>第:%d 个,第 %d 页" % (j + 1, i) @@ -152,7 +157,7 @@ def getData( continue if len((res_json["comments"])) == 0: default_logger.warning( - "页面次数已到:%d,超出范围(或未爬取到评论)" % (i) + "页面次数已到:%d,超出范围(或未爬取到评论)" % i ) break for cdit in res_json["comments"]: