Skip to content

Commit

Permalink
style(auto_comment_plus,jdspider): 优化代码格式问题
Browse files Browse the repository at this point in the history
优化代码格式,更符合 python 最新代码格式标准
  • Loading branch information
rootphantomer committed Jul 17, 2024
1 parent 1f4e4a7 commit c37ec0b
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 29 deletions.
26 changes: 15 additions & 11 deletions auto_comment_plus.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@

import argparse
import copy
import urllib
import logging
import os
import random
import sys
import time
import urllib

import jieba # just for linting
import jieba.analyse
Expand All @@ -20,7 +20,6 @@

import jdspider


# constants
CONFIG_PATH = "./config.yml"
USER_CONFIG_PATH = "./config.user.yml"
Expand Down Expand Up @@ -90,6 +89,7 @@ def format(self, record):

# 评价生成
def generation(pname, _class: int = 0, _type: int = 1, opts: object = None):
result = []
opts = opts or {}
items = ["商品名"]
items.clear()
Expand Down Expand Up @@ -135,6 +135,7 @@ def generation(pname, _class: int = 0, _type: int = 1, opts: object = None):
opts["logger"].debug("_class is 1. Directly return name")
return name
else:
num = 0
if _type == 1:
num = 6
elif _type == 0:
Expand Down Expand Up @@ -318,11 +319,11 @@ def ordinary(N, opts=None):
if not opts.get("dry_run"):
opts["logger"].debug("Sending comment request")
pj2 = requests.post(url2, headers=headers2, data=data2)
opts["logger"].info(
"发送请求后的状态码:{},text:{}".format(pj2.status_code, pj2.text)
)
else:
opts["logger"].debug("Skipped sending comment request in dry run")
opts["logger"].info(
"发送请求后的状态码:{},text:{}".format(pj2.status_code, pj2.text)
)
if pj2.status_code == 200 and pj2.json()["success"]:
# 当发送后的状态码 200,并且返回值里的 success 是 true 才是晒图成功,此外所有状态均为晒图失败
opts["logger"].info(f"\t{i}.评价订单\t{oname}[{oid}]并晒图成功")
Expand Down Expand Up @@ -513,11 +514,11 @@ def review(N, opts=None):
if not opts.get("dry_run"):
opts["logger"].debug("Sending comment request")
pj1 = requests.post(url1, headers=headers2, data=data1)
opts["logger"].debug(
"发送请求后的状态码:{},text:{}".format(pj1.status_code, pj1.text)
)
else:
opts["logger"].debug("Skipped sending comment request in dry run")
opts["logger"].debug(
"发送请求后的状态码:{},text:{}".format(pj1.status_code, pj1.text)
)
opts["logger"].info("完成")
opts["logger"].debug("Sleep time (s): %.1f", REVIEW_SLEEP_SEC)
time.sleep(REVIEW_SLEEP_SEC)
Expand Down Expand Up @@ -766,7 +767,8 @@ def main(opts=None):

headers2 = {
"cookie": ck.encode("utf-8"),
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.5735.110 Safari/537.36",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/114.0.5735.110 Safari/537.36",
"Connection": "keep-alive",
"Cache-Control": "max-age=0",
"X-Requested-With": "XMLHttpRequest",
Expand All @@ -787,15 +789,17 @@ def main(opts=None):
}
headers = {
"cookie": ck.encode("utf-8"),
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/98.0.4758.82 Safari/537.36",
"Connection": "keep-alive",
"Cache-Control": "max-age=0",
"sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="98", "Google Chrome";v="98"',
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": '"Windows"',
"DNT": "1",
"Upgrade-Insecure-Requests": "1",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,"
"application/signed-exchange;v=b3;q=0.9",
"Sec-Fetch-Site": "same-site",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-User": "?1",
Expand Down
41 changes: 23 additions & 18 deletions jdspider.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,14 @@
import random
import re
import sys
import time, yaml
import time
from urllib.parse import quote, urlencode

import requests
import yaml
import zhon.hanzi
from lxml import etree


# Reference: https://github.com/fxsjy/jieba/blob/1e20c89b66f56c9301b0feed211733ffaa1bd72a/jieba/__init__.py#L27
with open("./config.yml", "r", encoding="utf-8") as f:
cfg = yaml.safe_load(f)
Expand All @@ -35,7 +35,8 @@ def __init__(self, categlory):
)
self.commentBaseUrl = "https://sclub.jd.com/comment/productPageComments.action?"
self.headers = {
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,"
"*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"accept-encoding": "gzip, deflate, br",
"accept-language": "zh-CN,zh;q=0.9",
"cache-control": "max-age=0",
Expand All @@ -47,10 +48,12 @@ def __init__(self, categlory):
"sec-fetch-site": "none",
"sec-fetch-user": "?1",
"upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/98.0.4758.82 Safari/537.36",
}
self.headers2 = {
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,"
"*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"accept-encoding": "gzip, deflate, br",
"accept-language": "zh-CN,zh;q=0.9",
"cache-control": "max-age=0",
Expand All @@ -62,7 +65,8 @@ def __init__(self, categlory):
"sec-fetch-site": "none",
"sec-fetch-user": "?1",
"upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/98.0.4758.82 Safari/537.36",
"cookie": cookie,
}
self.productsId = self.getId()
Expand All @@ -72,10 +76,10 @@ def __init__(self, categlory):

def getParamUrl(self, productid: str, page: str, score: str) -> tuple[dict, str]:
params = { # 用于控制页数,页面信息数的数据,非常重要,必不可少,要不然会被JD识别出来,爬不出相应的数据。
"productId": "%s" % (productid),
"score": "%s" % (score), # 1表示差评,2表示中评,3表示好评
"productId": "%s" % productid,
"score": "%s" % score, # 1表示差评,2表示中评,3表示好评
"sortType": "5",
"page": "%s" % (page),
"page": "%s" % page,
"pageSize": "10",
"isShadowSku": "0",
"rid": "0",
Expand All @@ -90,8 +94,9 @@ def getHeaders(
dict
): # 和初始的self.header不同,这是爬取某个商品的header,加入了商品id,我也不知道去掉了会怎样。
header = {
"Referer": "https://item.jd.com/%s.html" % (productid),
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36",
"Referer": "https://item.jd.com/%s.html" % productid,
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/75.0.3770.142 Safari/537.36",
# "cookie": cookie,
}
return header
Expand All @@ -110,7 +115,7 @@ def getId(
def getData(
self,
maxPage: int,
score: str,
score: int,
): # maxPage是爬取评论的最大页数,每页10条数据。差评和好评的最大一般页码不相同,一般情况下:好评>>差评>中评
# maxPage遇到超出的页码会自动跳出,所以设大点也没有关系。
# score是指哪种评价类型,好评3、中评2、差评1。
Expand All @@ -121,14 +126,14 @@ def getData(
"爬取商品数量最多为8个,请耐心等待,也可以自行修改jdspider文件"
)
if len(self.productsId) < 8: # limit the sum of products
sum = len(self.productsId)
sum_ = len(self.productsId)
else:
sum: int = 3
for j in range(sum):
id: str = self.productsId[j]
sum_: int = 3
for j in range(sum_):
id_: str = self.productsId[j]
# header = self.getHeaders(id)
for i in range(1, maxPage):
param, url = self.getParamUrl(id, i, score)
param, url = self.getParamUrl(id_, str(i), str(score))
default_logger.info(
f"正在爬取当前商品的评论信息>>>>>>>>>第:%d 个,第 %d 页"
% (j + 1, i)
Expand All @@ -152,7 +157,7 @@ def getData(
continue
if len((res_json["comments"])) == 0:
default_logger.warning(
"页面次数已到:%d,超出范围(或未爬取到评论)" % (i)
"页面次数已到:%d,超出范围(或未爬取到评论)" % i
)
break
for cdit in res_json["comments"]:
Expand Down

0 comments on commit c37ec0b

Please sign in to comment.