This commit is contained in:
dylan 2022-10-02 13:38:20 +08:00
parent 6eb59df025
commit ee95f69065
2 changed files with 81 additions and 49 deletions

View File

@ -1,7 +1,12 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# @Time : 2022/2/8 20:50 # 自动带图评价、追评、服务评价需电脑端CK
# @Author : @qiu-lzsnmb and @Dimlitter # @Time : 2022/10/2
# @File : auto_comment_plus.py # @Author : @qiu-lzsnmb and @Dimlitter @Dylan
# @File : auto_comment.py
'''
new Env('自动评价');
8 8 2 10 * https://raw.githubusercontent.com/6dylan6/auto_comment/main/jd_comment.py
'''
import argparse import argparse
import copy import copy
@ -10,12 +15,24 @@ import os
import random import random
import sys import sys
import time import time
import requests
try:
import jieba # just for linting import jieba # just for linting
import jieba.analyse import jieba.analyse
import requests #import yaml
import yaml
from lxml import etree from lxml import etree
except:
os.system('pip3 install lxml &> /dev/null')
os.system('pip3 install jieba &> /dev/null')
os.system('pip3 install zhon &> /dev/null')
import jieba # just for linting
import jieba.analyse
#import yaml
from lxml import etree
import jdspider import jdspider
@ -95,7 +112,7 @@ def generation(pname, _class=0, _type=1, opts=None):
for i, item in enumerate(items): for i, item in enumerate(items):
opts['logger'].debug('Loop: %d / %d', i + 1, loop_times) opts['logger'].debug('Loop: %d / %d', i + 1, loop_times)
opts['logger'].debug('Current item: %s', item) opts['logger'].debug('Current item: %s', item)
spider = jdspider.JDSpider(item) spider = jdspider.JDSpider(item,ck)
opts['logger'].debug('Successfully created a JDSpider instance') opts['logger'].debug('Successfully created a JDSpider instance')
# 增加对增值服务的评价鉴别 # 增加对增值服务的评价鉴别
if "赠品" in pname or "非实物" in pname or "增值服务" in pname: if "赠品" in pname or "非实物" in pname or "增值服务" in pname:
@ -219,7 +236,7 @@ def sunbw(N, opts=None):
'//*[@id="main"]/div[2]/div[2]/table') '//*[@id="main"]/div[2]/div[2]/table')
opts['logger'].debug('Count of fetched order data: %d', len(elems)) opts['logger'].debug('Count of fetched order data: %d', len(elems))
Order_data.extend(elems) Order_data.extend(elems)
opts['logger'].info(f"当前共有{N['待评价订单']}个需要晒单。") opts['logger'].info(f"当前共有{N['待评价订单']}个需要评价晒单。")
opts['logger'].debug('Commenting on items') opts['logger'].debug('Commenting on items')
for i, Order in enumerate(Order_data): for i, Order in enumerate(Order_data):
if i > 1: if i > 1:
@ -270,7 +287,7 @@ def sunbw(N, opts=None):
imgurl = imgdata["imgComments"]["imgList"][0]["imageUrl"] imgurl = imgdata["imgComments"]["imgList"][0]["imageUrl"]
opts['logger'].debug('Image URL: %s', imgurl) opts['logger'].debug('Image URL: %s', imgurl)
opts['logger'].info(f'\t\t图片url={imgurl}') opts['logger'].info(f'\t图片url={imgurl}')
# 提交晒单 # 提交晒单
opts['logger'].debug('Preparing for commenting') opts['logger'].debug('Preparing for commenting')
url2 = "https://club.jd.com/myJdcomments/saveProductComment.action" url2 = "https://club.jd.com/myJdcomments/saveProductComment.action"
@ -307,7 +324,6 @@ def review(N, opts=None):
req_et = [] req_et = []
Order_data = [] Order_data = []
loop_times = 2 loop_times = 2
print(loop_times)
opts['logger'].debug('Fetching website data') opts['logger'].debug('Fetching website data')
opts['logger'].debug('Total loop times: %d', loop_times) opts['logger'].debug('Total loop times: %d', loop_times)
for i in range(loop_times): for i in range(loop_times):
@ -350,7 +366,7 @@ def review(N, opts=None):
opts['logger'].info(f"当前共有{N['待追评']}个需要追评。") opts['logger'].info(f"当前共有{N['待追评']}个需要追评。")
opts['logger'].debug('Commenting on items') opts['logger'].debug('Commenting on items')
for i, Order in enumerate(Order_data): for i, Order in enumerate(Order_data):
if i > 1: if i + 1 > 1:
opts['logger'].info(f'\t已评价10个订单跳出') opts['logger'].info(f'\t已评价10个订单跳出')
break break
oname = Order.xpath('td[1]/div/div[2]/div/a/text()')[0] oname = Order.xpath('td[1]/div/div[2]/div/a/text()')[0]
@ -366,7 +382,7 @@ def review(N, opts=None):
opts['logger'].debug('oid: %s', oid) opts['logger'].debug('oid: %s', oid)
opts['logger'].info(f'\t开始第{i+1}个订单: {oid}') opts['logger'].info(f'\t开始第{i+1}个订单: {oid}')
_, context = generation(oname, _type=0, opts=opts) _, context = generation(oname, _type=0, opts=opts)
opts['logger'].info(f'\t\t追评内容:{context}') opts['logger'].info(f'\t追评内容:{context}')
data1 = { data1 = {
'orderId': oid, 'orderId': oid,
'productId': pid, 'productId': pid,
@ -384,8 +400,6 @@ def review(N, opts=None):
opts['logger'].debug('Sleep time (s): %.1f', REVIEW_SLEEP_SEC) opts['logger'].debug('Sleep time (s): %.1f', REVIEW_SLEEP_SEC)
time.sleep(REVIEW_SLEEP_SEC) time.sleep(REVIEW_SLEEP_SEC)
N['待追评'] -= 1 N['待追评'] -= 1
if i + 1 > 1:
break
return N return N
@ -436,7 +450,7 @@ def Service_rating(N, opts=None):
opts['logger'].info(f"当前共有{N['服务评价']}个需要服务评价。") opts['logger'].info(f"当前共有{N['服务评价']}个需要服务评价。")
opts['logger'].debug('Commenting on items') opts['logger'].debug('Commenting on items')
for i, Order in enumerate(Order_data): for i, Order in enumerate(Order_data):
if i > 1: if i + 1 > 1:
opts['logger'].info(f'\t已评价10个订单跳出') opts['logger'].info(f'\t已评价10个订单跳出')
break break
oname = Order.xpath('td[1]/div[1]/div[2]/div/a/text()')[0] oname = Order.xpath('td[1]/div[1]/div[2]/div/a/text()')[0]
@ -487,7 +501,7 @@ def main(opts=None):
N = No(opts) N = No(opts)
opts['logger'].debug('N value after executing No(): %s', N) opts['logger'].debug('N value after executing No(): %s', N)
if not N: if not N:
opts['logger'].error('Ck错误请检查重新抓取') opts['logger'].error('CK错误请确认是否电脑版CK')
exit() exit()
if N['待评价订单'] != 0: if N['待评价订单'] != 0:
opts['logger'].info("1.开始评价晒单") opts['logger'].info("1.开始评价晒单")
@ -553,11 +567,11 @@ if __name__ == '__main__':
# NOTE: The alignment number should be set to 19 considering the style # NOTE: The alignment number should be set to 19 considering the style
# controlling characters. When it comes to the file logger, the number should # controlling characters. When it comes to the file logger, the number should
# be set to 8. # be set to 8.
formatter = StyleFormatter('%(asctime)s %(levelname)-19s %(message)s') formatter = StyleFormatter('%(asctime)s %(levelname)-19s %(message)s',"%F %T")
rawformatter = StyleFormatter('%(asctime)s %(levelname)-8s %(message)s', use_style=False) rawformatter = StyleFormatter('%(asctime)s %(levelname)-8s %(message)s', use_style=False)
console = logging.StreamHandler() console = logging.StreamHandler()
console.setLevel(_logging_level) console.setLevel(_logging_level)
console.setFormatter(formatter) console.setFormatter(logging.Formatter('%(message)s'))
logger.addHandler(console) logger.addHandler(console)
opts['logger'] = logger opts['logger'] = logger
# It's a hack!!! # It's a hack!!!
@ -595,24 +609,24 @@ if __name__ == '__main__':
logger.debug(' SERVICE_RATING_SLEEP_SEC: %s', SERVICE_RATING_SLEEP_SEC) logger.debug(' SERVICE_RATING_SLEEP_SEC: %s', SERVICE_RATING_SLEEP_SEC)
# parse configurations # parse configurations
logger.debug('Reading the configuration file') #logger.debug('Reading the configuration file')
if os.path.exists(USER_CONFIG_PATH): #if os.path.exists(USER_CONFIG_PATH):
logger.debug('User configuration file exists') #logger.debug('User configuration file exists')
_cfg_path = USER_CONFIG_PATH #_cfg_path = USER_CONFIG_PATH
else: #else:
logger.debug('User configuration file doesn\'t exist, fallback to the default one') #logger.debug('User configuration file doesn\'t exist, fallback to the default one')
_cfg_path = CONFIG_PATH #_cfg_path = CONFIG_PATH
with open(_cfg_path, 'r', encoding='utf-8') as f: # with open(_cfg_path, 'r', encoding='utf-8') as f:
cfg = yaml.safe_load(f) #cfg = yaml.safe_load(f)
logger.debug('Closed the configuration file') #print()
logger.debug('Configurations in Python-dict format: %s', cfg) #logger.debug('Closed the configuration file')
if "JD_COOKIE" in os.environ: #logger.debug('Configurations in Python-dict format: %s', cfg)
if "PC_COOKIE" in os.environ:
if len(os.environ["PC_COOKIE"]) > 1: if len(os.environ["PC_COOKIE"]) > 1:
ck = os.environ["PC_COOKIE"] ck = os.environ["PC_COOKIE"]
logger.info ("已获取并使用Env环境 Cookie") logger.info ("已获取并使用Env环境 Cookie")
#ck = cfg['user']['cookie'] #ck = cfg['user']['cookie']
headers = { headers = {
'cookie': ck.encode("utf-8"), 'cookie': ck.encode("utf-8"),
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36',

View File

@ -1,5 +1,5 @@
# @Time : 2022/2/8 20:50 # @Time : 2022/2/10
# @Author :@Zhang Jiale and @Dimlitter # @Author :@Zhang Jiale and @Dimlitter @Dylan
# @File : jdspider.py # @File : jdspider.py
import json import json
@ -24,7 +24,7 @@ default_logger.addHandler(log_console)
class JDSpider: class JDSpider:
# 爬虫实现类传入商品类别如手机、电脑构造实例。然后调用getData搜集数据。 # 爬虫实现类传入商品类别如手机、电脑构造实例。然后调用getData搜集数据。
def __init__(self, categlory): def __init__(self, categlory, ck):
# jD起始搜索页面 # jD起始搜索页面
self.startUrl = "https://search.jd.com/Search?keyword=%s&enc=utf-8" % ( self.startUrl = "https://search.jd.com/Search?keyword=%s&enc=utf-8" % (
quote(categlory)) quote(categlory))
@ -42,11 +42,12 @@ class JDSpider:
'sec-fetch-site': 'none', 'sec-fetch-site': 'none',
'sec-fetch-user': '?1', 'sec-fetch-user': '?1',
'upgrade-insecure-requests': '1', 'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36' 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
} }
self.productsId = self.getId() self.productsId = self.getId()
self.comtype = {1: "nagetive", 2: "medium", 3: "positive"} self.comtype = {1: "差评", 2: "中评", 3: "好评"}
self.categlory = categlory self.categlory = categlory
self.ck = ck
self.iplist = { self.iplist = {
'http': [], 'http': [],
'https': [] 'https': []
@ -67,15 +68,32 @@ class JDSpider:
return params, url return params, url
def getHeaders(self, productid): # 和初始的self.header不同这是搜集某个商品的header加入了商品id我也不知道去掉了会怎样。 def getHeaders(self, productid): # 和初始的self.header不同这是搜集某个商品的header加入了商品id我也不知道去掉了会怎样。
header = {"Referer": "https://item.jd.com/%s.html" % (productid), header = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36" "Referer": "https://item.jd.com/%s.html" % (productid),
"Host": "sclub.jd.com",
"Connection": "keep-alive",
"Pragma": "no-cache",
"Cache-Control": "no-cache",
"sec-ch-ua": '"Chromium";v="21", " Not;A Brand";v="99"',
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": "Windows",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"Sec-Fetch-Site": "none",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-User": "?1",
"Sec-Fetch-Dest": "document",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "zh-CN,zh;q=0.9",
"upgrade-insecure-requests": "1",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36",
"Cookie": self.ck
} }
return header return header
def getId(self): # 获取商品id为了得到具体商品页面的网址。结果保存在self.productId的数组里 def getId(self): # 获取商品id为了得到具体商品页面的网址。结果保存在self.productId的数组里
response = requests.get(self.startUrl, headers=self.headers) response = requests.get(self.startUrl, headers=self.headers)
if response.status_code != 200: if response.status_code != 200:
default_logger.warning("状态码错误,爬虫连接异常!") default_logger.warning("状态码错误,连接异常!")
html = etree.HTML(response.text) html = etree.HTML(response.text)
return html.xpath('//li[@class="gl-item"]/@data-sku') return html.xpath('//li[@class="gl-item"]/@data-sku')
@ -85,10 +103,10 @@ class JDSpider:
comments = [] comments = []
scores = [] scores = []
if len(self.productsId) < 6: # limit the sum of products if len(self.productsId) < 4: # limit the sum of products
sum = len(self.productsId) sum = len(self.productsId)
else: else:
sum = 5 sum = 3
for j in range(sum): for j in range(sum):
id = self.productsId[j] id = self.productsId[j]
header = self.getHeaders(id) header = self.getHeaders(id)
@ -115,7 +133,7 @@ class JDSpider:
if len((res_json['comments'])) == 0: if len((res_json['comments'])) == 0:
default_logger.warning("本页无评价数据,跳过") default_logger.warning("本页无评价数据,跳过")
break break
default_logger.info("正在搜集%s %s" % default_logger.info("正在搜集 %s %s信息" %
(self.categlory, self.comtype[score])) (self.categlory, self.comtype[score]))
for cdit in res_json['comments']: for cdit in res_json['comments']:
comment = cdit['content'].replace( comment = cdit['content'].replace(
@ -123,7 +141,7 @@ class JDSpider:
comments.append(comment) comments.append(comment)
scores.append(cdit['score']) scores.append(cdit['score'])
# savepath = './'+self.categlory+'_'+self.comtype[score]+'.csv' # savepath = './'+self.categlory+'_'+self.comtype[score]+'.csv'
default_logger.warning("已搜集%d %s 评价信息" % default_logger.warning("已搜集%d%s信息" %
(len(comments), self.comtype[score])) (len(comments), self.comtype[score]))
# 存入列表,简单处理评价 # 存入列表,简单处理评价
remarks = [] remarks = []