From ee95f69065c2ad2a4c86eb41b7e1ee432f65fb33 Mon Sep 17 00:00:00 2001 From: dylan <58234511@qq.com> Date: Sun, 2 Oct 2022 13:38:20 +0800 Subject: [PATCH] 1 --- jd_comment.py | 86 ++++++++++++++++++++++++++++++--------------------- jdspider.py | 44 ++++++++++++++++++-------- 2 files changed, 81 insertions(+), 49 deletions(-) diff --git a/jd_comment.py b/jd_comment.py index 08bf8db..82564c8 100644 --- a/jd_comment.py +++ b/jd_comment.py @@ -1,7 +1,12 @@ # -*- coding: utf-8 -*- -# @Time : 2022/2/8 20:50 -# @Author : @qiu-lzsnmb and @Dimlitter -# @File : auto_comment_plus.py +# 自动带图评价、追评、服务评价,需电脑端CK +# @Time : 2022/10/2 +# @Author : @qiu-lzsnmb and @Dimlitter @Dylan +# @File : auto_comment.py +''' +new Env('自动评价'); +8 8 2 10 * https://raw.githubusercontent.com/6dylan6/auto_comment/main/jd_comment.py +''' import argparse import copy @@ -10,12 +15,24 @@ import os import random import sys import time - -import jieba # just for linting -import jieba.analyse import requests -import yaml -from lxml import etree + +try: + import jieba # just for linting + import jieba.analyse + #import yaml + from lxml import etree +except: + os.system('pip3 install lxml &> /dev/null') + os.system('pip3 install jieba &> /dev/null') + os.system('pip3 install zhon &> /dev/null') + import jieba # just for linting + import jieba.analyse + #import yaml + from lxml import etree + + + import jdspider @@ -95,7 +112,7 @@ def generation(pname, _class=0, _type=1, opts=None): for i, item in enumerate(items): opts['logger'].debug('Loop: %d / %d', i + 1, loop_times) opts['logger'].debug('Current item: %s', item) - spider = jdspider.JDSpider(item) + spider = jdspider.JDSpider(item,ck) opts['logger'].debug('Successfully created a JDSpider instance') # 增加对增值服务的评价鉴别 if "赠品" in pname or "非实物" in pname or "增值服务" in pname: @@ -219,7 +236,7 @@ def sunbw(N, opts=None): '//*[@id="main"]/div[2]/div[2]/table') opts['logger'].debug('Count of fetched order data: %d', len(elems)) Order_data.extend(elems) - opts['logger'].info(f"当前共有{N['待评价订单']}个需要晒单。") + opts['logger'].info(f"当前共有{N['待评价订单']}个需要评价晒单。") opts['logger'].debug('Commenting on items') for i, Order in enumerate(Order_data): if i > 1: @@ -270,7 +287,7 @@ def sunbw(N, opts=None): imgurl = imgdata["imgComments"]["imgList"][0]["imageUrl"] opts['logger'].debug('Image URL: %s', imgurl) - opts['logger'].info(f'\t\t图片url={imgurl}') + opts['logger'].info(f'\t图片url={imgurl}') # 提交晒单 opts['logger'].debug('Preparing for commenting') url2 = "https://club.jd.com/myJdcomments/saveProductComment.action" @@ -307,7 +324,6 @@ def review(N, opts=None): req_et = [] Order_data = [] loop_times = 2 - print(loop_times) opts['logger'].debug('Fetching website data') opts['logger'].debug('Total loop times: %d', loop_times) for i in range(loop_times): @@ -350,7 +366,7 @@ def review(N, opts=None): opts['logger'].info(f"当前共有{N['待追评']}个需要追评。") opts['logger'].debug('Commenting on items') for i, Order in enumerate(Order_data): - if i > 1: + if i + 1 > 1: opts['logger'].info(f'\t已评价10个订单,跳出') break oname = Order.xpath('td[1]/div/div[2]/div/a/text()')[0] @@ -366,7 +382,7 @@ def review(N, opts=None): opts['logger'].debug('oid: %s', oid) opts['logger'].info(f'\t开始第{i+1}个订单: {oid}') _, context = generation(oname, _type=0, opts=opts) - opts['logger'].info(f'\t\t追评内容:{context}') + opts['logger'].info(f'\t追评内容:{context}') data1 = { 'orderId': oid, 'productId': pid, @@ -384,8 +400,6 @@ def review(N, opts=None): opts['logger'].debug('Sleep time (s): %.1f', REVIEW_SLEEP_SEC) time.sleep(REVIEW_SLEEP_SEC) N['待追评'] -= 1 - if i + 1 > 1: - break return N @@ -436,7 +450,7 @@ def Service_rating(N, opts=None): opts['logger'].info(f"当前共有{N['服务评价']}个需要服务评价。") opts['logger'].debug('Commenting on items') for i, Order in enumerate(Order_data): - if i > 1: + if i + 1 > 1: opts['logger'].info(f'\t已评价10个订单,跳出') break oname = Order.xpath('td[1]/div[1]/div[2]/div/a/text()')[0] @@ -487,7 +501,7 @@ def main(opts=None): N = No(opts) opts['logger'].debug('N value after executing No(): %s', N) if not N: - opts['logger'].error('Ck错误,请检查重新抓取!') + opts['logger'].error('CK错误,请确认是否电脑版CK!') exit() if N['待评价订单'] != 0: opts['logger'].info("1.开始评价晒单") @@ -553,11 +567,11 @@ if __name__ == '__main__': # NOTE: The alignment number should set to 19 considering the style # controling characters. When it comes to file logger, the number should # set to 8. - formatter = StyleFormatter('%(asctime)s %(levelname)-19s %(message)s') + formatter = StyleFormatter('%(asctime)s %(levelname)-19s %(message)s',"%F %T") rawformatter = StyleFormatter('%(asctime)s %(levelname)-8s %(message)s', use_style=False) console = logging.StreamHandler() console.setLevel(_logging_level) - console.setFormatter(formatter) + console.setFormatter(logging.Formatter('%(message)s')) logger.addHandler(console) opts['logger'] = logger # It's a hack!!! @@ -595,24 +609,24 @@ if __name__ == '__main__': logger.debug(' SERVICE_RATING_SLEEP_SEC: %s', SERVICE_RATING_SLEEP_SEC) # parse configurations - logger.debug('Reading the configuration file') - if os.path.exists(USER_CONFIG_PATH): - logger.debug('User configuration file exists') - _cfg_path = USER_CONFIG_PATH - else: - logger.debug('User configuration file doesn\'t exist, fallback to the default one') - _cfg_path = CONFIG_PATH - with open(_cfg_path, 'r', encoding='utf-8') as f: - cfg = yaml.safe_load(f) - logger.debug('Closed the configuration file') - logger.debug('Configurations in Python-dict format: %s', cfg) - if "JD_COOKIE" in os.environ: - if len (os.environ["PC_COOKIE"]) > 1: - ck = os.environ["PC_COOKIE"] - logger.info ("已获取并使用Env环境 Cookie") + #logger.debug('Reading the configuration file') + #if os.path.exists(USER_CONFIG_PATH): + #logger.debug('User configuration file exists') + #_cfg_path = USER_CONFIG_PATH + #else: + #logger.debug('User configuration file doesn\'t exist, fallback to the default one') + #_cfg_path = CONFIG_PATH + # with open(_cfg_path, 'r', encoding='utf-8') as f: + #cfg = yaml.safe_load(f) + #print() + #logger.debug('Closed the configuration file') + #logger.debug('Configurations in Python-dict format: %s', cfg) + if "PC_COOKIE" in os.environ: + if len(os.environ["PC_COOKIE"]) > 1: + ck = os.environ["PC_COOKIE"] + logger.info ("已获取并使用Env环境 Cookie") #ck = cfg['user']['cookie'] - headers = { 'cookie': ck.encode("utf-8"), 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36', diff --git a/jdspider.py b/jdspider.py index 084078f..e0a40ec 100644 --- a/jdspider.py +++ b/jdspider.py @@ -1,5 +1,5 @@ -# @Time : 2022/2/8 20:50 -# @Author :@Zhang Jiale and @Dimlitter +# @Time : 2022/2/10 +# @Author :@Zhang Jiale and @Dimlitter @Dylan # @File : jdspider.py import json @@ -24,7 +24,7 @@ default_logger.addHandler(log_console) class JDSpider: # 爬虫实现类:传入商品类别(如手机、电脑),构造实例。然后调用getData搜集数据。 - def __init__(self, categlory): + def __init__(self, categlory, ck): # jD起始搜索页面 self.startUrl = "https://search.jd.com/Search?keyword=%s&enc=utf-8" % ( quote(categlory)) @@ -42,11 +42,12 @@ class JDSpider: 'sec-fetch-site': 'none', 'sec-fetch-user': '?1', 'upgrade-insecure-requests': '1', - 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36' + 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36' } self.productsId = self.getId() - self.comtype = {1: "nagetive", 2: "medium", 3: "positive"} + self.comtype = {1: "差评", 2: "中评", 3: "好评"} self.categlory = categlory + self.ck = ck self.iplist = { 'http': [], 'https': [] @@ -67,15 +68,32 @@ class JDSpider: return params, url def getHeaders(self, productid): # 和初始的self.header不同,这是搜集某个商品的header,加入了商品id,我也不知道去掉了会怎样。 - header = {"Referer": "https://item.jd.com/%s.html" % (productid), - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36" - } + header = { + "Referer": "https://item.jd.com/%s.html" % (productid), + "Host": "sclub.jd.com", + "Connection": "keep-alive", + "Pragma": "no-cache", + "Cache-Control": "no-cache", + "sec-ch-ua": '"Chromium";v="21", " Not;A Brand";v="99"', + "sec-ch-ua-mobile": "?0", + "sec-ch-ua-platform": "Windows", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", + "Sec-Fetch-Site": "none", + "Sec-Fetch-Mode": "navigate", + "Sec-Fetch-User": "?1", + "Sec-Fetch-Dest": "document", + "Accept-Encoding": "gzip, deflate, br", + "Accept-Language": "zh-CN,zh;q=0.9", + "upgrade-insecure-requests": "1", + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36", + "Cookie": self.ck + } return header def getId(self): # 获取商品id,为了得到具体商品页面的网址。结果保存在self.productId的数组里 response = requests.get(self.startUrl, headers=self.headers) if response.status_code != 200: - default_logger.warning("状态码错误,爬虫连接异常!") + default_logger.warning("状态码错误,连接异常!") html = etree.HTML(response.text) return html.xpath('//li[@class="gl-item"]/@data-sku') @@ -85,10 +103,10 @@ class JDSpider: comments = [] scores = [] - if len(self.productsId) < 6: # limit the sum of products + if len(self.productsId) < 4: # limit the sum of products sum = len(self.productsId) else: - sum = 5 + sum = 3 for j in range(sum): id = self.productsId[j] header = self.getHeaders(id) @@ -115,7 +133,7 @@ class JDSpider: if len((res_json['comments'])) == 0: default_logger.warning("本页无评价数据,跳过") break - default_logger.info("正在搜集%s %s" % + default_logger.info("正在搜集 %s 的%s信息" % (self.categlory, self.comtype[score])) for cdit in res_json['comments']: comment = cdit['content'].replace( @@ -123,7 +141,7 @@ class JDSpider: comments.append(comment) scores.append(cdit['score']) # savepath = './'+self.categlory+'_'+self.comtype[score]+'.csv' - default_logger.warning("已搜集%d 条 %s 评价信息" % + default_logger.warning("已搜集%d条%s信息" % (len(comments), self.comtype[score])) # 存入列表,简单处理评价 remarks = []