# auto_comment/jdspider.py
# @Time : 2022/2/10
# @Author :@Zhang Jiale and @Dimlitter @Dylan
# @File : jdspider.py
import json
import logging
import random
import re
import sys
import time
from urllib.parse import quote, urlencode
import requests
import zhon.hanzi
from lxml import etree
# Reference: https://github.com/fxsjy/jieba/blob/1e20c89b66f56c9301b0feed211733ffaa1bd72a/jieba/__init__.py#L27
# Module-level logger setup: everything in this file logs through
# 'jdspider' to stderr at DEBUG level and above.
log_console = logging.StreamHandler(sys.stderr)  # send records to stderr
default_logger = logging.getLogger('jdspider')  # shared by JDSpider methods
default_logger.setLevel(logging.DEBUG)
default_logger.addHandler(log_console)
class JDSpider:
    """Spider that collects product review sentences from JD.com.

    Construct with a product category keyword (e.g. a product name) and a
    login cookie string, then call :meth:`getData` to gather review text.
    """

    def __init__(self, categlory, ck=""):
        """Initialize the spider.

        :param categlory: search keyword for the product category
                          (name kept as-is for caller compatibility).
        :param ck: JD login cookie string sent with comment requests.
                   Defaults to "" so ``JDSpider(keyword)`` also works.
        """
        # JD search entry page for the given keyword.
        self.startUrl = "https://search.jd.com/Search?keyword=%s&enc=utf-8" % (
            quote(categlory))
        self.commentBaseUrl = "https://sclub.jd.com/comment/productPageComments.action?"
        # Browser-like headers for the search page request.
        self.headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'max-age=0',
            'dnt': '1',
            'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="98", "Google Chrome";v="98"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'document',
            'sec-fetch-site': 'none',
            'sec-fetch-user': '?1',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
        }
        # NOTE: performs a network request at construction time.
        self.productsId = self.getId()
        # Review type: 1 = negative, 2 = neutral, 3 = positive.
        self.comtype = {1: "差评", 2: "中评", 3: "好评"}
        self.categlory = categlory
        self.ck = ck
        self.iplist = {
            'http': [],
            'https': []
        }

    def getParamUrl(self, productid, page, score):
        """Build the query parameters and full URL for one comment page.

        :param productid: JD product id (sku).
        :param page: 1-based page number.
        :param score: review type — 1 negative, 2 neutral, 3 positive.
        :return: (params dict, full URL with the urlencoded query string).
        """
        # These parameters are required; without them JD detects the
        # request as a crawler and returns no data.
        params = {
            "productId": "%s" % (productid),
            "score": "%s" % (score),
            "sortType": "5",
            "page": "%s" % (page),
            "pageSize": "10",
            "isShadowSku": "0",
            "rid": "0",
            "fold": "1"
        }
        url = self.commentBaseUrl + urlencode(params)
        return params, url

    def getHeaders(self, productid):
        """Build per-product request headers for the comment endpoint.

        Unlike ``self.headers``, these include a product-specific Referer
        and the login cookie.
        """
        header = {
            "Referer": "https://item.jd.com/%s.html" % (productid),
            "Host": "sclub.jd.com",
            "Connection": "keep-alive",
            "Pragma": "no-cache",
            "Cache-Control": "no-cache",
            "sec-ch-ua": '"Chromium";v="21", " Not;A Brand";v="99"',
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "Windows",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-User": "?1",
            "Sec-Fetch-Dest": "document",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "upgrade-insecure-requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36",
            "Cookie": self.ck
        }
        return header

    def getId(self):
        """Fetch the search result page and return the list of product ids."""
        response = requests.get(self.startUrl, headers=self.headers)
        if response.status_code != 200:
            default_logger.warning("状态码错误,连接异常!")
        html = etree.HTML(response.text)
        return html.xpath('//li[@class="gl-item"]/@data-sku')

    def getData(self, maxPage, score):
        """Collect review sentences for up to 3 products of this category.

        :param maxPage: upper bound on pages to fetch per product (10 reviews
            per page); pages past the last real one return no comments and
            the loop breaks, so a generous value is safe.
        :param score: review type — 1 negative, 2 neutral, 3 positive.
        :return: flat list of review sentences; a canned list of default
            positive reviews when nothing usable was collected.
        """
        comments = []
        scores = []
        # Limit the crawl to at most 3 products.
        product_count = min(len(self.productsId), 3)
        for j in range(product_count):
            product_id = self.productsId[j]
            header = self.getHeaders(product_id)
            for i in range(1, maxPage):
                param, url = self.getParamUrl(product_id, i, score)
                default_logger.info("正在搜集第%d个商品第%d页的评论信息" % (j + 1, i))
                try:
                    # `url` already carries the urlencoded query string;
                    # passing `params` again would duplicate every parameter.
                    response = requests.get(url, headers=header)
                except Exception as e:
                    default_logger.warning(e)
                    break
                if response.status_code != 200:
                    default_logger.warning("状态码错误,连接异常")
                    continue
                # Random delay so JD does not ban the IP.
                time.sleep(random.randint(5, 10))
                if response.text == '':
                    default_logger.warning("未搜集到信息")
                    continue
                try:
                    res_json = json.loads(response.text)
                except Exception as e:
                    default_logger.warning(e)
                    continue
                if len(res_json['comments']) == 0:
                    # Past the last page of real reviews for this product.
                    default_logger.warning("本页无评价数据,跳过")
                    break
                default_logger.info("正在搜集 %s%s信息" %
                                    (self.categlory, self.comtype[score]))
                for cdit in res_json['comments']:
                    comment = cdit['content'].replace(
                        "\n", ' ').replace('\r', ' ')
                    comments.append(comment)
                    scores.append(cdit['score'])
        default_logger.warning("已搜集%d%s信息" %
                               (len(comments), self.comtype[score]))
        # Split each review into sentences; drop empty or punctuation-only
        # results.
        remarks = []
        for text in comments:
            sentences = re.findall(zhon.hanzi.sentence, text)
            if not sentences or (len(sentences) == 1
                                 and sentences[0] in ('', '.', ',', '?', '!')):
                continue
            remarks.append(sentences)
        result = self.solvedata(remarks=remarks)
        if len(result) == 0:
            default_logger.warning("当前商品没有评价,使用默认评价")
            result = ["考虑买这个$之前我是有担心过的,因为我不知道$的质量和品质怎么样,但是看了评论后我就放心了。",
                      "买这个$之前我是有看过好几家店,最后看到这家店的评价不错就决定在这家店买 ",
                      "看了好几家店,也对比了好几家店,最后发现还是这一家的$评价最好。",
                      "看来看去最后还是选择了这家。",
                      "之前在这家店也买过其他东西,感觉不错,这次又来啦。",
                      "这家的$的真是太好用了,用了第一次就还想再用一次。",
                      "收到货后我非常的开心,因为$的质量和品质真的非常的好!",
                      "拆开包装后惊艳到我了,这就是我想要的$!",
                      "快递超快!包装的很好!!很喜欢!!!",
                      "包装的很精美!$的质量和品质非常不错!",
                      "收到快递后迫不及待的拆了包装。$我真的是非常喜欢",
                      "真是一次难忘的购物,这辈子没见过这么好用的东西!!",
                      "经过了这次愉快的购物,我决定如果下次我还要买$的话,我一定会再来这家店买的。",
                      "不错不错!",
                      "我会推荐想买$的朋友也来这家店里买",
                      "真是一次愉快的购物!",
                      "大大的好评!以后买$再来你们店!( ̄▽ ̄)",
                      "真是一次愉快的购物!"
                      ]
        return result

    def solvedata(self, remarks):
        """Flatten the per-review sentence lists into one flat list."""
        sentences = []
        for review in remarks:
            sentences.extend(review)
        return sentences
# Example: save into a MySQL database (kept for reference, disabled)
'''
db = pymysql.connect(host='主机名',user='用户名',password='密码',db='数据库名',charset='utf8mb4')
mycursor = db.cursor()
mycursor.execute("use jd") # 根据自己的数据库名称更改
mycursor.execute("TRUNCATE table jd")
for i in range(len(comments)):
sql = "insert into jd(i,scores,comments) values('%s','%s','%s')"%(id,scores[i],comments[i]) # 根据自己的表结构更改
try:
mycursor.execute(sql)
db.commit()
except Exception as e:
logging.warning(e)
db.rollback()
mycursor.close()
db.close()
logging.warning("已存入数据库")
'''
# Example: save into a CSV file (kept for reference, disabled)
'''
with open(savepath,'a+',encoding ='utf8') as f:
for i in range(len(comments)):
f.write("%d\t%s\t%s\n"%(i,scores[i],comments[i]))
logging.warning("数据已保存在 %s"%(savepath))
'''
# Manual smoke test
if __name__ == "__main__":
    jdlist = ['笔筒台灯插座 手机支架多功能USB充电LED护眼灯遥控定时学生学习阅 读灯宿舍寝室卧室床头书桌台灯插排 笔筒台灯 4插位+2USB 1.8米(不带遥控)']
    for item in jdlist:
        # JDSpider.__init__ requires a cookie string as its second
        # argument; the original call JDSpider(item) raised TypeError.
        # An empty cookie is enough for an unauthenticated smoke test.
        spider = JDSpider(item, "")
        # Collect up to 4 pages of positive (score=3) reviews.
        spider.getData(4, 3)