fix: update xhs post detail fetching

Relakkes 2024-10-20 00:59:08 +08:00
parent 9fe3e47b0f
commit 03e393949a
6 changed files with 85 additions and 36 deletions

View File

@@ -57,18 +57,26 @@ MAX_CONCURRENCY_NUM = 1
 ENABLE_GET_IMAGES = False
 # 是否开启爬评论模式, 默认开启爬评论
-ENABLE_GET_COMMENTS = False
+ENABLE_GET_COMMENTS = True
 # 是否开启爬二级评论模式, 默认不开启爬二级评论
 # 老版本项目使用了 db, 则需参考 schema/tables.sql line 287 增加表字段
 ENABLE_GET_SUB_COMMENTS = False
-# 指定小红书需要爬虫的笔记ID列表
-XHS_SPECIFIED_ID_LIST = [
-    "6422c2750000000027000d88",
+# 已废弃⚠ 指定小红书需要爬虫的笔记ID列表
+# 已废弃⚠️⚠️⚠️ 指定笔记ID笔记列表会因为缺少xsec_token和xsec_source参数导致爬取失败
+# XHS_SPECIFIED_ID_LIST = [
+#     "66fad51c000000001b0224b8",
+#     # ........................
+# ]
+# 指定小红书需要爬虫的笔记URL列表, 目前要携带xsec_token和xsec_source参数
+XHS_SPECIFIED_NOTE_URL_LIST = [
+    "https://www.xiaohongshu.com/explore/66fad51c000000001b0224b8?xsec_token=AB3rO-QopW5sgrJ41GwN01WCXh6yWPxjSoFI9D5JIMgKw=&xsec_source=pc_search"
     # ........................
 ]
 # 指定抖音需要爬取的ID列表
 DY_SPECIFIED_ID_LIST = [
     "7280854932641664319",

View File

@@ -99,6 +99,13 @@ class XiaoHongShuClient(AbstractApiClient):
                 **kwargs
             )
+        if response.status_code == 471 or response.status_code == 461:
+            # someday someone maybe will bypass captcha
+            verify_type = response.headers['Verifytype']
+            verify_uuid = response.headers['Verifyuuid']
+            raise Exception(
+                f"出现验证码，请求失败，Verifytype: {verify_type}，Verifyuuid: {verify_uuid}, Response: {response}")
         if return_response:
             return response.text
         data: Dict = response.json()
@@ -228,8 +235,8 @@ class XiaoHongShuClient(AbstractApiClient):
             "source_note_id": note_id,
             "image_formats": ["jpg", "webp", "avif"],
             "extra": {"need_body_topic": 1},
-            # "xsec_source": xsec_source,
-            # "xsec_token": xsec_token
+            "xsec_source": xsec_source,
+            "xsec_token": xsec_token
         }
         uri = "/api/sns/web/v1/feed"
         res = await self.post(uri, data)
@@ -454,13 +461,15 @@ class XiaoHongShuClient(AbstractApiClient):
         return await self.post(uri, data=data, return_response=True)

     @retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
-    async def get_note_by_id_from_html(self, note_id: str):
+    async def get_note_by_id_from_html(self, note_id: str, xsec_source: str, xsec_token: str) -> Dict:
         """
         通过解析网页版的笔记详情页HTML，获取笔记详情, 该接口可能会出现失败的情况，这里尝试重试3次
         copy from https://github.com/ReaJason/xhs/blob/eb1c5a0213f6fbb592f0a2897ee552847c69ea2d/xhs/core.py#L217-L259
         thanks for ReaJason
         Args:
             note_id:
+            xsec_source:
+            xsec_token:

         Returns:
@@ -488,7 +497,7 @@ class XiaoHongShuClient(AbstractApiClient):
                 dict_new[new_key] = value
             return dict_new

-        url = "https://www.xiaohongshu.com/explore/" + note_id
+        url = "https://www.xiaohongshu.com/explore/" + note_id + f"?xsec_token={xsec_token}&xsec_source={xsec_source}"
         html = await self.request(method="GET", url=url, return_response=True, headers=self.headers)
         state = re.findall(r"window.__INITIAL_STATE__=({.*})</script>", html)[0].replace("undefined", '""')
         if state != "{}":
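
For context on the HTML fallback path above: the note detail page embeds its data in a `window.__INITIAL_STATE__` JavaScript object, which the client pulls out with the regex shown and parses as JSON. Below is a minimal standalone sketch of that extraction; the function name and structure are illustrative only, and the real method additionally converts camelCase keys to snake_case before returning the note dict.

import json
import re
from typing import Dict


def extract_initial_state(html: str) -> Dict:
    """Pull the window.__INITIAL_STATE__ JSON blob out of a note detail page (illustrative sketch)."""
    matches = re.findall(r"window.__INITIAL_STATE__=({.*})</script>", html)
    if not matches:
        return {}
    # JavaScript `undefined` is not valid JSON, so replace it before parsing
    state_json = matches[0].replace("undefined", '""')
    return json.loads(state_json) if state_json != "{}" else {}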

View File

@@ -21,6 +21,7 @@ from tenacity import RetryError

 import config
 from base.base_crawler import AbstractCrawler
+from model.m_xiaohongshu import NoteUrlInfo
 from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
 from store import xhs as xhs_store
 from tools import utils

@@ -29,6 +30,7 @@ from var import crawler_type_var, source_keyword_var

 from .client import XiaoHongShuClient
 from .exception import DataFetchError
 from .field import SearchSortType
+from .help import parse_note_info_from_note_url
 from .login import XiaoHongShuLogin
@@ -191,48 +193,40 @@ class XiaoHongShuCrawler(AbstractCrawler):
                 await xhs_store.update_xhs_note(note_detail)

     async def get_specified_notes(self):
-        """Get the information and comments of the specified post"""
-
-        async def get_note_detail_from_html_task(note_id: str, semaphore: asyncio.Semaphore) -> Dict:
-            async with semaphore:
-                try:
-                    _note_detail: Dict = await self.xhs_client.get_note_by_id_from_html(note_id)
-                    if not _note_detail:
-                        utils.logger.error(
-                            f"[XiaoHongShuCrawler.get_note_detail_from_html] Get note detail error, note_id: {note_id}")
-                        return {}
-                    return _note_detail
-                except DataFetchError as ex:
-                    utils.logger.error(f"[XiaoHongShuCrawler.get_note_detail_from_html] Get note detail error: {ex}")
-                    return {}
-                except KeyError as ex:
-                    utils.logger.error(
-                        f"[XiaoHongShuCrawler.get_note_detail_from_html] have not fund note detail note_id:{note_id}, err: {ex}")
-                    return {}
-                except RetryError as ex:
-                    utils.logger.error(
-                        f"[XiaoHongShuCrawler.get_note_detail_from_html] Retry error, note_id:{note_id}, err: {ex}")
-
-        get_note_detail_task_list = [
-            get_note_detail_from_html_task(note_id=note_id, semaphore=asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)) for
-            note_id in config.XHS_SPECIFIED_ID_LIST
-        ]
+        """
+        Get the information and comments of the specified post
+        must be specified note_id, xsec_source, xsec_token
+        Returns:
+
+        """
+        get_note_detail_task_list = []
+        for full_note_url in config.XHS_SPECIFIED_NOTE_URL_LIST:
+            note_url_info: NoteUrlInfo = parse_note_info_from_note_url(full_note_url)
+            utils.logger.info(f"[XiaoHongShuCrawler.get_specified_notes] Parse note url info: {note_url_info}")
+            crawler_task = self.get_note_detail_async_task(
+                note_id=note_url_info.note_id,
+                xsec_source=note_url_info.xsec_source,
+                xsec_token=note_url_info.xsec_token,
+                semaphore=asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
+            )
+            get_note_detail_task_list.append(crawler_task)

         need_get_comment_note_ids = []
         note_details = await asyncio.gather(*get_note_detail_task_list)
         for note_detail in note_details:
             if note_detail:
-                need_get_comment_note_ids.append(note_detail.get("note_id"))
+                need_get_comment_note_ids.append(note_detail.get("note_id", ""))
                 await xhs_store.update_xhs_note(note_detail)
         await self.batch_get_note_comments(need_get_comment_note_ids)

     async def get_note_detail_async_task(self, note_id: str, xsec_source: str, xsec_token: str, semaphore: asyncio.Semaphore) -> \
             Optional[Dict]:
         """Get note detail"""
         async with semaphore:
             try:
-                # note_detail: Dict = await self.xhs_client.get_note_by_id_from_html(note_id)
-                note_detail: Dict = await self.xhs_client.get_note_by_id(note_id, xsec_source, xsec_token)
+                note_detail: Dict = await self.xhs_client.get_note_by_id_from_html(note_id, xsec_source, xsec_token)
+                # note_detail: Dict = await self.xhs_client.get_note_by_id(note_id, xsec_source, xsec_token)
                 if not note_detail:
                     utils.logger.error(
                         f"[XiaoHongShuCrawler.get_note_detail_async_task] Get note detail error, note_id: {note_id}")

View File

@@ -15,6 +15,9 @@ import random
 import time
 import urllib.parse

+from model.m_xiaohongshu import NoteUrlInfo
+from tools.crawler_util import extract_url_params_to_dict
+

 def sign(a1="", b1="", x_s="", x_t=""):
     """
@@ -288,6 +291,21 @@ def get_trace_id(img_url: str):
     return f"spectrum/{img_url.split('/')[-1]}" if img_url.find("spectrum") != -1 else img_url.split("/")[-1]


+def parse_note_info_from_note_url(url: str) -> NoteUrlInfo:
+    """
+    从小红书笔记url中解析出笔记信息
+    Args:
+        url: "https://www.xiaohongshu.com/explore/66fad51c000000001b0224b8?xsec_token=AB3rO-QopW5sgrJ41GwN01WCXh6yWPxjSoFI9D5JIMgKw=&xsec_source=pc_search"
+    Returns:
+
+    """
+    note_id = url.split("/")[-1].split("?")[0]
+    params = extract_url_params_to_dict(url)
+    xsec_token = params.get("xsec_token", "")
+    xsec_source = params.get("xsec_source", "")
+    return NoteUrlInfo(note_id=note_id, xsec_token=xsec_token, xsec_source=xsec_source)
+
+
 if __name__ == '__main__':
     _img_url = "https://sns-img-bd.xhscdn.com/7a3abfaf-90c1-a828-5de7-022c80b92aa3"
     # 获取一个图片地址在多个cdn下的url地址
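
A hedged usage sketch of the new `parse_note_info_from_note_url` helper; the import path is assumed from the project layout rather than shown in this commit, and the example URL is the one used in the config sample above.

from media_platform.xhs.help import parse_note_info_from_note_url  # assumed import path

note_url = (
    "https://www.xiaohongshu.com/explore/66fad51c000000001b0224b8"
    "?xsec_token=AB3rO-QopW5sgrJ41GwN01WCXh6yWPxjSoFI9D5JIMgKw=&xsec_source=pc_search"
)
info = parse_note_info_from_note_url(note_url)
print(info.note_id)      # 66fad51c000000001b0224b8
print(info.xsec_token)   # AB3rO-QopW5sgrJ41GwN01WCXh6yWPxjSoFI9D5JIMgKw=
print(info.xsec_source)  # pc_search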

View File

@@ -10,3 +10,12 @@
 # -*- coding: utf-8 -*-

+from pydantic import BaseModel, Field
+
+
+class NoteUrlInfo(BaseModel):
+    note_id: str = Field(title="note id")
+    xsec_token: str = Field(title="xsec token")
+    xsec_source: str = Field(title="xsec source")

View File

@@ -18,6 +18,8 @@ import base64
 import json
 import random
 import re
+import urllib
+import urllib.parse
 from io import BytesIO
 from typing import Dict, List, Optional, Tuple
@@ -192,3 +194,12 @@ def extract_text_from_html(html: str) -> str:
     # Remove all other tags
     clean_text = re.sub(r'<[^>]+>', '', clean_html).strip()
     return clean_text
+
+
+def extract_url_params_to_dict(url: str) -> Dict:
+    """Extract URL parameters to dict"""
+    url_params_dict = dict()
+    if not url:
+        return url_params_dict
+    parsed_url = urllib.parse.urlparse(url)
+    url_params_dict = dict(urllib.parse.parse_qsl(parsed_url.query))
+    return url_params_dict
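
And a quick usage sketch of `extract_url_params_to_dict`: it is a thin wrapper over `urllib.parse.urlparse` and `parse_qsl`, so the expected output below simply follows standard-library behavior.

params = extract_url_params_to_dict(
    "https://www.xiaohongshu.com/explore/66fad51c000000001b0224b8"
    "?xsec_token=abc&xsec_source=pc_search"
)
print(params)                          # {'xsec_token': 'abc', 'xsec_source': 'pc_search'}
print(extract_url_params_to_dict(""))  # {} for an empty url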