diff --git a/config/base_config.py b/config/base_config.py index 2bd1631..52c6fc9 100644 --- a/config/base_config.py +++ b/config/base_config.py @@ -57,18 +57,26 @@ MAX_CONCURRENCY_NUM = 1 ENABLE_GET_IMAGES = False # 是否开启爬评论模式, 默认开启爬评论 -ENABLE_GET_COMMENTS = False +ENABLE_GET_COMMENTS = True # 是否开启爬二级评论模式, 默认不开启爬二级评论 # 老版本项目使用了 db, 则需参考 schema/tables.sql line 287 增加表字段 ENABLE_GET_SUB_COMMENTS = False -# 指定小红书需要爬虫的笔记ID列表 -XHS_SPECIFIED_ID_LIST = [ - "6422c2750000000027000d88", +# 已废弃⚠️⚠️⚠️指定小红书需要爬虫的笔记ID列表 +# 已废弃⚠️⚠️⚠️ 指定笔记ID笔记列表会因为缺少xsec_token和xsec_source参数导致爬取失败 +# XHS_SPECIFIED_ID_LIST = [ +# "66fad51c000000001b0224b8", +# # ........................ +# ] + +# 指定小红书需要爬虫的笔记URL列表, 目前要携带xsec_token和xsec_source参数 +XHS_SPECIFIED_NOTE_URL_LIST = [ + "https://www.xiaohongshu.com/explore/66fad51c000000001b0224b8?xsec_token=AB3rO-QopW5sgrJ41GwN01WCXh6yWPxjSoFI9D5JIMgKw=&xsec_source=pc_search" # ........................ ] + # 指定抖音需要爬取的ID列表 DY_SPECIFIED_ID_LIST = [ "7280854932641664319", diff --git a/media_platform/xhs/client.py b/media_platform/xhs/client.py index c9855e7..d088234 100644 --- a/media_platform/xhs/client.py +++ b/media_platform/xhs/client.py @@ -99,6 +99,13 @@ class XiaoHongShuClient(AbstractApiClient): **kwargs ) + if response.status_code == 471 or response.status_code == 461: + # someday someone maybe will bypass captcha + verify_type = response.headers['Verifytype'] + verify_uuid = response.headers['Verifyuuid'] + raise Exception( + f"出现验证码,请求失败,Verifytype: {verify_type},Verifyuuid: {verify_uuid}, Response: {response}") + if return_response: return response.text data: Dict = response.json() @@ -228,8 +235,8 @@ class XiaoHongShuClient(AbstractApiClient): "source_note_id": note_id, "image_formats": ["jpg", "webp", "avif"], "extra": {"need_body_topic": 1}, - # "xsec_source": xsec_source, - # "xsec_token": xsec_token + "xsec_source": xsec_source, + "xsec_token": xsec_token } uri = "/api/sns/web/v1/feed" res = await self.post(uri, data) @@ -454,13 +461,15 @@ 
class XiaoHongShuClient(AbstractApiClient): return await self.post(uri, data=data, return_response=True) @retry(stop=stop_after_attempt(3), wait=wait_fixed(1)) - async def get_note_by_id_from_html(self, note_id: str): + async def get_note_by_id_from_html(self, note_id: str, xsec_source: str, xsec_token: str) -> Dict: """ 通过解析网页版的笔记详情页HTML,获取笔记详情, 该接口可能会出现失败的情况,这里尝试重试3次 copy from https://github.com/ReaJason/xhs/blob/eb1c5a0213f6fbb592f0a2897ee552847c69ea2d/xhs/core.py#L217-L259 thanks for ReaJason Args: note_id: + xsec_source: + xsec_token: Returns: @@ -488,7 +497,7 @@ class XiaoHongShuClient(AbstractApiClient): dict_new[new_key] = value return dict_new - url = "https://www.xiaohongshu.com/explore/" + note_id + url = "https://www.xiaohongshu.com/explore/" + note_id + f"?xsec_token={xsec_token}&xsec_source={xsec_source}" html = await self.request(method="GET", url=url, return_response=True, headers=self.headers) state = re.findall(r"window.__INITIAL_STATE__=({.*})", html)[0].replace("undefined", '""') if state != "{}": diff --git a/media_platform/xhs/core.py b/media_platform/xhs/core.py index 69a40e0..e51ff9b 100644 --- a/media_platform/xhs/core.py +++ b/media_platform/xhs/core.py @@ -21,6 +21,7 @@ from tenacity import RetryError import config from base.base_crawler import AbstractCrawler +from model.m_xiaohongshu import NoteUrlInfo from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool from store import xhs as xhs_store from tools import utils @@ -29,6 +30,7 @@ from var import crawler_type_var, source_keyword_var from .client import XiaoHongShuClient from .exception import DataFetchError from .field import SearchSortType +from .help import parse_note_info_from_note_url from .login import XiaoHongShuLogin @@ -191,48 +193,40 @@ class XiaoHongShuCrawler(AbstractCrawler): await xhs_store.update_xhs_note(note_detail) async def get_specified_notes(self): - """Get the information and comments of the specified post""" + """ + Get the information and comments of the 
specified post + must be specified note_id, xsec_source, xsec_token⚠️⚠️⚠️ + Returns: - async def get_note_detail_from_html_task(note_id: str, semaphore: asyncio.Semaphore) -> Dict: - async with semaphore: - try: - _note_detail: Dict = await self.xhs_client.get_note_by_id_from_html(note_id) - if not _note_detail: - utils.logger.error( - f"[XiaoHongShuCrawler.get_note_detail_from_html] Get note detail error, note_id: {note_id}") - return {} - return _note_detail - except DataFetchError as ex: - utils.logger.error(f"[XiaoHongShuCrawler.get_note_detail_from_html] Get note detail error: {ex}") - return {} - except KeyError as ex: - utils.logger.error( - f"[XiaoHongShuCrawler.get_note_detail_from_html] have not fund note detail note_id:{note_id}, err: {ex}") - return {} - except RetryError as ex: - utils.logger.error( - f"[XiaoHongShuCrawler.get_note_detail_from_html] Retry error, note_id:{note_id}, err: {ex}") - - get_note_detail_task_list = [ - get_note_detail_from_html_task(note_id=note_id, semaphore=asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)) for - note_id in config.XHS_SPECIFIED_ID_LIST - ] + """ + get_note_detail_task_list = [] + for full_note_url in config.XHS_SPECIFIED_NOTE_URL_LIST: + note_url_info: NoteUrlInfo = parse_note_info_from_note_url(full_note_url) + utils.logger.info(f"[XiaoHongShuCrawler.get_specified_notes] Parse note url info: {note_url_info}") + crawler_task = self.get_note_detail_async_task( + note_id=note_url_info.note_id, + xsec_source=note_url_info.xsec_source, + xsec_token=note_url_info.xsec_token, + semaphore=asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) + ) + get_note_detail_task_list.append(crawler_task) need_get_comment_note_ids = [] note_details = await asyncio.gather(*get_note_detail_task_list) for note_detail in note_details: if note_detail: - need_get_comment_note_ids.append(note_detail.get("note_id")) + need_get_comment_note_ids.append(note_detail.get("note_id", "")) await xhs_store.update_xhs_note(note_detail) await 
def parse_note_info_from_note_url(url: str) -> NoteUrlInfo:
    """
    Parse the note id and the xsec parameters out of a Xiaohongshu note URL.

    Args:
        url: full note URL, e.g.
            "https://www.xiaohongshu.com/explore/66fad51c000000001b0224b8?xsec_token=AB3rO-QopW5sgrJ41GwN01WCXh6yWPxjSoFI9D5JIMgKw=&xsec_source=pc_search"

    Returns:
        NoteUrlInfo whose ``note_id`` is the last segment of the URL path and
        whose ``xsec_token`` / ``xsec_source`` come from the query string
        (each falls back to "" when the parameter is absent).
    """
    # Read the id from the URL *path* only. Splitting the whole URL on "/"
    # picks up the wrong piece when the query string or fragment contains a
    # "/" (e.g. inside a token value), and yields "" when the path ends with
    # a trailing slash.
    path = urllib.parse.urlparse(url).path
    note_id = path.rstrip("/").rsplit("/", 1)[-1]

    params = extract_url_params_to_dict(url)
    return NoteUrlInfo(
        note_id=note_id,
        xsec_token=params.get("xsec_token", ""),
        xsec_source=params.get("xsec_source", ""),
    )
def extract_url_params_to_dict(url: str) -> Dict:
    """
    Return the query-string parameters of ``url`` as a flat dict.

    Args:
        url: URL whose query string should be read; a falsy value
            (empty string or None) yields an empty dict.

    Returns:
        Mapping of parameter name to decoded value. Parsing is delegated to
        ``urllib.parse.parse_qsl`` with its default options, so parameters
        with blank values are omitted; when a name is repeated, the last
        occurrence wins.
    """
    # Guard clause: nothing to parse.
    if not url:
        return {}

    query_string = urllib.parse.urlparse(url).query
    return {
        name: value
        for name, value in urllib.parse.parse_qsl(query_string)
    }