fix: xhs帖子详情问题更新

This commit is contained in:
Relakkes 2024-10-20 00:59:08 +08:00
parent 9fe3e47b0f
commit 03e393949a
6 changed files with 85 additions and 36 deletions

View File

@ -57,18 +57,26 @@ MAX_CONCURRENCY_NUM = 1
ENABLE_GET_IMAGES = False
# 是否开启爬评论模式, 默认开启爬评论
ENABLE_GET_COMMENTS = False
ENABLE_GET_COMMENTS = True
# 是否开启爬二级评论模式, 默认不开启爬二级评论
# 老版本项目使用了 db, 则需参考 schema/tables.sql line 287 增加表字段
ENABLE_GET_SUB_COMMENTS = False
# 指定小红书需要爬虫的笔记ID列表
XHS_SPECIFIED_ID_LIST = [
"6422c2750000000027000d88",
# 已废弃⚠指定小红书需要爬虫的笔记ID列表
# 已废弃⚠️⚠️⚠️ 指定笔记ID笔记列表会因为缺少xsec_token和xsec_source参数导致爬取失败
# XHS_SPECIFIED_ID_LIST = [
# "66fad51c000000001b0224b8",
# # ........................
# ]
# 指定小红书需要爬虫的笔记URL列表, 目前要携带xsec_token和xsec_source参数
XHS_SPECIFIED_NOTE_URL_LIST = [
"https://www.xiaohongshu.com/explore/66fad51c000000001b0224b8?xsec_token=AB3rO-QopW5sgrJ41GwN01WCXh6yWPxjSoFI9D5JIMgKw=&xsec_source=pc_search"
# ........................
]
# 指定抖音需要爬取的ID列表
DY_SPECIFIED_ID_LIST = [
"7280854932641664319",

View File

@ -99,6 +99,13 @@ class XiaoHongShuClient(AbstractApiClient):
**kwargs
)
if response.status_code == 471 or response.status_code == 461:
# someday someone maybe will bypass captcha
verify_type = response.headers['Verifytype']
verify_uuid = response.headers['Verifyuuid']
raise Exception(
f"出现验证码请求失败Verifytype: {verify_type}Verifyuuid: {verify_uuid}, Response: {response}")
if return_response:
return response.text
data: Dict = response.json()
@ -228,8 +235,8 @@ class XiaoHongShuClient(AbstractApiClient):
"source_note_id": note_id,
"image_formats": ["jpg", "webp", "avif"],
"extra": {"need_body_topic": 1},
# "xsec_source": xsec_source,
# "xsec_token": xsec_token
"xsec_source": xsec_source,
"xsec_token": xsec_token
}
uri = "/api/sns/web/v1/feed"
res = await self.post(uri, data)
@ -454,13 +461,15 @@ class XiaoHongShuClient(AbstractApiClient):
return await self.post(uri, data=data, return_response=True)
@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
async def get_note_by_id_from_html(self, note_id: str):
async def get_note_by_id_from_html(self, note_id: str, xsec_source: str, xsec_token: str) -> Dict:
"""
通过解析网页版的笔记详情页HTML获取笔记详情, 该接口可能会出现失败的情况这里尝试重试3次
copy from https://github.com/ReaJason/xhs/blob/eb1c5a0213f6fbb592f0a2897ee552847c69ea2d/xhs/core.py#L217-L259
thanks for ReaJason
Args:
note_id:
xsec_source:
xsec_token:
Returns:
@ -488,7 +497,7 @@ class XiaoHongShuClient(AbstractApiClient):
dict_new[new_key] = value
return dict_new
url = "https://www.xiaohongshu.com/explore/" + note_id
url = "https://www.xiaohongshu.com/explore/" + note_id + f"?xsec_token={xsec_token}&xsec_source={xsec_source}"
html = await self.request(method="GET", url=url, return_response=True, headers=self.headers)
state = re.findall(r"window.__INITIAL_STATE__=({.*})</script>", html)[0].replace("undefined", '""')
if state != "{}":

View File

@ -21,6 +21,7 @@ from tenacity import RetryError
import config
from base.base_crawler import AbstractCrawler
from model.m_xiaohongshu import NoteUrlInfo
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
from store import xhs as xhs_store
from tools import utils
@ -29,6 +30,7 @@ from var import crawler_type_var, source_keyword_var
from .client import XiaoHongShuClient
from .exception import DataFetchError
from .field import SearchSortType
from .help import parse_note_info_from_note_url
from .login import XiaoHongShuLogin
@ -191,48 +193,40 @@ class XiaoHongShuCrawler(AbstractCrawler):
await xhs_store.update_xhs_note(note_detail)
async def get_specified_notes(self):
    """
    Get the information and comments of the specified posts.

    Each entry in config.XHS_SPECIFIED_NOTE_URL_LIST must be a full note URL
    carrying the note_id path segment plus xsec_source and xsec_token query
    parameters; without them the detail request fails upstream.

    Returns:
        None
    """
    # Share ONE semaphore across every task so MAX_CONCURRENCY_NUM actually
    # bounds concurrency — constructing a fresh Semaphore per task would give
    # each task its own counter and limit nothing.
    semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
    get_note_detail_task_list = []
    for full_note_url in config.XHS_SPECIFIED_NOTE_URL_LIST:
        note_url_info: NoteUrlInfo = parse_note_info_from_note_url(full_note_url)
        utils.logger.info(f"[XiaoHongShuCrawler.get_specified_notes] Parse note url info: {note_url_info}")
        crawler_task = self.get_note_detail_async_task(
            note_id=note_url_info.note_id,
            xsec_source=note_url_info.xsec_source,
            xsec_token=note_url_info.xsec_token,
            semaphore=semaphore
        )
        get_note_detail_task_list.append(crawler_task)

    need_get_comment_note_ids = []
    note_details = await asyncio.gather(*get_note_detail_task_list)
    for note_detail in note_details:
        if note_detail:
            # Only queue notes that actually carry an id — an empty string
            # would otherwise be passed to the comment fetcher.
            note_id = note_detail.get("note_id", "")
            if note_id:
                need_get_comment_note_ids.append(note_id)
            await xhs_store.update_xhs_note(note_detail)
    await self.batch_get_note_comments(need_get_comment_note_ids)
async def get_note_detail_async_task(self, note_id: str, xsec_source: str, xsec_token: str, semaphore: asyncio.Semaphore) -> \
Optional[Dict]:
"""Get note detail"""
async with semaphore:
try:
# note_detail: Dict = await self.xhs_client.get_note_by_id_from_html(note_id)
note_detail: Dict = await self.xhs_client.get_note_by_id(note_id, xsec_source, xsec_token)
note_detail: Dict = await self.xhs_client.get_note_by_id_from_html(note_id, xsec_source, xsec_token)
# note_detail: Dict = await self.xhs_client.get_note_by_id(note_id, xsec_source, xsec_token)
if not note_detail:
utils.logger.error(
f"[XiaoHongShuCrawler.get_note_detail_async_task] Get note detail error, note_id: {note_id}")

View File

@ -15,6 +15,9 @@ import random
import time
import urllib.parse
from model.m_xiaohongshu import NoteUrlInfo
from tools.crawler_util import extract_url_params_to_dict
def sign(a1="", b1="", x_s="", x_t=""):
"""
@ -288,6 +291,21 @@ def get_trace_id(img_url: str):
return f"spectrum/{img_url.split('/')[-1]}" if img_url.find("spectrum") != -1 else img_url.split("/")[-1]
def parse_note_info_from_note_url(url: str) -> NoteUrlInfo:
    """
    Parse note info out of a XiaoHongShu note URL.

    Args:
        url: e.g. "https://www.xiaohongshu.com/explore/66fad51c000000001b0224b8?xsec_token=AB3rO-QopW5sgrJ41GwN01WCXh6yWPxjSoFI9D5JIMgKw=&xsec_source=pc_search"

    Returns:
        NoteUrlInfo with note_id, xsec_token and xsec_source; a missing query
        parameter yields "".
    """
    # Take the last path segment as the note id. urlparse strips the query
    # and fragment first, so this is robust to "?..." / "#..." suffixes and
    # to a trailing slash, unlike manual split("/")/split("?") surgery.
    parsed = urllib.parse.urlparse(url)
    note_id = parsed.path.rstrip("/").split("/")[-1]
    params = extract_url_params_to_dict(url)
    xsec_token = params.get("xsec_token", "")
    xsec_source = params.get("xsec_source", "")
    return NoteUrlInfo(note_id=note_id, xsec_token=xsec_token, xsec_source=xsec_source)
if __name__ == '__main__':
_img_url = "https://sns-img-bd.xhscdn.com/7a3abfaf-90c1-a828-5de7-022c80b92aa3"
# 获取一个图片地址在多个cdn下的url地址

View File

@ -10,3 +10,12 @@
# -*- coding: utf-8 -*-
from pydantic import BaseModel, Field
class NoteUrlInfo(BaseModel):
    """Components parsed out of a XiaoHongShu note URL."""
    # Last path segment of the note URL (the note's identifier).
    note_id: str = Field(title="note id")
    # "xsec_token" query parameter from the note URL; presumably an
    # anti-crawl/access token required by the detail API — TODO confirm.
    xsec_token: str = Field(title="xsec token")
    # "xsec_source" query parameter from the note URL (e.g. "pc_search").
    xsec_source: str = Field(title="xsec source")

View File

@ -18,6 +18,8 @@ import base64
import json
import random
import re
import urllib
import urllib.parse
from io import BytesIO
from typing import Dict, List, Optional, Tuple
@ -192,3 +194,12 @@ def extract_text_from_html(html: str) -> str:
# Remove all other tags
clean_text = re.sub(r'<[^>]+>', '', clean_html).strip()
return clean_text
def extract_url_params_to_dict(url: str) -> Dict:
    """Return the query-string parameters of *url* as a {name: value} dict.

    An empty or falsy *url* yields an empty dict. Repeated parameter names
    keep the last occurrence (dict() over parse_qsl pairs).
    """
    if not url:
        return {}
    query_string = urllib.parse.urlparse(url).query
    return dict(urllib.parse.parse_qsl(query_string))