From 65699aa1cbb4e4e8dd50c409ab217ce094115352 Mon Sep 17 00:00:00 2001 From: Relakkes Date: Sat, 24 Aug 2024 06:07:33 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20xhs=E6=94=AF=E6=8C=81=E8=8E=B7=E5=8F=96?= =?UTF-8?q?=E8=AF=84=E8=AE=BA=E7=9A=84=E7=82=B9=E8=B5=9E=E6=95=B0=E9=87=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- media_platform/xhs/client.py | 3 +- media_platform/xhs/core.py | 4 +++ schema/tables.sql | 4 +++ store/xhs/__init__.py | 55 ++++++++++++++++++++++++++++++++++++ 4 files changed, 65 insertions(+), 1 deletion(-) diff --git a/media_platform/xhs/client.py b/media_platform/xhs/client.py index 2b1379d..fd9589a 100644 --- a/media_platform/xhs/client.py +++ b/media_platform/xhs/client.py @@ -442,9 +442,10 @@ class XiaoHongShuClient(AbstractApiClient): } return await self.post(uri, data=data, return_response=True) + @retry(stop=stop_after_attempt(3), wait=wait_fixed(1)) async def get_note_by_id_from_html(self, note_id: str): """ - 通过解析网页版的笔记详情页HTML,获取笔记详情 + 通过解析网页版的笔记详情页HTML,获取笔记详情, 该接口可能会出现失败的情况,这里尝试重试3次 copy from https://github.com/ReaJason/xhs/blob/eb1c5a0213f6fbb592f0a2897ee552847c69ea2d/xhs/core.py#L217-L259 thanks for ReaJason Args: diff --git a/media_platform/xhs/core.py b/media_platform/xhs/core.py index 18952f2..68e55b8 100644 --- a/media_platform/xhs/core.py +++ b/media_platform/xhs/core.py @@ -6,6 +6,7 @@ from typing import Dict, List, Optional, Tuple from playwright.async_api import (BrowserContext, BrowserType, Page, async_playwright) +from tenacity import RetryError import config from base.base_crawler import AbstractCrawler @@ -197,6 +198,9 @@ class XiaoHongShuCrawler(AbstractCrawler): utils.logger.error( f"[XiaoHongShuCrawler.get_note_detail_from_html] have not fund note detail note_id:{note_id}, err: {ex}") return {} + except RetryError as ex: + utils.logger.error( + f"[XiaoHongShuCrawler.get_note_detail_from_html] Retry error, note_id:{note_id}, err: {ex}") get_note_detail_task_list = [ get_note_detail_from_html_task(note_id=note_id, semaphore=asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)) for diff --git a/schema/tables.sql b/schema/tables.sql index 0dec6dd..bcde31b 100644 --- a/schema/tables.sql +++ b/schema/tables.sql @@ -425,3 +425,7 @@ CREATE TABLE `weibo_creator` `tag_list` longtext COMMENT '标签列表', PRIMARY KEY (`id`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='微博博主'; + + +ALTER TABLE `xhs_note_comment` + ADD COLUMN `like_count` VARCHAR(64) DEFAULT NULL COMMENT '评论点赞数量'; \ No newline at end of file diff --git a/store/xhs/__init__.py b/store/xhs/__init__.py index eb58f2d..e3d1b7a 100644 --- a/store/xhs/__init__.py +++ b/store/xhs/__init__.py @@ -28,6 +28,14 @@ class XhsStoreFactory: def get_video_url_arr(note_item: Dict) -> List: + """ + 获取视频url数组 + Args: + note_item: + + Returns: + + """ if note_item.get('type') != 'video': return [] @@ -47,6 +55,14 @@ def get_video_url_arr(note_item: Dict) -> List: async def update_xhs_note(note_item: Dict): + """ + 更新小红书笔记 + Args: + note_item: + + Returns: + + """ note_id = note_item.get("note_id") user_info = note_item.get("user", {}) interact_info = note_item.get("interact_info", {}) @@ -86,6 +102,15 @@ async def update_xhs_note(note_item: Dict): async def batch_update_xhs_note_comments(note_id: str, comments: List[Dict]): + """ + 批量更新小红书笔记评论 + Args: + note_id: + comments: + + Returns: + + """ if not comments: return for comment_item in comments: @@ -93,6 +118,15 @@ async def batch_update_xhs_note_comments(note_id: str, comments: List[Dict]): async def update_xhs_note_comment(note_id: str, comment_item: Dict): + """ + 更新小红书笔记评论 + Args: + note_id: + comment_item: + + Returns: + + """ user_info = comment_item.get("user_info", {}) comment_id = comment_item.get("id") comment_pictures = [item.get("url_default", "") for item in comment_item.get("pictures", [])] @@ -110,12 +144,22 @@ async def update_xhs_note_comment(note_id: str, comment_item: Dict): "pictures": ",".join(comment_pictures), "parent_comment_id": target_comment.get("id", 0), "last_modify_ts": utils.get_current_timestamp(), + "like_count": comment_item.get("like_count", 0), } utils.logger.info(f"[store.xhs.update_xhs_note_comment] xhs note comment:{local_db_item}") await XhsStoreFactory.create_store().store_comment(local_db_item) async def save_creator(user_id: str, creator: Dict): + """ + 保存小红书创作者 + Args: + user_id: + creator: + + Returns: + + """ user_info = creator.get('basicInfo', {}) follows = 0 @@ -148,5 +192,16 @@ async def save_creator(user_id: str, creator: Dict): async def update_xhs_note_image(note_id, pic_content, extension_file_name): + """ + 更新小红书笔 + Args: + note_id: + pic_content: + extension_file_name: + + Returns: + + """ + await XiaoHongShuImage().store_image( {"notice_id": note_id, "pic_content": pic_content, "extension_file_name": extension_file_name})