From 7e53c4acfc9650073c0e59c2ea55885af639857f Mon Sep 17 00:00:00 2001
From: unknown <2862182666@qq.com>
Date: Wed, 23 Oct 2024 16:32:02 +0800
Subject: [PATCH] All_platform_comments_restrict

---
 config/base_config.py             |  4 ++--
 media_platform/bilibili/client.py |  9 +++++++--
 media_platform/bilibili/core.py   |  3 ++-
 media_platform/douyin/client.py   |  6 +++++-
 media_platform/douyin/core.py     |  3 ++-
 media_platform/kuaishou/client.py |  8 +++++---
 media_platform/kuaishou/core.py   |  3 ++-
 media_platform/tieba/client.py    | 10 +++++++---
 media_platform/tieba/core.py      |  3 ++-
 media_platform/weibo/client.py    |  9 +++++++--
 media_platform/weibo/core.py      |  3 ++-
 media_platform/xhs/client.py      |  2 +-
 12 files changed, 44 insertions(+), 19 deletions(-)

diff --git a/config/base_config.py b/config/base_config.py
index a78ab5e..8e55b99 100644
--- a/config/base_config.py
+++ b/config/base_config.py
@@ -11,7 +11,7 @@
 
 # 基础配置
 PLATFORM = "xhs"
-KEYWORDS = "编程副业,编程兼职" # 关键词搜索配置，以英文逗号分隔
+KEYWORDS = "城市" # 关键词搜索配置，以英文逗号分隔
 LOGIN_TYPE = "qrcode"  # qrcode or phone or cookie
 COOKIES = ""
 # 具体值参见media_platform.xxx.field下的枚举值，暂时只支持小红书
@@ -48,7 +48,7 @@ USER_DATA_DIR = "%s_user_data_dir"  # %s will be replaced by platform name
 START_PAGE = 1
 
 # 爬取视频/帖子的数量控制
-CRAWLER_MAX_NOTES_COUNT = 200
+CRAWLER_MAX_NOTES_COUNT = 10
 
 # 并发爬虫数量控制
 MAX_CONCURRENCY_NUM = 1
diff --git a/media_platform/bilibili/client.py b/media_platform/bilibili/client.py
index 9bca496..c8338c8 100644
--- a/media_platform/bilibili/client.py
+++ b/media_platform/bilibili/client.py
@@ -222,20 +222,23 @@ class BilibiliClient(AbstractApiClient):
         return await self.get(uri, post_data)
 
     async def get_video_all_comments(self, video_id: str, crawl_interval: float = 1.0, is_fetch_sub_comments=False,
-                                     callback: Optional[Callable] = None, ):
+                                     callback: Optional[Callable] = None,
+                                     max_count: int = 10,):
         """
         get video all comments include sub comments
         :param video_id:
         :param crawl_interval:
         :param is_fetch_sub_comments:
         :param callback:
+        :param max_count: maximum number of comments to collect for one video; paging stops once it is reached
+
         :return:
         """
 
         result = []
         is_end = False
         next_page = 0
-        while not is_end:
+        while not is_end and len(result) < max_count:
             comments_res = await self.get_video_comments(video_id, CommentOrderType.DEFAULT, next_page)
             cursor_info: Dict = comments_res.get("cursor")
             comment_list: List[Dict] = comments_res.get("replies", [])
@@ -249,6 +252,8 @@ class BilibiliClient(AbstractApiClient):
                             await self.get_video_all_level_two_comments(
                                 video_id, comment_id, CommentOrderType.DEFAULT, 10, crawl_interval,  callback)
                         }
+            if len(result) + len(comment_list) > max_count:
+                comment_list = comment_list[:max_count - len(result)]
             if callback:  # 如果有回调函数，就执行回调函数
                 await callback(video_id, comment_list)
             await asyncio.sleep(crawl_interval)
diff --git a/media_platform/bilibili/core.py b/media_platform/bilibili/core.py
index d377e90..e698bbd 100644
--- a/media_platform/bilibili/core.py
+++ b/media_platform/bilibili/core.py
@@ -182,7 +182,8 @@ class BilibiliCrawler(AbstractCrawler):
                     video_id=video_id,
                     crawl_interval=random.random(),
                     is_fetch_sub_comments=config.ENABLE_GET_SUB_COMMENTS,
-                    callback=bilibili_store.batch_update_bilibili_video_comments
+                    callback=bilibili_store.batch_update_bilibili_video_comments,
+                    max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
                 )
 
             except DataFetchError as ex:
diff --git a/media_platform/douyin/client.py b/media_platform/douyin/client.py
index a06e9da..37951dd 100644
--- a/media_platform/douyin/client.py
+++ b/media_platform/douyin/client.py
@@ -230,6 +230,7 @@ class DOUYINClient(AbstractApiClient):
             crawl_interval: float = 1.0,
             is_fetch_sub_comments=False,
             callback: Optional[Callable] = None,
+            max_count: int = 10,
     ):
         """
         获取帖子的所有评论，包括子评论
@@ -237,18 +238,21 @@ class DOUYINClient(AbstractApiClient):
         :param crawl_interval: 抓取间隔
         :param is_fetch_sub_comments: 是否抓取子评论
         :param callback: 回调函数，用于处理抓取到的评论
+        :param max_count: 一次帖子爬取的最大评论数量
         :return: 评论列表
         """
         result = []
         comments_has_more = 1
         comments_cursor = 0
-        while comments_has_more:
+        while comments_has_more and len(result) < max_count:
             comments_res = await self.get_aweme_comments(aweme_id, comments_cursor)
             comments_has_more = comments_res.get("has_more", 0)
             comments_cursor = comments_res.get("cursor", 0)
             comments = comments_res.get("comments", [])
             if not comments:
                 continue
+            if len(result) + len(comments) > max_count:
+                comments = comments[:max_count - len(result)]
             result.extend(comments)
             if callback:  # 如果有回调函数，就执行回调函数
                 await callback(aweme_id, comments)
diff --git a/media_platform/douyin/core.py b/media_platform/douyin/core.py
index 3aea4a1..426b33b 100644
--- a/media_platform/douyin/core.py
+++ b/media_platform/douyin/core.py
@@ -179,7 +179,8 @@ class DouYinCrawler(AbstractCrawler):
                     aweme_id=aweme_id,
                     crawl_interval=random.random(),
                     is_fetch_sub_comments=config.ENABLE_GET_SUB_COMMENTS,
-                    callback=douyin_store.batch_update_dy_aweme_comments
+                    callback=douyin_store.batch_update_dy_aweme_comments,
+                    max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES
                 )
                 utils.logger.info(
                     f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} comments have all been obtained and filtered ...")
diff --git a/media_platform/kuaishou/client.py b/media_platform/kuaishou/client.py
index 9aaa690..e728ed8 100644
--- a/media_platform/kuaishou/client.py
+++ b/media_platform/kuaishou/client.py
@@ -189,27 +189,29 @@ class KuaiShouClient(AbstractApiClient):
         photo_id: str,
         crawl_interval: float = 1.0,
         callback: Optional[Callable] = None,
+        max_count: int = 10,
     ):
         """
         get video all comments include sub comments
         :param photo_id:
         :param crawl_interval:
         :param callback:
+        :param max_count: maximum number of comments to collect for one video; paging stops once it is reached
         :return:
         """
 
         result = []
         pcursor = ""
 
-        while pcursor != "no_more":
+        while pcursor != "no_more" and len(result) < max_count:
             comments_res = await self.get_video_comments(photo_id, pcursor)
             vision_commen_list = comments_res.get("visionCommentList", {})
             pcursor = vision_commen_list.get("pcursor", "")
             comments = vision_commen_list.get("rootComments", [])
-
+            if len(result) + len(comments) > max_count:
+                comments = comments[:max_count - len(result)]
             if callback:  # 如果有回调函数，就执行回调函数
                 await callback(photo_id, comments)
-
             result.extend(comments)
             await asyncio.sleep(crawl_interval)
             sub_comments = await self.get_comments_all_sub_comments(
diff --git a/media_platform/kuaishou/core.py b/media_platform/kuaishou/core.py
index 0456f01..609f705 100644
--- a/media_platform/kuaishou/core.py
+++ b/media_platform/kuaishou/core.py
@@ -186,7 +186,8 @@ class KuaishouCrawler(AbstractCrawler):
                 await self.ks_client.get_video_all_comments(
                     photo_id=video_id,
                     crawl_interval=random.random(),
-                    callback=kuaishou_store.batch_update_ks_video_comments
+                    callback=kuaishou_store.batch_update_ks_video_comments,
+                    max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES
                 )
             except DataFetchError as ex:
                 utils.logger.error(f"[KuaishouCrawler.get_comments] get video_id: {video_id} comment error: {ex}")
diff --git a/media_platform/tieba/client.py b/media_platform/tieba/client.py
index 25cac59..1adbbe4 100644
--- a/media_platform/tieba/client.py
+++ b/media_platform/tieba/client.py
@@ -204,21 +204,23 @@ class BaiduTieBaClient(AbstractApiClient):
         return self._page_extractor.extract_note_detail(page_content)
 
     async def get_note_all_comments(self, note_detail: TiebaNote, crawl_interval: float = 1.0,
-                                    callback: Optional[Callable] = None) -> List[TiebaComment]:
+                                    callback: Optional[Callable] = None,
+                                    max_count: int = 10,
+                                    ) -> List[TiebaComment]:
         """
         获取指定帖子下的所有一级评论，该方法会一直查找一个帖子下的所有评论信息
         Args:
             note_detail: 帖子详情对象
             crawl_interval: 爬取一次笔记的延迟单位（秒）
             callback: 一次笔记爬取结束后
-
+            max_count: 一次帖子爬取的最大评论数量
         Returns:
 
         """
         uri = f"/p/{note_detail.note_id}"
         result: List[TiebaComment] = []
         current_page = 1
-        while note_detail.total_replay_page >= current_page:
+        while note_detail.total_replay_page >= current_page and len(result) < max_count:
             params = {
                 "pn": current_page
             }
@@ -227,6 +229,8 @@ class BaiduTieBaClient(AbstractApiClient):
                                                                                 note_id=note_detail.note_id)
             if not comments:
                 break
+            if len(result) + len(comments) > max_count:
+                comments = comments[:max_count - len(result)]
             if callback:
                 await callback(note_detail.note_id, comments)
             result.extend(comments)
diff --git a/media_platform/tieba/core.py b/media_platform/tieba/core.py
index f1eb2ff..567a1fa 100644
--- a/media_platform/tieba/core.py
+++ b/media_platform/tieba/core.py
@@ -226,7 +226,8 @@ class TieBaCrawler(AbstractCrawler):
             await self.tieba_client.get_note_all_comments(
                 note_detail=note_detail,
                 crawl_interval=random.random(),
-                callback=tieba_store.batch_update_tieba_note_comments
+                callback=tieba_store.batch_update_tieba_note_comments,
+                max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES
             )
 
     async def get_creators_and_notes(self) -> None:
diff --git a/media_platform/weibo/client.py b/media_platform/weibo/client.py
index c6f25af..92ab9c7 100644
--- a/media_platform/weibo/client.py
+++ b/media_platform/weibo/client.py
@@ -149,23 +149,28 @@ class WeiboClient:
         return await self.get(uri, params, headers=headers)
 
     async def get_note_all_comments(self, note_id: str, crawl_interval: float = 1.0,
-                                    callback: Optional[Callable] = None, ):
+                                    callback: Optional[Callable] = None,
+                                    max_count: int = 10,
+                                    ):
         """
         get note all comments include sub comments
         :param note_id:
         :param crawl_interval:
         :param callback:
+        :param max_count: maximum number of comments to collect for one note; paging stops once it is reached
         :return:
         """
 
         result = []
         is_end = False
         max_id = -1
-        while not is_end:
+        while not is_end and len(result) < max_count:
             comments_res = await self.get_note_comments(note_id, max_id)
             max_id: int = comments_res.get("max_id")
             comment_list: List[Dict] = comments_res.get("data", [])
             is_end = max_id == 0
+            if len(result) + len(comment_list) > max_count:
+                comment_list = comment_list[:max_count - len(result)]
             if callback:  # 如果有回调函数，就执行回调函数
                 await callback(note_id, comment_list)
             await asyncio.sleep(crawl_interval)
diff --git a/media_platform/weibo/core.py b/media_platform/weibo/core.py
index a6e9729..e72fdf1 100644
--- a/media_platform/weibo/core.py
+++ b/media_platform/weibo/core.py
@@ -206,7 +206,8 @@ class WeiboCrawler(AbstractCrawler):
                 await self.wb_client.get_note_all_comments(
                     note_id=note_id,
                     crawl_interval=random.randint(1,3), # 微博对API的限流比较严重，所以延时提高一些
-                    callback=weibo_store.batch_update_weibo_note_comments
+                    callback=weibo_store.batch_update_weibo_note_comments,
+                    max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES
                 )
             except DataFetchError as ex:
                 utils.logger.error(f"[WeiboCrawler.get_note_comments] get note_id: {note_id} comment error: {ex}")
diff --git a/media_platform/xhs/client.py b/media_platform/xhs/client.py
index ee7f82e..44ea138 100644
--- a/media_platform/xhs/client.py
+++ b/media_platform/xhs/client.py
@@ -296,7 +296,7 @@ class XiaoHongShuClient(AbstractApiClient):
             note_id: 笔记ID
             crawl_interval: 爬取一次笔记的延迟单位（秒）
             callback: 一次笔记爬取结束后
-
+            max_count: 一次笔记爬取的最大评论数量
         Returns:
 
         """