From f17a85305e8e965cad686a21919c1da6cc0062f7 Mon Sep 17 00:00:00 2001
From: peanutsplash <b1300658700@outlook.com>
Date: Wed, 13 Dec 2023 23:53:12 +0800
Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E5=8A=9F=E8=83=BD:(=E5=93=94?=
 =?UTF-8?q?=E5=93=A9=E5=93=94=E5=93=A9,=E5=BF=AB=E6=89=8B,=E5=B0=8F?=
 =?UTF-8?q?=E7=BA=A2=E4=B9=A6)=E6=AF=8F=E4=B8=AA=E8=A7=86=E9=A2=91/?=
 =?UTF-8?q?=E5=B8=96=E5=AD=90=E6=8A=93=E5=8F=96=E8=AF=84=E8=AE=BA=E6=9C=80?=
 =?UTF-8?q?=E5=A4=A7=E6=9D=A1=E6=95=B0=E9=99=90=E5=88=B6,=E8=AF=84?=
 =?UTF-8?q?=E8=AE=BA=E5=85=B3=E9=94=AE=E8=AF=8D=E7=AD=9B=E9=80=89?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 config/base_config.py             |  8 ++++----
 media_platform/bilibili/core.py   | 28 ++++++++++++++++++++++++----
 media_platform/douyin/core.py     |  4 ++--
 media_platform/kuaishou/client.py | 26 +++++++++++++++++++++-----
 media_platform/xhs/core.py        | 18 +++++++++++++++++-
 5 files changed, 68 insertions(+), 16 deletions(-)

diff --git a/config/base_config.py b/config/base_config.py
index 4da697c..f46b779 100644
--- a/config/base_config.py
+++ b/config/base_config.py
@@ -29,11 +29,11 @@ CRAWLER_MAX_NOTES_COUNT = 20
 # 并发爬虫数量控制
 MAX_CONCURRENCY_NUM = 10
 
-# 抖音每个视频抓取评论最大条数 (为0则不限制)
-DY_MAX_COMMENTS_PER_POST = 10
+# 每个视频/帖子抓取评论最大条数 (为0则不限制)
+MAX_COMMENTS_PER_POST = 10
 
-# 抖音评论关键词筛选(只会留下包含关键词的评论,为空不限制)
-DY_COMMENT_KEYWORDS = [
+# 评论关键词筛选(只会留下包含关键词的评论,为空不限制)
+COMMENT_KEYWORDS = [
     "我"
     # ........................
 ]
diff --git a/media_platform/bilibili/core.py b/media_platform/bilibili/core.py
index ac837c6..af7aa0b 100644
--- a/media_platform/bilibili/core.py
+++ b/media_platform/bilibili/core.py
@@ -146,18 +146,38 @@ class BilibiliCrawler(AbstractCrawler):
         """
         async with semaphore:
             try:
-                utils.logger.info(f"[get_comments] bengin get video_id: {video_id} comments ...")
-                await self.bili_client.get_video_all_comments(
+                utils.logger.info(f"[get_comments] begin get video_id: {video_id} comments ...")
+                # Read keyword and quantity from config
+                keywords = config.COMMENT_KEYWORDS
+                max_comments = config.MAX_COMMENTS_PER_POST
+
+                # Download comments
+                all_comments = await self.bili_client.get_video_all_comments(
                     video_id=video_id,
                     crawl_interval=random.random(),
-                    callback=bilibili.batch_update_bilibili_video_comments
                 )
+
+                # Filter comments by keyword
+                if keywords:
+                    filtered_comments = [
+                        comment for comment in all_comments if
+                        any(keyword in comment["content"]["message"] for keyword in keywords)
+                    ]
+                else:
+                    filtered_comments = all_comments
+
+                # Limit the number of comments
+                if max_comments > 0:
+                    filtered_comments = filtered_comments[:max_comments]
+
+                # Update bilibili video comments
+                await bilibili.batch_update_bilibili_video_comments(video_id, filtered_comments)
+
             except DataFetchError as ex:
                 utils.logger.error(f"[get_comments] get video_id: {video_id} comment error: {ex}")
             except Exception as e:
                 utils.logger.error(f"[get_comments] may be been blocked, err:", e)
 
-
     async def get_specified_videos(self):
         """
         get specified videos info
diff --git a/media_platform/douyin/core.py b/media_platform/douyin/core.py
index 0a2432a..4411780 100644
--- a/media_platform/douyin/core.py
+++ b/media_platform/douyin/core.py
@@ -132,7 +132,7 @@ class DouYinCrawler(AbstractCrawler):
         semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
         for aweme_id in aweme_list:
             task = asyncio.create_task(
-                self.get_comments(aweme_id, semaphore, max_comments=config.DY_MAX_COMMENTS_PER_POST), name=aweme_id)
+                self.get_comments(aweme_id, semaphore, max_comments=config.MAX_COMMENTS_PER_POST), name=aweme_id)
             task_list.append(task)
         await asyncio.wait(task_list)
 
@@ -143,7 +143,7 @@ class DouYinCrawler(AbstractCrawler):
                 comments = await self.dy_client.get_aweme_all_comments(
                     aweme_id=aweme_id,
                     max_comments=max_comments, # 最大数量
-                    keywords=config.DY_COMMENT_KEYWORDS  # 关键词列表
+                    keywords=config.COMMENT_KEYWORDS  # 关键词列表
                 )
                 # 现在返回的 comments 已经是经过关键词筛选的
                 await douyin.batch_update_dy_aweme_comments(aweme_id, comments)
diff --git a/media_platform/kuaishou/client.py b/media_platform/kuaishou/client.py
index 6b120e8..76456b1 100644
--- a/media_platform/kuaishou/client.py
+++ b/media_platform/kuaishou/client.py
@@ -7,6 +7,7 @@ from urllib.parse import urlencode
 import httpx
 from playwright.async_api import BrowserContext, Page
 
+import config
 from tools import utils
 
 from .exception import DataFetchError, IPBlockError
@@ -124,7 +125,7 @@ class KuaiShouClient:
         return await self.post("", post_data)
 
     async def get_video_all_comments(self, photo_id: str, crawl_interval: float = 1.0, is_fetch_sub_comments=False,
-                                     callback: Optional[Callable] = None, ):
+                                     callback: Optional[Callable] = None):
         """
         get video all comments include sub comments
         :param photo_id:
@@ -136,18 +137,33 @@ class KuaiShouClient:
 
         result = []
         pcursor = ""
-        while pcursor != "no_more":
+        count = 0  # 计数器，记录已获取的评论数量
+
+        while pcursor != "no_more" and (
+                config.MAX_COMMENTS_PER_POST == 0 or count < config.MAX_COMMENTS_PER_POST):
             comments_res = await self.get_video_comments(photo_id, pcursor)
             vision_commen_list = comments_res.get("visionCommentList", {})
             pcursor = vision_commen_list.get("pcursor", "")
             comments = vision_commen_list.get("rootComments", [])
 
-            if callback:  # 如果有回调函数，就执行回调函数
-                await callback(photo_id, comments)
+            filtered_comments = []  # 存储经过关键词筛选后的评论
 
+            for comment in comments:
+                content = comment.get("content", "")
+
+                if not config.COMMENT_KEYWORDS or any(keyword in content for keyword in config.COMMENT_KEYWORDS):
+                    filtered_comments.append(comment)
+
+                    count += 1
+                    if config.MAX_COMMENTS_PER_POST != 0 and count >= config.MAX_COMMENTS_PER_POST:
+                        break
+
+            if callback:  # 如果有回调函数，就执行回调函数
+                await callback(photo_id, filtered_comments)
+
+            result.extend(filtered_comments)
             await asyncio.sleep(crawl_interval)
             if not is_fetch_sub_comments:
-                result.extend(comments)
                 continue
             # todo handle get sub comments
         return result
diff --git a/media_platform/xhs/core.py b/media_platform/xhs/core.py
index bf0246f..c497241 100644
--- a/media_platform/xhs/core.py
+++ b/media_platform/xhs/core.py
@@ -152,11 +152,27 @@ class XiaoHongShuCrawler(AbstractCrawler):
         await asyncio.gather(*task_list)
 
     async def get_comments(self, note_id: str, semaphore: asyncio.Semaphore):
-        """Get note comments"""
+        """Fetch a note's comments, keep keyword matches up to the configured cap, and store them."""
         async with semaphore:
             utils.logger.info(f"Begin get note id comments {note_id}")
             all_comments = await self.xhs_client.get_note_all_comments(note_id=note_id, crawl_interval=random.random())
+
+            # Read the keyword filter and per-post cap from config (absent -> no filter / no cap)
+            keywords = getattr(config, 'COMMENT_KEYWORDS', [])
+            max_comments = getattr(config, 'MAX_COMMENTS_PER_POST', 0)
+
+            # Keep comments that contain any keyword; an empty keyword list keeps everything
+            filtered_comments = []
             for comment in all_comments:
+                # A missing or null 'content' counts as empty text instead of raising
+                if not keywords or any(keyword in (comment.get('content') or '') for keyword in keywords):
+                    filtered_comments.append(comment)
+                    # Stop once the cap is reached; a cap of 0 means unlimited
+                    if max_comments and len(filtered_comments) >= max_comments:
+                        break
+
+            # Persist only the comments that passed the filter
+            for comment in filtered_comments:
                 await xhs_model.update_xhs_note_comment(note_id=note_id, comment_item=comment)
 
     @staticmethod