From e940a41033be1964441e4d350419723c8fd36045 Mon Sep 17 00:00:00 2001
From: Relakkes <relakkes@gmail.com>
Date: Wed, 17 Jan 2024 23:02:05 +0800
Subject: [PATCH] =?UTF-8?q?refactor:=20=E7=A7=BB=E9=99=A4=E8=AF=84?=
 =?UTF-8?q?=E8=AE=BA=E4=B8=AD=E6=8C=87=E5=AE=9A=E6=95=B0=E9=87=8F=E5=92=8C?=
 =?UTF-8?q?=E8=BF=87=E6=BB=A4=E7=89=B9=E5=AE=9A=E5=85=B3=E9=94=AE=E8=AF=8D?=
 =?UTF-8?q?=E7=9A=84=E9=80=BB=E8=BE=91?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 config/base_config.py             |  2 --
 media_platform/douyin/client.py   | 34 ++-----------------------------
 media_platform/douyin/core.py     | 12 +++++------
 media_platform/kuaishou/client.py | 20 +++---------------
 4 files changed, 11 insertions(+), 57 deletions(-)

diff --git a/config/base_config.py b/config/base_config.py
index c8bf062..df3dc79 100644
--- a/config/base_config.py
+++ b/config/base_config.py
@@ -29,8 +29,6 @@ CRAWLER_MAX_NOTES_COUNT = 20
 # 并发爬虫数量控制
 MAX_CONCURRENCY_NUM = 4
 
-# 每个视频/帖子抓取评论最大条数 (为0则不限制)
-MAX_COMMENTS_PER_POST = 0
 
 # 评论关键词筛选(只会留下包含关键词的评论,为空不限制)
 COMMENT_KEYWORDS = [
diff --git a/media_platform/douyin/client.py b/media_platform/douyin/client.py
index 2cebc49..8e3230d 100644
--- a/media_platform/douyin/client.py
+++ b/media_platform/douyin/client.py
@@ -167,8 +167,6 @@ class DOUYINClient:
             crawl_interval: float = 1.0,
             is_fetch_sub_comments=False,
             callback: Optional[Callable] = None,
-            max_comments: int = None,  # 新增参数来限制评论数
-            keywords: List[str] = None  # 新增参数，用于关键字筛选
     ):
         """
         获取帖子的所有评论，包括子评论
@@ -176,50 +174,22 @@ class DOUYINClient:
         :param crawl_interval: 抓取间隔
         :param is_fetch_sub_comments: 是否抓取子评论
         :param callback: 回调函数，用于处理抓取到的评论
-        :param max_comments: 最大评论数限制，如果为0，则不限制评论数
-        :param keywords: 需要过滤的关键字列表
         :return: 评论列表
         """
         result = []
         comments_has_more = 1
         comments_cursor = 0
-        collected_comments_count = 0  # 已收集的评论数
-
-        while comments_has_more and (
-                max_comments is None or collected_comments_count < max_comments or max_comments == 0):
+        while comments_has_more:
             comments_res = await self.get_aweme_comments(aweme_id, comments_cursor)
             comments_has_more = comments_res.get("has_more", 0)
             comments_cursor = comments_res.get("cursor", 0)
             comments = comments_res.get("comments", [])
             if not comments:
                 continue
-
-            # 在添加评论到结果列表之前进行关键字筛选
-            if keywords:
-                filtered_comments = []
-                for comment in comments:
-                    if any(keyword in comment.get("text", "") for keyword in keywords):
-                        filtered_comments.append(comment)
-            else:
-                filtered_comments = comments
-
-            # 如果设置了最大评论数限制，并且不为0，只添加未超过该限制的评论
-            if max_comments is not None and max_comments > 0:
-                remaining_quota = max_comments - collected_comments_count
-                comments_to_add = filtered_comments[:remaining_quota]
-                result.extend(comments_to_add)
-                collected_comments_count += len(comments_to_add)
-            else:
-                result.extend(filtered_comments)
-                collected_comments_count += len(filtered_comments)
-
+            result.extend(comments)
             if callback:  # 如果有回调函数，就执行回调函数
                 await callback(aweme_id, comments)
 
-            # 如果已经达到最大评论数（且最大评论数不为0），或者不需要子评论，结束循环
-            if max_comments is not None and 0 < max_comments <= collected_comments_count:
-                break
-
             await asyncio.sleep(crawl_interval)
             if not is_fetch_sub_comments:
                 continue
diff --git a/media_platform/douyin/core.py b/media_platform/douyin/core.py
index 5aa1c9f..68a5e72 100644
--- a/media_platform/douyin/core.py
+++ b/media_platform/douyin/core.py
@@ -1,5 +1,6 @@
 import asyncio
 import os
+import random
 from asyncio import Task
 from typing import Any, Dict, List, Optional, Tuple
 
@@ -132,21 +133,20 @@ class DouYinCrawler(AbstractCrawler):
         semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
         for aweme_id in aweme_list:
             task = asyncio.create_task(
-                self.get_comments(aweme_id, semaphore, max_comments=config.MAX_COMMENTS_PER_POST), name=aweme_id)
+                self.get_comments(aweme_id, semaphore), name=aweme_id)
             task_list.append(task)
         await asyncio.wait(task_list)
 
-    async def get_comments(self, aweme_id: str, semaphore: asyncio.Semaphore, max_comments: int = None) -> None:
+    async def get_comments(self, aweme_id: str, semaphore: asyncio.Semaphore) -> None:
         async with semaphore:
             try:
                 # 将关键词列表传递给 get_aweme_all_comments 方法
                 comments = await self.dy_client.get_aweme_all_comments(
                     aweme_id=aweme_id,
-                    max_comments=max_comments, # 最大数量
-                    keywords=config.COMMENT_KEYWORDS  # 关键词列表
+                    crawl_interval=random.random(),
+                    callback=douyin_store.batch_update_dy_aweme_comments
+
                 )
-                # 现在返回的 comments 已经是经过关键词筛选的
-                await douyin_store.batch_update_dy_aweme_comments(aweme_id, comments)
                 utils.logger.info(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} comments have all been obtained and filtered ...")
             except DataFetchError as e:
                 utils.logger.error(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} get comments failed, error: {e}")
diff --git a/media_platform/kuaishou/client.py b/media_platform/kuaishou/client.py
index 9693c70..889cbb3 100644
--- a/media_platform/kuaishou/client.py
+++ b/media_platform/kuaishou/client.py
@@ -145,31 +145,17 @@ class KuaiShouClient:
 
         result = []
         pcursor = ""
-        count = 0  # 计数器，记录已获取的评论数量
 
-        while pcursor != "no_more" and (
-                config.MAX_COMMENTS_PER_POST == 0 or count < config.MAX_COMMENTS_PER_POST):
+        while pcursor != "no_more":
             comments_res = await self.get_video_comments(photo_id, pcursor)
             vision_commen_list = comments_res.get("visionCommentList", {})
             pcursor = vision_commen_list.get("pcursor", "")
             comments = vision_commen_list.get("rootComments", [])
 
-            filtered_comments = []  # 存储经过关键词筛选后的评论
-
-            for comment in comments:
-                content = comment.get("content", "")
-
-                if not config.COMMENT_KEYWORDS or any(keyword in content for keyword in config.COMMENT_KEYWORDS):
-                    filtered_comments.append(comment)
-
-                    count += 1
-                    if config.MAX_COMMENTS_PER_POST != 0 and count >= config.MAX_COMMENTS_PER_POST:
-                        break
-
             if callback:  # 如果有回调函数，就执行回调函数
-                await callback(photo_id, filtered_comments)
+                await callback(photo_id, comments)
 
-            result.extend(filtered_comments)
+            result.extend(comments)
             await asyncio.sleep(crawl_interval)
             if not is_fetch_sub_comments:
                 continue