From e940a41033be1964441e4d350419723c8fd36045 Mon Sep 17 00:00:00 2001 From: Relakkes Date: Wed, 17 Jan 2024 23:02:05 +0800 Subject: [PATCH] =?UTF-8?q?refactor:=20=E7=A7=BB=E9=99=A4=E8=AF=84?= =?UTF-8?q?=E8=AE=BA=E4=B8=AD=E6=8C=87=E5=AE=9A=E6=95=B0=E9=87=8F=E5=92=8C?= =?UTF-8?q?=E8=BF=87=E6=BB=A4=E7=89=B9=E5=AE=9A=E5=85=B3=E9=94=AE=E8=AF=8D?= =?UTF-8?q?=E7=9A=84=E9=80=BB=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- config/base_config.py | 2 -- media_platform/douyin/client.py | 34 ++----------------------------- media_platform/douyin/core.py | 12 +++++------ media_platform/kuaishou/client.py | 20 +++--------------- 4 files changed, 11 insertions(+), 57 deletions(-) diff --git a/config/base_config.py b/config/base_config.py index c8bf062..df3dc79 100644 --- a/config/base_config.py +++ b/config/base_config.py @@ -29,8 +29,6 @@ CRAWLER_MAX_NOTES_COUNT = 20 # 并发爬虫数量控制 MAX_CONCURRENCY_NUM = 4 -# 每个视频/帖子抓取评论最大条数 (为0则不限制) -MAX_COMMENTS_PER_POST = 0 # 评论关键词筛选(只会留下包含关键词的评论,为空不限制) COMMENT_KEYWORDS = [ diff --git a/media_platform/douyin/client.py b/media_platform/douyin/client.py index 2cebc49..8e3230d 100644 --- a/media_platform/douyin/client.py +++ b/media_platform/douyin/client.py @@ -167,8 +167,6 @@ class DOUYINClient: crawl_interval: float = 1.0, is_fetch_sub_comments=False, callback: Optional[Callable] = None, - max_comments: int = None, # 新增参数来限制评论数 - keywords: List[str] = None # 新增参数,用于关键字筛选 ): """ 获取帖子的所有评论,包括子评论 @@ -176,50 +174,22 @@ class DOUYINClient: :param crawl_interval: 抓取间隔 :param is_fetch_sub_comments: 是否抓取子评论 :param callback: 回调函数,用于处理抓取到的评论 - :param max_comments: 最大评论数限制,如果为0,则不限制评论数 - :param keywords: 需要过滤的关键字列表 :return: 评论列表 """ result = [] comments_has_more = 1 comments_cursor = 0 - collected_comments_count = 0 # 已收集的评论数 - - while comments_has_more and ( - max_comments is None or collected_comments_count < max_comments or max_comments == 0): + while comments_has_more: comments_res = await self.get_aweme_comments(aweme_id, comments_cursor) comments_has_more = comments_res.get("has_more", 0) comments_cursor = comments_res.get("cursor", 0) comments = comments_res.get("comments", []) if not comments: continue - - # 在添加评论到结果列表之前进行关键字筛选 - if keywords: - filtered_comments = [] - for comment in comments: - if any(keyword in comment.get("text", "") for keyword in keywords): - filtered_comments.append(comment) - else: - filtered_comments = comments - - # 如果设置了最大评论数限制,并且不为0,只添加未超过该限制的评论 - if max_comments is not None and max_comments > 0: - remaining_quota = max_comments - collected_comments_count - comments_to_add = filtered_comments[:remaining_quota] - result.extend(comments_to_add) - collected_comments_count += len(comments_to_add) - else: - result.extend(filtered_comments) - collected_comments_count += len(filtered_comments) - + result.extend(comments) if callback: # 如果有回调函数,就执行回调函数 await callback(aweme_id, comments) - # 如果已经达到最大评论数(且最大评论数不为0),或者不需要子评论,结束循环 - if max_comments is not None and 0 < max_comments <= collected_comments_count: - break - await asyncio.sleep(crawl_interval) if not is_fetch_sub_comments: continue diff --git a/media_platform/douyin/core.py b/media_platform/douyin/core.py index 5aa1c9f..68a5e72 100644 --- a/media_platform/douyin/core.py +++ b/media_platform/douyin/core.py @@ -1,5 +1,6 @@ import asyncio import os +import random from asyncio import Task from typing import Any, Dict, List, Optional, Tuple @@ -132,21 +133,20 @@ class DouYinCrawler(AbstractCrawler): semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) for aweme_id in aweme_list: task = asyncio.create_task( - self.get_comments(aweme_id, semaphore, max_comments=config.MAX_COMMENTS_PER_POST), name=aweme_id) + self.get_comments(aweme_id, semaphore), name=aweme_id) task_list.append(task) await asyncio.wait(task_list) - async def get_comments(self, aweme_id: str, semaphore: asyncio.Semaphore, max_comments: int = None) -> None: + async def get_comments(self, aweme_id: str, semaphore: asyncio.Semaphore) -> None: async with semaphore: try: # 将关键词列表传递给 get_aweme_all_comments 方法 comments = await self.dy_client.get_aweme_all_comments( aweme_id=aweme_id, - max_comments=max_comments, # 最大数量 - keywords=config.COMMENT_KEYWORDS # 关键词列表 + crawl_interval=random.random(), + callback=douyin_store.batch_update_dy_aweme_comments + ) - # 现在返回的 comments 已经是经过关键词筛选的 - await douyin_store.batch_update_dy_aweme_comments(aweme_id, comments) utils.logger.info(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} comments have all been obtained and filtered ...") except DataFetchError as e: utils.logger.error(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} get comments failed, error: {e}") diff --git a/media_platform/kuaishou/client.py b/media_platform/kuaishou/client.py index 9693c70..889cbb3 100644 --- a/media_platform/kuaishou/client.py +++ b/media_platform/kuaishou/client.py @@ -145,31 +145,17 @@ class KuaiShouClient: result = [] pcursor = "" - count = 0 # 计数器,记录已获取的评论数量 - while pcursor != "no_more" and ( - config.MAX_COMMENTS_PER_POST == 0 or count < config.MAX_COMMENTS_PER_POST): + while pcursor != "no_more": comments_res = await self.get_video_comments(photo_id, pcursor) vision_commen_list = comments_res.get("visionCommentList", {}) pcursor = vision_commen_list.get("pcursor", "") comments = vision_commen_list.get("rootComments", []) - filtered_comments = [] # 存储经过关键词筛选后的评论 - - for comment in comments: - content = comment.get("content", "") - - if not config.COMMENT_KEYWORDS or any(keyword in content for keyword in config.COMMENT_KEYWORDS): - filtered_comments.append(comment) - - count += 1 - if config.MAX_COMMENTS_PER_POST != 0 and count >= config.MAX_COMMENTS_PER_POST: - break - if callback: # 如果有回调函数,就执行回调函数 - await callback(photo_id, filtered_comments) + await callback(photo_id, comments) - result.extend(filtered_comments) + result.extend(comments) await asyncio.sleep(crawl_interval) if not is_fetch_sub_comments: continue