diff --git a/config/base_config.py b/config/base_config.py
index 4da697c..f46b779 100644
--- a/config/base_config.py
+++ b/config/base_config.py
@@ -29,11 +29,11 @@ CRAWLER_MAX_NOTES_COUNT = 20
 
 # Maximum number of concurrent crawler tasks
 MAX_CONCURRENCY_NUM = 10
 
-# Maximum number of comments to crawl per Douyin video (0 = no limit)
-DY_MAX_COMMENTS_PER_POST = 10
+# Maximum number of comments to crawl per video/post (0 = no limit)
+MAX_COMMENTS_PER_POST = 10
 
-# Douyin comment keyword filter (only comments containing a keyword are kept; empty list = no filtering)
-DY_COMMENT_KEYWORDS = [
+# Comment keyword filter (only comments containing a keyword are kept; empty list = no filtering)
+COMMENT_KEYWORDS = [
     "我"  # ........................
 ]
diff --git a/media_platform/bilibili/core.py b/media_platform/bilibili/core.py
index ac837c6..af7aa0b 100644
--- a/media_platform/bilibili/core.py
+++ b/media_platform/bilibili/core.py
@@ -146,18 +146,38 @@ class BilibiliCrawler(AbstractCrawler):
         """
         async with semaphore:
             try:
-                utils.logger.info(f"[get_comments] bengin get video_id: {video_id} comments ...")
-                await self.bili_client.get_video_all_comments(
+                utils.logger.info(f"[get_comments] begin get video_id: {video_id} comments ...")
+                # Read keyword and quantity from config
+                keywords = config.COMMENT_KEYWORDS
+                max_comments = config.MAX_COMMENTS_PER_POST
+
+                # Download comments
+                all_comments = await self.bili_client.get_video_all_comments(
                     video_id=video_id,
                     crawl_interval=random.random(),
-                    callback=bilibili.batch_update_bilibili_video_comments
                 )
+
+                # Filter comments by keyword
+                if keywords:
+                    filtered_comments = [
+                        comment for comment in all_comments if
+                        any(keyword in comment["content"]["message"] for keyword in keywords)
+                    ]
+                else:
+                    filtered_comments = all_comments
+
+                # Limit the number of comments
+                if max_comments > 0:
+                    filtered_comments = filtered_comments[:max_comments]
+
+                # Update bilibili video comments
+                await bilibili.batch_update_bilibili_video_comments(video_id, filtered_comments)
+
             except DataFetchError as ex:
                 utils.logger.error(f"[get_comments] get video_id: {video_id} comment error: {ex}")
             except Exception as e:
                 utils.logger.error(f"[get_comments] may be been blocked, err:", e)
-
     async def get_specified_videos(self):
         """
         get specified videos info
diff --git a/media_platform/douyin/core.py b/media_platform/douyin/core.py
index 0a2432a..4411780 100644
--- a/media_platform/douyin/core.py
+++ b/media_platform/douyin/core.py
@@ -132,7 +132,7 @@ class DouYinCrawler(AbstractCrawler):
         semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
         for aweme_id in aweme_list:
             task = asyncio.create_task(
-                self.get_comments(aweme_id, semaphore, max_comments=config.DY_MAX_COMMENTS_PER_POST), name=aweme_id)
+                self.get_comments(aweme_id, semaphore, max_comments=config.MAX_COMMENTS_PER_POST), name=aweme_id)
             task_list.append(task)
         await asyncio.wait(task_list)
 
@@ -143,7 +143,7 @@ class DouYinCrawler(AbstractCrawler):
             comments = await self.dy_client.get_aweme_all_comments(
                 aweme_id=aweme_id,
                 max_comments=max_comments,  # maximum number of comments
-                keywords=config.DY_COMMENT_KEYWORDS  # keyword list
+                keywords=config.COMMENT_KEYWORDS  # keyword list
             )
             # the returned comments are already keyword-filtered
             await douyin.batch_update_dy_aweme_comments(aweme_id, comments)
diff --git a/media_platform/kuaishou/client.py b/media_platform/kuaishou/client.py
index 6b120e8..76456b1 100644
--- a/media_platform/kuaishou/client.py
+++ b/media_platform/kuaishou/client.py
@@ -7,6 +7,7 @@ from urllib.parse import urlencode
 import httpx
 from playwright.async_api import BrowserContext, Page
 
+import config
 from tools import utils
 
 from .exception import DataFetchError, IPBlockError
@@ -124,7 +125,7 @@ class KuaiShouClient:
         return await self.post("", post_data)
 
     async def get_video_all_comments(self, photo_id: str, crawl_interval: float = 1.0, is_fetch_sub_comments=False,
-                                     callback: Optional[Callable] = None, ):
+                                     callback: Optional[Callable] = None):
         """
         get video all comments include sub comments
         :param photo_id:
@@ -136,18 +137,33 @@ class KuaiShouClient:
 
         result = []
         pcursor = ""
-        while pcursor != "no_more":
+        count = 0  # counter for the number of comments processed so far
+
+        while pcursor != "no_more" and (
+                config.MAX_COMMENTS_PER_POST == 0 or count < config.MAX_COMMENTS_PER_POST):
             comments_res = await self.get_video_comments(photo_id, pcursor)
             vision_commen_list = comments_res.get("visionCommentList", {})
             pcursor = vision_commen_list.get("pcursor", "")
             comments = vision_commen_list.get("rootComments", [])
-            if callback:  # if a callback was provided, invoke it
-                await callback(photo_id, comments)
+
+            filtered_comments = []  # comments that pass the keyword filter
+            for comment in comments:
+                content = comment.get("content", "")
+
+                if not config.COMMENT_KEYWORDS or any(keyword in content for keyword in config.COMMENT_KEYWORDS):
+                    filtered_comments.append(comment)
+
+                count += 1
+                if config.MAX_COMMENTS_PER_POST != 0 and count >= config.MAX_COMMENTS_PER_POST:
+                    break
+
+            if callback:  # if a callback was provided, invoke it
+                await callback(photo_id, filtered_comments)
+
+            result.extend(filtered_comments)
             await asyncio.sleep(crawl_interval)
             if not is_fetch_sub_comments:
-                result.extend(comments)
                 continue
             # todo handle get sub comments
         return result
diff --git a/media_platform/xhs/core.py b/media_platform/xhs/core.py
index bf0246f..c497241 100644
--- a/media_platform/xhs/core.py
+++ b/media_platform/xhs/core.py
@@ -152,11 +152,27 @@ class XiaoHongShuCrawler(AbstractCrawler):
         await asyncio.gather(*task_list)
 
     async def get_comments(self, note_id: str, semaphore: asyncio.Semaphore):
-        """Get note comments"""
+        """Get note comments with keyword filtering and quantity limitation"""
         async with semaphore:
             utils.logger.info(f"Begin get note id comments {note_id}")
             all_comments = await self.xhs_client.get_note_all_comments(note_id=note_id, crawl_interval=random.random())
+
+            # Read keyword filter and quantity limit from config
+            keywords = getattr(config, 'COMMENT_KEYWORDS', [])
+            max_comments = getattr(config, 'MAX_COMMENTS_PER_POST', 0)
+
+            # Filter comments
+            filtered_comments = []
             for comment in all_comments:
+                # Check whether the comment content contains any keyword
+                if not keywords or any(keyword in comment['content'] for keyword in keywords):
+                    filtered_comments.append(comment)
+                    # Stop adding comments once the maximum is reached
+                    if max_comments and len(filtered_comments) >= max_comments:
+                        break
+
+            # Persist the filtered comments
+            for comment in filtered_comments:
                 await xhs_model.update_xhs_note_comment(note_id=note_id, comment_item=comment)
 
     @staticmethod
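
Note (not part of the diff above): each platform module applies the same two-step rule after fetching, keep only comments whose text contains one of config.COMMENT_KEYWORDS, then cap the result at config.MAX_COMMENTS_PER_POST (0 means unlimited). Below is a minimal sketch of that shared logic as a standalone helper; the filter_comments name and the get_text accessor are assumptions for illustration, not code introduced by this change.

from typing import Callable, Dict, List


def filter_comments(comments: List[Dict], get_text: Callable[[Dict], str],
                    keywords: List[str], max_count: int) -> List[Dict]:
    # Keep comments whose text contains any keyword (an empty keyword list keeps everything).
    if keywords:
        comments = [c for c in comments if any(k in get_text(c) for k in keywords)]
    # Cap the number of comments; 0 means no limit.
    if max_count > 0:
        comments = comments[:max_count]
    return comments


# Example mirroring the bilibili hunk, where the comment text lives under content.message:
# filtered = filter_comments(all_comments, lambda c: c["content"]["message"],
#                            config.COMMENT_KEYWORDS, config.MAX_COMMENTS_PER_POST)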