refactor: 移除评论中指定数量和过滤特定关键词的逻辑

This commit is contained in:
Relakkes 2024-01-17 23:02:05 +08:00
parent e0f9a487e4
commit e940a41033
4 changed files with 11 additions and 57 deletions

View File

@ -29,8 +29,6 @@ CRAWLER_MAX_NOTES_COUNT = 20
# 并发爬虫数量控制 # 并发爬虫数量控制
MAX_CONCURRENCY_NUM = 4 MAX_CONCURRENCY_NUM = 4
# 每个视频/帖子抓取评论最大条数 (为0则不限制)
MAX_COMMENTS_PER_POST = 0
# 评论关键词筛选(只会留下包含关键词的评论,为空不限制) # 评论关键词筛选(只会留下包含关键词的评论,为空不限制)
COMMENT_KEYWORDS = [ COMMENT_KEYWORDS = [

View File

@ -167,8 +167,6 @@ class DOUYINClient:
crawl_interval: float = 1.0, crawl_interval: float = 1.0,
is_fetch_sub_comments=False, is_fetch_sub_comments=False,
callback: Optional[Callable] = None, callback: Optional[Callable] = None,
max_comments: int = None, # 新增参数来限制评论数
keywords: List[str] = None # 新增参数,用于关键字筛选
): ):
""" """
获取帖子的所有评论包括子评论 获取帖子的所有评论包括子评论
@ -176,50 +174,22 @@ class DOUYINClient:
:param crawl_interval: 抓取间隔 :param crawl_interval: 抓取间隔
:param is_fetch_sub_comments: 是否抓取子评论 :param is_fetch_sub_comments: 是否抓取子评论
:param callback: 回调函数用于处理抓取到的评论 :param callback: 回调函数用于处理抓取到的评论
:param max_comments: 最大评论数限制如果为0则不限制评论数
:param keywords: 需要过滤的关键字列表
:return: 评论列表 :return: 评论列表
""" """
result = [] result = []
comments_has_more = 1 comments_has_more = 1
comments_cursor = 0 comments_cursor = 0
collected_comments_count = 0 # 已收集的评论数 while comments_has_more:
while comments_has_more and (
max_comments is None or collected_comments_count < max_comments or max_comments == 0):
comments_res = await self.get_aweme_comments(aweme_id, comments_cursor) comments_res = await self.get_aweme_comments(aweme_id, comments_cursor)
comments_has_more = comments_res.get("has_more", 0) comments_has_more = comments_res.get("has_more", 0)
comments_cursor = comments_res.get("cursor", 0) comments_cursor = comments_res.get("cursor", 0)
comments = comments_res.get("comments", []) comments = comments_res.get("comments", [])
if not comments: if not comments:
continue continue
result.extend(comments)
# 在添加评论到结果列表之前进行关键字筛选
if keywords:
filtered_comments = []
for comment in comments:
if any(keyword in comment.get("text", "") for keyword in keywords):
filtered_comments.append(comment)
else:
filtered_comments = comments
# 如果设置了最大评论数限制并且不为0只添加未超过该限制的评论
if max_comments is not None and max_comments > 0:
remaining_quota = max_comments - collected_comments_count
comments_to_add = filtered_comments[:remaining_quota]
result.extend(comments_to_add)
collected_comments_count += len(comments_to_add)
else:
result.extend(filtered_comments)
collected_comments_count += len(filtered_comments)
if callback: # 如果有回调函数,就执行回调函数 if callback: # 如果有回调函数,就执行回调函数
await callback(aweme_id, comments) await callback(aweme_id, comments)
# 如果已经达到最大评论数且最大评论数不为0或者不需要子评论结束循环
if max_comments is not None and 0 < max_comments <= collected_comments_count:
break
await asyncio.sleep(crawl_interval) await asyncio.sleep(crawl_interval)
if not is_fetch_sub_comments: if not is_fetch_sub_comments:
continue continue

View File

@ -1,5 +1,6 @@
import asyncio import asyncio
import os import os
import random
from asyncio import Task from asyncio import Task
from typing import Any, Dict, List, Optional, Tuple from typing import Any, Dict, List, Optional, Tuple
@ -132,21 +133,20 @@ class DouYinCrawler(AbstractCrawler):
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
for aweme_id in aweme_list: for aweme_id in aweme_list:
task = asyncio.create_task( task = asyncio.create_task(
self.get_comments(aweme_id, semaphore, max_comments=config.MAX_COMMENTS_PER_POST), name=aweme_id) self.get_comments(aweme_id, semaphore), name=aweme_id)
task_list.append(task) task_list.append(task)
await asyncio.wait(task_list) await asyncio.wait(task_list)
async def get_comments(self, aweme_id: str, semaphore: asyncio.Semaphore, max_comments: int = None) -> None: async def get_comments(self, aweme_id: str, semaphore: asyncio.Semaphore) -> None:
async with semaphore: async with semaphore:
try: try:
# 将关键词列表传递给 get_aweme_all_comments 方法 # 将关键词列表传递给 get_aweme_all_comments 方法
comments = await self.dy_client.get_aweme_all_comments( comments = await self.dy_client.get_aweme_all_comments(
aweme_id=aweme_id, aweme_id=aweme_id,
max_comments=max_comments, # 最大数量 crawl_interval=random.random(),
keywords=config.COMMENT_KEYWORDS # 关键词列表 callback=douyin_store.batch_update_dy_aweme_comments
) )
# 现在返回的 comments 已经是经过关键词筛选的
await douyin_store.batch_update_dy_aweme_comments(aweme_id, comments)
utils.logger.info(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} comments have all been obtained and filtered ...") utils.logger.info(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} comments have all been obtained and filtered ...")
except DataFetchError as e: except DataFetchError as e:
utils.logger.error(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} get comments failed, error: {e}") utils.logger.error(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} get comments failed, error: {e}")

View File

@ -145,31 +145,17 @@ class KuaiShouClient:
result = [] result = []
pcursor = "" pcursor = ""
count = 0 # 计数器,记录已获取的评论数量
while pcursor != "no_more" and ( while pcursor != "no_more":
config.MAX_COMMENTS_PER_POST == 0 or count < config.MAX_COMMENTS_PER_POST):
comments_res = await self.get_video_comments(photo_id, pcursor) comments_res = await self.get_video_comments(photo_id, pcursor)
vision_commen_list = comments_res.get("visionCommentList", {}) vision_commen_list = comments_res.get("visionCommentList", {})
pcursor = vision_commen_list.get("pcursor", "") pcursor = vision_commen_list.get("pcursor", "")
comments = vision_commen_list.get("rootComments", []) comments = vision_commen_list.get("rootComments", [])
filtered_comments = [] # 存储经过关键词筛选后的评论
for comment in comments:
content = comment.get("content", "")
if not config.COMMENT_KEYWORDS or any(keyword in content for keyword in config.COMMENT_KEYWORDS):
filtered_comments.append(comment)
count += 1
if config.MAX_COMMENTS_PER_POST != 0 and count >= config.MAX_COMMENTS_PER_POST:
break
if callback: # 如果有回调函数,就执行回调函数 if callback: # 如果有回调函数,就执行回调函数
await callback(photo_id, filtered_comments) await callback(photo_id, comments)
result.extend(filtered_comments) result.extend(comments)
await asyncio.sleep(crawl_interval) await asyncio.sleep(crawl_interval)
if not is_fetch_sub_comments: if not is_fetch_sub_comments:
continue continue