refactor: 移除评论中指定数量和过滤特定关键词的逻辑
This commit is contained in:
parent
e0f9a487e4
commit
e940a41033
|
@ -29,8 +29,6 @@ CRAWLER_MAX_NOTES_COUNT = 20
|
||||||
# 并发爬虫数量控制
|
# 并发爬虫数量控制
|
||||||
MAX_CONCURRENCY_NUM = 4
|
MAX_CONCURRENCY_NUM = 4
|
||||||
|
|
||||||
# 每个视频/帖子抓取评论最大条数 (为0则不限制)
|
|
||||||
MAX_COMMENTS_PER_POST = 0
|
|
||||||
|
|
||||||
# 评论关键词筛选(只会留下包含关键词的评论,为空不限制)
|
# 评论关键词筛选(只会留下包含关键词的评论,为空不限制)
|
||||||
COMMENT_KEYWORDS = [
|
COMMENT_KEYWORDS = [
|
||||||
|
|
|
@ -167,8 +167,6 @@ class DOUYINClient:
|
||||||
crawl_interval: float = 1.0,
|
crawl_interval: float = 1.0,
|
||||||
is_fetch_sub_comments=False,
|
is_fetch_sub_comments=False,
|
||||||
callback: Optional[Callable] = None,
|
callback: Optional[Callable] = None,
|
||||||
max_comments: int = None, # 新增参数来限制评论数
|
|
||||||
keywords: List[str] = None # 新增参数,用于关键字筛选
|
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
获取帖子的所有评论,包括子评论
|
获取帖子的所有评论,包括子评论
|
||||||
|
@ -176,50 +174,22 @@ class DOUYINClient:
|
||||||
:param crawl_interval: 抓取间隔
|
:param crawl_interval: 抓取间隔
|
||||||
:param is_fetch_sub_comments: 是否抓取子评论
|
:param is_fetch_sub_comments: 是否抓取子评论
|
||||||
:param callback: 回调函数,用于处理抓取到的评论
|
:param callback: 回调函数,用于处理抓取到的评论
|
||||||
:param max_comments: 最大评论数限制,如果为0,则不限制评论数
|
|
||||||
:param keywords: 需要过滤的关键字列表
|
|
||||||
:return: 评论列表
|
:return: 评论列表
|
||||||
"""
|
"""
|
||||||
result = []
|
result = []
|
||||||
comments_has_more = 1
|
comments_has_more = 1
|
||||||
comments_cursor = 0
|
comments_cursor = 0
|
||||||
collected_comments_count = 0 # 已收集的评论数
|
while comments_has_more:
|
||||||
|
|
||||||
while comments_has_more and (
|
|
||||||
max_comments is None or collected_comments_count < max_comments or max_comments == 0):
|
|
||||||
comments_res = await self.get_aweme_comments(aweme_id, comments_cursor)
|
comments_res = await self.get_aweme_comments(aweme_id, comments_cursor)
|
||||||
comments_has_more = comments_res.get("has_more", 0)
|
comments_has_more = comments_res.get("has_more", 0)
|
||||||
comments_cursor = comments_res.get("cursor", 0)
|
comments_cursor = comments_res.get("cursor", 0)
|
||||||
comments = comments_res.get("comments", [])
|
comments = comments_res.get("comments", [])
|
||||||
if not comments:
|
if not comments:
|
||||||
continue
|
continue
|
||||||
|
result.extend(comments)
|
||||||
# 在添加评论到结果列表之前进行关键字筛选
|
|
||||||
if keywords:
|
|
||||||
filtered_comments = []
|
|
||||||
for comment in comments:
|
|
||||||
if any(keyword in comment.get("text", "") for keyword in keywords):
|
|
||||||
filtered_comments.append(comment)
|
|
||||||
else:
|
|
||||||
filtered_comments = comments
|
|
||||||
|
|
||||||
# 如果设置了最大评论数限制,并且不为0,只添加未超过该限制的评论
|
|
||||||
if max_comments is not None and max_comments > 0:
|
|
||||||
remaining_quota = max_comments - collected_comments_count
|
|
||||||
comments_to_add = filtered_comments[:remaining_quota]
|
|
||||||
result.extend(comments_to_add)
|
|
||||||
collected_comments_count += len(comments_to_add)
|
|
||||||
else:
|
|
||||||
result.extend(filtered_comments)
|
|
||||||
collected_comments_count += len(filtered_comments)
|
|
||||||
|
|
||||||
if callback: # 如果有回调函数,就执行回调函数
|
if callback: # 如果有回调函数,就执行回调函数
|
||||||
await callback(aweme_id, comments)
|
await callback(aweme_id, comments)
|
||||||
|
|
||||||
# 如果已经达到最大评论数(且最大评论数不为0),或者不需要子评论,结束循环
|
|
||||||
if max_comments is not None and 0 < max_comments <= collected_comments_count:
|
|
||||||
break
|
|
||||||
|
|
||||||
await asyncio.sleep(crawl_interval)
|
await asyncio.sleep(crawl_interval)
|
||||||
if not is_fetch_sub_comments:
|
if not is_fetch_sub_comments:
|
||||||
continue
|
continue
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
import asyncio
|
import asyncio
|
||||||
import os
|
import os
|
||||||
|
import random
|
||||||
from asyncio import Task
|
from asyncio import Task
|
||||||
from typing import Any, Dict, List, Optional, Tuple
|
from typing import Any, Dict, List, Optional, Tuple
|
||||||
|
|
||||||
|
@ -132,21 +133,20 @@ class DouYinCrawler(AbstractCrawler):
|
||||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||||
for aweme_id in aweme_list:
|
for aweme_id in aweme_list:
|
||||||
task = asyncio.create_task(
|
task = asyncio.create_task(
|
||||||
self.get_comments(aweme_id, semaphore, max_comments=config.MAX_COMMENTS_PER_POST), name=aweme_id)
|
self.get_comments(aweme_id, semaphore), name=aweme_id)
|
||||||
task_list.append(task)
|
task_list.append(task)
|
||||||
await asyncio.wait(task_list)
|
await asyncio.wait(task_list)
|
||||||
|
|
||||||
async def get_comments(self, aweme_id: str, semaphore: asyncio.Semaphore, max_comments: int = None) -> None:
|
async def get_comments(self, aweme_id: str, semaphore: asyncio.Semaphore) -> None:
|
||||||
async with semaphore:
|
async with semaphore:
|
||||||
try:
|
try:
|
||||||
# 将关键词列表传递给 get_aweme_all_comments 方法
|
# 将关键词列表传递给 get_aweme_all_comments 方法
|
||||||
comments = await self.dy_client.get_aweme_all_comments(
|
comments = await self.dy_client.get_aweme_all_comments(
|
||||||
aweme_id=aweme_id,
|
aweme_id=aweme_id,
|
||||||
max_comments=max_comments, # 最大数量
|
crawl_interval=random.random(),
|
||||||
keywords=config.COMMENT_KEYWORDS # 关键词列表
|
callback=douyin_store.batch_update_dy_aweme_comments
|
||||||
|
|
||||||
)
|
)
|
||||||
# 现在返回的 comments 已经是经过关键词筛选的
|
|
||||||
await douyin_store.batch_update_dy_aweme_comments(aweme_id, comments)
|
|
||||||
utils.logger.info(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} comments have all been obtained and filtered ...")
|
utils.logger.info(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} comments have all been obtained and filtered ...")
|
||||||
except DataFetchError as e:
|
except DataFetchError as e:
|
||||||
utils.logger.error(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} get comments failed, error: {e}")
|
utils.logger.error(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} get comments failed, error: {e}")
|
||||||
|
|
|
@ -145,31 +145,17 @@ class KuaiShouClient:
|
||||||
|
|
||||||
result = []
|
result = []
|
||||||
pcursor = ""
|
pcursor = ""
|
||||||
count = 0 # 计数器,记录已获取的评论数量
|
|
||||||
|
|
||||||
while pcursor != "no_more" and (
|
while pcursor != "no_more":
|
||||||
config.MAX_COMMENTS_PER_POST == 0 or count < config.MAX_COMMENTS_PER_POST):
|
|
||||||
comments_res = await self.get_video_comments(photo_id, pcursor)
|
comments_res = await self.get_video_comments(photo_id, pcursor)
|
||||||
vision_commen_list = comments_res.get("visionCommentList", {})
|
vision_commen_list = comments_res.get("visionCommentList", {})
|
||||||
pcursor = vision_commen_list.get("pcursor", "")
|
pcursor = vision_commen_list.get("pcursor", "")
|
||||||
comments = vision_commen_list.get("rootComments", [])
|
comments = vision_commen_list.get("rootComments", [])
|
||||||
|
|
||||||
filtered_comments = [] # 存储经过关键词筛选后的评论
|
|
||||||
|
|
||||||
for comment in comments:
|
|
||||||
content = comment.get("content", "")
|
|
||||||
|
|
||||||
if not config.COMMENT_KEYWORDS or any(keyword in content for keyword in config.COMMENT_KEYWORDS):
|
|
||||||
filtered_comments.append(comment)
|
|
||||||
|
|
||||||
count += 1
|
|
||||||
if config.MAX_COMMENTS_PER_POST != 0 and count >= config.MAX_COMMENTS_PER_POST:
|
|
||||||
break
|
|
||||||
|
|
||||||
if callback: # 如果有回调函数,就执行回调函数
|
if callback: # 如果有回调函数,就执行回调函数
|
||||||
await callback(photo_id, filtered_comments)
|
await callback(photo_id, comments)
|
||||||
|
|
||||||
result.extend(filtered_comments)
|
result.extend(comments)
|
||||||
await asyncio.sleep(crawl_interval)
|
await asyncio.sleep(crawl_interval)
|
||||||
if not is_fetch_sub_comments:
|
if not is_fetch_sub_comments:
|
||||||
continue
|
continue
|
||||||
|
|
Loading…
Reference in New Issue