添加功能:(哔哩哔哩,快手,小红书)每个视频/帖子抓取评论最大条数限制,评论关键词筛选

2023-12-13 23:53:12 +08:00 · 2023-12-13 23:53:12 +08:00 · f17a85305e
parent 5c42076ff8
commit f17a85305e
5 changed files with 68 additions and 16 deletions
--- a/config/base_config.py
+++ b/config/base_config.py
@@ -29,11 +29,11 @@ CRAWLER_MAX_NOTES_COUNT = 20
 # 并发爬虫数量控制
 MAX_CONCURRENCY_NUM = 10

-# 抖音每个视频抓取评论最大条数 (为0则不限制)
-DY_MAX_COMMENTS_PER_POST = 10
+# 每个视频/帖子抓取评论最大条数 (为0则不限制)
+MAX_COMMENTS_PER_POST = 10

-# 抖音评论关键词筛选(只会留下包含关键词的评论,为空不限制)
-DY_COMMENT_KEYWORDS = [
+# 评论关键词筛选(只会留下包含关键词的评论,为空不限制)
+COMMENT_KEYWORDS = [
    "我"
    # ........................
 ]
--- a/media_platform/bilibili/core.py
+++ b/media_platform/bilibili/core.py
@@ -146,18 +146,38 @@ class BilibiliCrawler(AbstractCrawler):
        """
        async with semaphore:
            try:
-                utils.logger.info(f"[get_comments] bengin get video_id: {video_id} comments ...")
-                await self.bili_client.get_video_all_comments(
+                utils.logger.info(f"[get_comments] begin get video_id: {video_id} comments ...")
+                # Read keyword and quantity from config
+                keywords = config.COMMENT_KEYWORDS
+                max_comments = config.MAX_COMMENTS_PER_POST
+
+                # Download comments
+                all_comments = await self.bili_client.get_video_all_comments(
                    video_id=video_id,
                    crawl_interval=random.random(),
-                    callback=bilibili.batch_update_bilibili_video_comments
                )
+
+                # Filter comments by keyword
+                if keywords:
+                    filtered_comments = [
+                        comment for comment in all_comments if
+                        any(keyword in comment["content"]["message"] for keyword in keywords)
+                    ]
+                else:
+                    filtered_comments = all_comments
+
+                # Limit the number of comments
+                if max_comments > 0:
+                    filtered_comments = filtered_comments[:max_comments]
+
+                # Update bilibili video comments
+                await bilibili.batch_update_bilibili_video_comments(video_id, filtered_comments)
+
            except DataFetchError as ex:
                utils.logger.error(f"[get_comments] get video_id: {video_id} comment error: {ex}")
            except Exception as e:
                utils.logger.error(f"[get_comments] may be been blocked, err:", e)

-
    async def get_specified_videos(self):
        """
        get specified videos info
--- a/media_platform/douyin/core.py
+++ b/media_platform/douyin/core.py
@@ -132,7 +132,7 @@ class DouYinCrawler(AbstractCrawler):
        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
        for aweme_id in aweme_list:
            task = asyncio.create_task(
-                self.get_comments(aweme_id, semaphore, max_comments=config.DY_MAX_COMMENTS_PER_POST), name=aweme_id)
+                self.get_comments(aweme_id, semaphore, max_comments=config.MAX_COMMENTS_PER_POST), name=aweme_id)
            task_list.append(task)
        await asyncio.wait(task_list)

@@ -143,7 +143,7 @@ class DouYinCrawler(AbstractCrawler):
                comments = await self.dy_client.get_aweme_all_comments(
                    aweme_id=aweme_id,
                    max_comments=max_comments, # 最大数量
-                    keywords=config.DY_COMMENT_KEYWORDS  # 关键词列表
+                    keywords=config.COMMENT_KEYWORDS  # 关键词列表
                )
                # 现在返回的 comments 已经是经过关键词筛选的
                await douyin.batch_update_dy_aweme_comments(aweme_id, comments)
--- a/media_platform/kuaishou/client.py
+++ b/media_platform/kuaishou/client.py
@@ -7,6 +7,7 @@ from urllib.parse import urlencode
 import httpx
 from playwright.async_api import BrowserContext, Page

+import config
 from tools import utils

 from .exception import DataFetchError, IPBlockError
@@ -124,7 +125,7 @@ class KuaiShouClient:
        return await self.post("", post_data)

    async def get_video_all_comments(self, photo_id: str, crawl_interval: float = 1.0, is_fetch_sub_comments=False,
-                                     callback: Optional[Callable] = None, ):
+                                     callback: Optional[Callable] = None):
        """
        get video all comments include sub comments
        :param photo_id:
@@ -136,18 +137,33 @@ class KuaiShouClient:

        result = []
        pcursor = ""
-        while pcursor != "no_more":
+        count = 0  # 计数器，记录已获取的评论数量
+
+        while pcursor != "no_more" and (
+                config.MAX_COMMENTS_PER_POST == 0 or count < config.MAX_COMMENTS_PER_POST):
            comments_res = await self.get_video_comments(photo_id, pcursor)
            vision_commen_list = comments_res.get("visionCommentList", {})
            pcursor = vision_commen_list.get("pcursor", "")
            comments = vision_commen_list.get("rootComments", [])

-            if callback:  # 如果有回调函数，就执行回调函数
-                await callback(photo_id, comments)
+            filtered_comments = []  # 存储经过关键词筛选后的评论

+            for comment in comments:
+                content = comment.get("content", "")
+
+                if not config.COMMENT_KEYWORDS or any(keyword in content for keyword in config.COMMENT_KEYWORDS):
+                    filtered_comments.append(comment)
+
+                    count += 1
+                    if config.MAX_COMMENTS_PER_POST != 0 and count >= config.MAX_COMMENTS_PER_POST:
+                        break
+
+            if callback:  # 如果有回调函数，就执行回调函数
+                await callback(photo_id, filtered_comments)
+
+            result.extend(filtered_comments)
            await asyncio.sleep(crawl_interval)
            if not is_fetch_sub_comments:
-                result.extend(comments)
                continue
            # todo handle get sub comments
        return result
--- a/media_platform/xhs/core.py
+++ b/media_platform/xhs/core.py
@@ -152,11 +152,27 @@ class XiaoHongShuCrawler(AbstractCrawler):
        await asyncio.gather(*task_list)

    async def get_comments(self, note_id: str, semaphore: asyncio.Semaphore):
-        """Get note comments"""
+        """Get note comments with keyword filtering and quantity limitation"""
        async with semaphore:
            utils.logger.info(f"Begin get note id comments {note_id}")
            all_comments = await self.xhs_client.get_note_all_comments(note_id=note_id, crawl_interval=random.random())
+
+            # 从配置文件中读取关键词和数量限制
+            keywords = getattr(config, 'COMMENT_KEYWORDS', [])
+            max_comments = getattr(config, 'MAX_COMMENTS_PER_POST', 0)
+
+            # 过滤评论
+            filtered_comments = []
            for comment in all_comments:
+                # 检查评论内容是否包含关键词
+                if not keywords or any(keyword in comment['content'] for keyword in keywords):
+                    filtered_comments.append(comment)
+                    # 如果达到最大评论数量限制，则停止添加更多评论
+                    if max_comments and len(filtered_comments) >= max_comments:
+                        break
+
+            # 更新或保存过滤后的评论
+            for comment in filtered_comments:
                await xhs_model.update_xhs_note_comment(note_id=note_id, comment_item=comment)

    @staticmethod