diff --git a/config/base_config.py b/config/base_config.py index a78ab5e..8e55b99 100644 --- a/config/base_config.py +++ b/config/base_config.py @@ -11,7 +11,7 @@ # 基础配置 PLATFORM = "xhs" -KEYWORDS = "编程副业,编程兼职" # 关键词搜索配置,以英文逗号分隔 +KEYWORDS = "城市" # 关键词搜索配置,以英文逗号分隔 LOGIN_TYPE = "qrcode" # qrcode or phone or cookie COOKIES = "" # 具体值参见media_platform.xxx.field下的枚举值,暂时只支持小红书 @@ -48,7 +48,7 @@ USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name START_PAGE = 1 # 爬取视频/帖子的数量控制 -CRAWLER_MAX_NOTES_COUNT = 200 +CRAWLER_MAX_NOTES_COUNT = 10 # 并发爬虫数量控制 MAX_CONCURRENCY_NUM = 1 diff --git a/media_platform/bilibili/client.py b/media_platform/bilibili/client.py index 9bca496..c8338c8 100644 --- a/media_platform/bilibili/client.py +++ b/media_platform/bilibili/client.py @@ -222,20 +222,23 @@ class BilibiliClient(AbstractApiClient): return await self.get(uri, post_data) async def get_video_all_comments(self, video_id: str, crawl_interval: float = 1.0, is_fetch_sub_comments=False, - callback: Optional[Callable] = None, ): + callback: Optional[Callable] = None, + max_count: int = 10,): """ get video all comments include sub comments :param video_id: :param crawl_interval: :param is_fetch_sub_comments: :param callback: + :param max_count: 一次笔记爬取的最大评论数量 + + :return: """ result = [] is_end = False next_page = 0 - while not is_end: + while not is_end and len(result) < max_count: comments_res = await self.get_video_comments(video_id, CommentOrderType.DEFAULT, next_page) cursor_info: Dict = comments_res.get("cursor") comment_list: List[Dict] = comments_res.get("replies", []) @@ -249,6 +252,8 @@ class BilibiliClient(AbstractApiClient): await self.get_video_all_level_two_comments( video_id, comment_id, CommentOrderType.DEFAULT, 10, crawl_interval, callback) } + if len(result) + len(comment_list) > max_count: + comment_list = comment_list[:max_count - len(result)] if callback: # 如果有回调函数,就执行回调函数 await callback(video_id, comment_list) await asyncio.sleep(crawl_interval) diff --git 
a/media_platform/bilibili/core.py b/media_platform/bilibili/core.py index d377e90..e698bbd 100644 --- a/media_platform/bilibili/core.py +++ b/media_platform/bilibili/core.py @@ -182,7 +182,8 @@ class BilibiliCrawler(AbstractCrawler): video_id=video_id, crawl_interval=random.random(), is_fetch_sub_comments=config.ENABLE_GET_SUB_COMMENTS, - callback=bilibili_store.batch_update_bilibili_video_comments + callback=bilibili_store.batch_update_bilibili_video_comments, + max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES, ) except DataFetchError as ex: diff --git a/media_platform/douyin/client.py b/media_platform/douyin/client.py index a06e9da..37951dd 100644 --- a/media_platform/douyin/client.py +++ b/media_platform/douyin/client.py @@ -230,6 +230,7 @@ class DOUYINClient(AbstractApiClient): crawl_interval: float = 1.0, is_fetch_sub_comments=False, callback: Optional[Callable] = None, + max_count: int = 10, ): """ 获取帖子的所有评论,包括子评论 @@ -237,18 +238,21 @@ class DOUYINClient(AbstractApiClient): :param crawl_interval: 抓取间隔 :param is_fetch_sub_comments: 是否抓取子评论 :param callback: 回调函数,用于处理抓取到的评论 + :param max_count: 一次帖子爬取的最大评论数量 :return: 评论列表 """ result = [] comments_has_more = 1 comments_cursor = 0 - while comments_has_more: + while comments_has_more and len(result) < max_count: comments_res = await self.get_aweme_comments(aweme_id, comments_cursor) comments_has_more = comments_res.get("has_more", 0) comments_cursor = comments_res.get("cursor", 0) comments = comments_res.get("comments", []) if not comments: continue + if len(result) + len(comments) > max_count: + comments = comments[:max_count - len(result)] result.extend(comments) if callback: # 如果有回调函数,就执行回调函数 await callback(aweme_id, comments) diff --git a/media_platform/douyin/core.py b/media_platform/douyin/core.py index 3aea4a1..426b33b 100644 --- a/media_platform/douyin/core.py +++ b/media_platform/douyin/core.py @@ -179,7 +179,8 @@ class DouYinCrawler(AbstractCrawler): aweme_id=aweme_id, 
crawl_interval=random.random(), is_fetch_sub_comments=config.ENABLE_GET_SUB_COMMENTS, - callback=douyin_store.batch_update_dy_aweme_comments + callback=douyin_store.batch_update_dy_aweme_comments, + max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES ) utils.logger.info( f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} comments have all been obtained and filtered ...") diff --git a/media_platform/kuaishou/client.py b/media_platform/kuaishou/client.py index 9aaa690..e728ed8 100644 --- a/media_platform/kuaishou/client.py +++ b/media_platform/kuaishou/client.py @@ -189,27 +189,29 @@ class KuaiShouClient(AbstractApiClient): photo_id: str, crawl_interval: float = 1.0, callback: Optional[Callable] = None, + max_count: int = 10, ): """ get video all comments include sub comments :param photo_id: :param crawl_interval: :param callback: + :param max_count: :return: """ result = [] pcursor = "" - while pcursor != "no_more": + while pcursor != "no_more" and len(result) < max_count: comments_res = await self.get_video_comments(photo_id, pcursor) vision_commen_list = comments_res.get("visionCommentList", {}) pcursor = vision_commen_list.get("pcursor", "") comments = vision_commen_list.get("rootComments", []) - + if len(result) + len(comments) > max_count: + comments = comments[:max_count - len(result)] if callback: # 如果有回调函数,就执行回调函数 await callback(photo_id, comments) - result.extend(comments) await asyncio.sleep(crawl_interval) sub_comments = await self.get_comments_all_sub_comments( diff --git a/media_platform/kuaishou/core.py b/media_platform/kuaishou/core.py index 0456f01..609f705 100644 --- a/media_platform/kuaishou/core.py +++ b/media_platform/kuaishou/core.py @@ -186,7 +186,8 @@ class KuaishouCrawler(AbstractCrawler): await self.ks_client.get_video_all_comments( photo_id=video_id, crawl_interval=random.random(), - callback=kuaishou_store.batch_update_ks_video_comments + callback=kuaishou_store.batch_update_ks_video_comments, + 
max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES ) except DataFetchError as ex: utils.logger.error(f"[KuaishouCrawler.get_comments] get video_id: {video_id} comment error: {ex}") diff --git a/media_platform/tieba/client.py b/media_platform/tieba/client.py index 25cac59..1adbbe4 100644 --- a/media_platform/tieba/client.py +++ b/media_platform/tieba/client.py @@ -204,21 +204,23 @@ class BaiduTieBaClient(AbstractApiClient): return self._page_extractor.extract_note_detail(page_content) async def get_note_all_comments(self, note_detail: TiebaNote, crawl_interval: float = 1.0, - callback: Optional[Callable] = None) -> List[TiebaComment]: + callback: Optional[Callable] = None, + max_count: int = 10, + ) -> List[TiebaComment]: """ 获取指定帖子下的所有一级评论,该方法会一直查找一个帖子下的所有评论信息 Args: note_detail: 帖子详情对象 crawl_interval: 爬取一次笔记的延迟单位(秒) callback: 一次笔记爬取结束后 - + max_count: 一次帖子爬取的最大评论数量 Returns: """ uri = f"/p/{note_detail.note_id}" result: List[TiebaComment] = [] current_page = 1 - while note_detail.total_replay_page >= current_page: + while note_detail.total_replay_page >= current_page and len(result) < max_count: params = { "pn": current_page } @@ -227,6 +229,8 @@ class BaiduTieBaClient(AbstractApiClient): note_id=note_detail.note_id) if not comments: break + if len(result) + len(comments) > max_count: + comments = comments[:max_count - len(result)] if callback: await callback(note_detail.note_id, comments) result.extend(comments) diff --git a/media_platform/tieba/core.py b/media_platform/tieba/core.py index f1eb2ff..567a1fa 100644 --- a/media_platform/tieba/core.py +++ b/media_platform/tieba/core.py @@ -226,7 +226,8 @@ class TieBaCrawler(AbstractCrawler): await self.tieba_client.get_note_all_comments( note_detail=note_detail, crawl_interval=random.random(), - callback=tieba_store.batch_update_tieba_note_comments + callback=tieba_store.batch_update_tieba_note_comments, + max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES ) async def get_creators_and_notes(self) -> None: diff 
--git a/media_platform/weibo/client.py b/media_platform/weibo/client.py index c6f25af..92ab9c7 100644 --- a/media_platform/weibo/client.py +++ b/media_platform/weibo/client.py @@ -149,23 +149,28 @@ class WeiboClient: return await self.get(uri, params, headers=headers) async def get_note_all_comments(self, note_id: str, crawl_interval: float = 1.0, - callback: Optional[Callable] = None, ): + callback: Optional[Callable] = None, + max_count: int = 10, + ): """ get note all comments include sub comments :param note_id: :param crawl_interval: :param callback: + :param max_count: :return: """ result = [] is_end = False max_id = -1 - while not is_end: + while not is_end and len(result) < max_count: comments_res = await self.get_note_comments(note_id, max_id) max_id: int = comments_res.get("max_id") comment_list: List[Dict] = comments_res.get("data", []) is_end = max_id == 0 + if len(result) + len(comment_list) > max_count: + comment_list = comment_list[:max_count - len(result)] if callback: # 如果有回调函数,就执行回调函数 await callback(note_id, comment_list) await asyncio.sleep(crawl_interval) diff --git a/media_platform/weibo/core.py b/media_platform/weibo/core.py index a6e9729..e72fdf1 100644 --- a/media_platform/weibo/core.py +++ b/media_platform/weibo/core.py @@ -206,7 +206,8 @@ class WeiboCrawler(AbstractCrawler): await self.wb_client.get_note_all_comments( note_id=note_id, crawl_interval=random.randint(1,3), # 微博对API的限流比较严重,所以延时提高一些 - callback=weibo_store.batch_update_weibo_note_comments + callback=weibo_store.batch_update_weibo_note_comments, + max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES ) except DataFetchError as ex: utils.logger.error(f"[WeiboCrawler.get_note_comments] get note_id: {note_id} comment error: {ex}") diff --git a/media_platform/xhs/client.py b/media_platform/xhs/client.py index 4d83c0b..397c290 100644 --- a/media_platform/xhs/client.py +++ b/media_platform/xhs/client.py @@ -297,7 +297,7 @@ class XiaoHongShuClient(AbstractApiClient): note_id: 笔记ID 
crawl_interval: 爬取一次笔记的延迟单位(秒) callback: 一次笔记爬取结束后 - + max_count: 一次笔记爬取的最大评论数量 Returns: """