Merge pull request #462 from FloRainRJY/xiaohongshu_comment_number_restrict

Restrict the maximum number of comments crawled per note/video on all platforms (All_platform_comments_restrict)
This commit is contained in:
程序员阿江(Relakkes) 2024-10-24 15:31:13 +08:00 committed by GitHub
commit 5a27ad089c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 44 additions and 19 deletions

View File

@ -11,7 +11,7 @@
# 基础配置 # 基础配置
PLATFORM = "xhs" PLATFORM = "xhs"
KEYWORDS = "编程副业,编程兼职" # 关键词搜索配置,以英文逗号分隔 KEYWORDS = "城市" # 关键词搜索配置,以英文逗号分隔
LOGIN_TYPE = "qrcode" # qrcode or phone or cookie LOGIN_TYPE = "qrcode" # qrcode or phone or cookie
COOKIES = "" COOKIES = ""
# 具体值参见media_platform.xxx.field下的枚举值暂时只支持小红书 # 具体值参见media_platform.xxx.field下的枚举值暂时只支持小红书
@ -48,7 +48,7 @@ USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name
START_PAGE = 1 START_PAGE = 1
# 爬取视频/帖子的数量控制 # 爬取视频/帖子的数量控制
CRAWLER_MAX_NOTES_COUNT = 200 CRAWLER_MAX_NOTES_COUNT = 10
# 并发爬虫数量控制 # 并发爬虫数量控制
MAX_CONCURRENCY_NUM = 1 MAX_CONCURRENCY_NUM = 1

View File

@ -222,20 +222,23 @@ class BilibiliClient(AbstractApiClient):
return await self.get(uri, post_data) return await self.get(uri, post_data)
async def get_video_all_comments(self, video_id: str, crawl_interval: float = 1.0, is_fetch_sub_comments=False, async def get_video_all_comments(self, video_id: str, crawl_interval: float = 1.0, is_fetch_sub_comments=False,
callback: Optional[Callable] = None, ): callback: Optional[Callable] = None,
max_count: int = 10,):
""" """
get video all comments include sub comments get video all comments include sub comments
:param video_id: :param video_id:
:param crawl_interval: :param crawl_interval:
:param is_fetch_sub_comments: :param is_fetch_sub_comments:
:param callback: :param callback:
max_count: 一次笔记爬取的最大评论数量
:return: :return:
""" """
result = [] result = []
is_end = False is_end = False
next_page = 0 next_page = 0
while not is_end: while not is_end and len(result) < max_count:
comments_res = await self.get_video_comments(video_id, CommentOrderType.DEFAULT, next_page) comments_res = await self.get_video_comments(video_id, CommentOrderType.DEFAULT, next_page)
cursor_info: Dict = comments_res.get("cursor") cursor_info: Dict = comments_res.get("cursor")
comment_list: List[Dict] = comments_res.get("replies", []) comment_list: List[Dict] = comments_res.get("replies", [])
@ -249,6 +252,8 @@ class BilibiliClient(AbstractApiClient):
await self.get_video_all_level_two_comments( await self.get_video_all_level_two_comments(
video_id, comment_id, CommentOrderType.DEFAULT, 10, crawl_interval, callback) video_id, comment_id, CommentOrderType.DEFAULT, 10, crawl_interval, callback)
} }
if len(result) + len(comment_list) > max_count:
comment_list = comment_list[:max_count - len(result)]
if callback: # 如果有回调函数,就执行回调函数 if callback: # 如果有回调函数,就执行回调函数
await callback(video_id, comment_list) await callback(video_id, comment_list)
await asyncio.sleep(crawl_interval) await asyncio.sleep(crawl_interval)

View File

@ -182,7 +182,8 @@ class BilibiliCrawler(AbstractCrawler):
video_id=video_id, video_id=video_id,
crawl_interval=random.random(), crawl_interval=random.random(),
is_fetch_sub_comments=config.ENABLE_GET_SUB_COMMENTS, is_fetch_sub_comments=config.ENABLE_GET_SUB_COMMENTS,
callback=bilibili_store.batch_update_bilibili_video_comments callback=bilibili_store.batch_update_bilibili_video_comments,
max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
) )
except DataFetchError as ex: except DataFetchError as ex:

View File

@ -230,6 +230,7 @@ class DOUYINClient(AbstractApiClient):
crawl_interval: float = 1.0, crawl_interval: float = 1.0,
is_fetch_sub_comments=False, is_fetch_sub_comments=False,
callback: Optional[Callable] = None, callback: Optional[Callable] = None,
max_count: int = 10,
): ):
""" """
获取帖子的所有评论包括子评论 获取帖子的所有评论包括子评论
@ -237,18 +238,21 @@ class DOUYINClient(AbstractApiClient):
:param crawl_interval: 抓取间隔 :param crawl_interval: 抓取间隔
:param is_fetch_sub_comments: 是否抓取子评论 :param is_fetch_sub_comments: 是否抓取子评论
:param callback: 回调函数用于处理抓取到的评论 :param callback: 回调函数用于处理抓取到的评论
:param max_count: 一次帖子爬取的最大评论数量
:return: 评论列表 :return: 评论列表
""" """
result = [] result = []
comments_has_more = 1 comments_has_more = 1
comments_cursor = 0 comments_cursor = 0
while comments_has_more: while comments_has_more and len(result) < max_count:
comments_res = await self.get_aweme_comments(aweme_id, comments_cursor) comments_res = await self.get_aweme_comments(aweme_id, comments_cursor)
comments_has_more = comments_res.get("has_more", 0) comments_has_more = comments_res.get("has_more", 0)
comments_cursor = comments_res.get("cursor", 0) comments_cursor = comments_res.get("cursor", 0)
comments = comments_res.get("comments", []) comments = comments_res.get("comments", [])
if not comments: if not comments:
continue continue
if len(result) + len(comments) > max_count:
comments = comments[:max_count - len(result)]
result.extend(comments) result.extend(comments)
if callback: # 如果有回调函数,就执行回调函数 if callback: # 如果有回调函数,就执行回调函数
await callback(aweme_id, comments) await callback(aweme_id, comments)

View File

@ -179,7 +179,8 @@ class DouYinCrawler(AbstractCrawler):
aweme_id=aweme_id, aweme_id=aweme_id,
crawl_interval=random.random(), crawl_interval=random.random(),
is_fetch_sub_comments=config.ENABLE_GET_SUB_COMMENTS, is_fetch_sub_comments=config.ENABLE_GET_SUB_COMMENTS,
callback=douyin_store.batch_update_dy_aweme_comments callback=douyin_store.batch_update_dy_aweme_comments,
max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES
) )
utils.logger.info( utils.logger.info(
f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} comments have all been obtained and filtered ...") f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} comments have all been obtained and filtered ...")

View File

@ -189,27 +189,29 @@ class KuaiShouClient(AbstractApiClient):
photo_id: str, photo_id: str,
crawl_interval: float = 1.0, crawl_interval: float = 1.0,
callback: Optional[Callable] = None, callback: Optional[Callable] = None,
max_count: int = 10,
): ):
""" """
get video all comments include sub comments get video all comments include sub comments
:param photo_id: :param photo_id:
:param crawl_interval: :param crawl_interval:
:param callback: :param callback:
:param max_count:
:return: :return:
""" """
result = [] result = []
pcursor = "" pcursor = ""
while pcursor != "no_more": while pcursor != "no_more" and len(result) < max_count:
comments_res = await self.get_video_comments(photo_id, pcursor) comments_res = await self.get_video_comments(photo_id, pcursor)
vision_commen_list = comments_res.get("visionCommentList", {}) vision_commen_list = comments_res.get("visionCommentList", {})
pcursor = vision_commen_list.get("pcursor", "") pcursor = vision_commen_list.get("pcursor", "")
comments = vision_commen_list.get("rootComments", []) comments = vision_commen_list.get("rootComments", [])
if len(result) + len(comments) > max_count:
comments = comments[:max_count - len(result)]
if callback: # 如果有回调函数,就执行回调函数 if callback: # 如果有回调函数,就执行回调函数
await callback(photo_id, comments) await callback(photo_id, comments)
result.extend(comments) result.extend(comments)
await asyncio.sleep(crawl_interval) await asyncio.sleep(crawl_interval)
sub_comments = await self.get_comments_all_sub_comments( sub_comments = await self.get_comments_all_sub_comments(

View File

@ -186,7 +186,8 @@ class KuaishouCrawler(AbstractCrawler):
await self.ks_client.get_video_all_comments( await self.ks_client.get_video_all_comments(
photo_id=video_id, photo_id=video_id,
crawl_interval=random.random(), crawl_interval=random.random(),
callback=kuaishou_store.batch_update_ks_video_comments callback=kuaishou_store.batch_update_ks_video_comments,
max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES
) )
except DataFetchError as ex: except DataFetchError as ex:
utils.logger.error(f"[KuaishouCrawler.get_comments] get video_id: {video_id} comment error: {ex}") utils.logger.error(f"[KuaishouCrawler.get_comments] get video_id: {video_id} comment error: {ex}")

View File

@ -204,21 +204,23 @@ class BaiduTieBaClient(AbstractApiClient):
return self._page_extractor.extract_note_detail(page_content) return self._page_extractor.extract_note_detail(page_content)
async def get_note_all_comments(self, note_detail: TiebaNote, crawl_interval: float = 1.0, async def get_note_all_comments(self, note_detail: TiebaNote, crawl_interval: float = 1.0,
callback: Optional[Callable] = None) -> List[TiebaComment]: callback: Optional[Callable] = None,
max_count: int = 10,
) -> List[TiebaComment]:
""" """
获取指定帖子下的所有一级评论该方法会一直查找一个帖子下的所有评论信息 获取指定帖子下的所有一级评论该方法会一直查找一个帖子下的所有评论信息
Args: Args:
note_detail: 帖子详情对象 note_detail: 帖子详情对象
crawl_interval: 爬取一次笔记的延迟单位 crawl_interval: 爬取一次笔记的延迟单位
callback: 一次笔记爬取结束后 callback: 一次笔记爬取结束后
max_count: 一次帖子爬取的最大评论数量
Returns: Returns:
""" """
uri = f"/p/{note_detail.note_id}" uri = f"/p/{note_detail.note_id}"
result: List[TiebaComment] = [] result: List[TiebaComment] = []
current_page = 1 current_page = 1
while note_detail.total_replay_page >= current_page: while note_detail.total_replay_page >= current_page and len(result) < max_count:
params = { params = {
"pn": current_page "pn": current_page
} }
@ -227,6 +229,8 @@ class BaiduTieBaClient(AbstractApiClient):
note_id=note_detail.note_id) note_id=note_detail.note_id)
if not comments: if not comments:
break break
if len(result) + len(comments) > max_count:
comments = comments[:max_count - len(result)]
if callback: if callback:
await callback(note_detail.note_id, comments) await callback(note_detail.note_id, comments)
result.extend(comments) result.extend(comments)

View File

@ -226,7 +226,8 @@ class TieBaCrawler(AbstractCrawler):
await self.tieba_client.get_note_all_comments( await self.tieba_client.get_note_all_comments(
note_detail=note_detail, note_detail=note_detail,
crawl_interval=random.random(), crawl_interval=random.random(),
callback=tieba_store.batch_update_tieba_note_comments callback=tieba_store.batch_update_tieba_note_comments,
max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES
) )
async def get_creators_and_notes(self) -> None: async def get_creators_and_notes(self) -> None:

View File

@ -149,23 +149,28 @@ class WeiboClient:
return await self.get(uri, params, headers=headers) return await self.get(uri, params, headers=headers)
async def get_note_all_comments(self, note_id: str, crawl_interval: float = 1.0, async def get_note_all_comments(self, note_id: str, crawl_interval: float = 1.0,
callback: Optional[Callable] = None, ): callback: Optional[Callable] = None,
max_count: int = 10,
):
""" """
get note all comments include sub comments get note all comments include sub comments
:param note_id: :param note_id:
:param crawl_interval: :param crawl_interval:
:param callback: :param callback:
:param max_count:
:return: :return:
""" """
result = [] result = []
is_end = False is_end = False
max_id = -1 max_id = -1
while not is_end: while not is_end and len(result) < max_count:
comments_res = await self.get_note_comments(note_id, max_id) comments_res = await self.get_note_comments(note_id, max_id)
max_id: int = comments_res.get("max_id") max_id: int = comments_res.get("max_id")
comment_list: List[Dict] = comments_res.get("data", []) comment_list: List[Dict] = comments_res.get("data", [])
is_end = max_id == 0 is_end = max_id == 0
if len(result) + len(comment_list) > max_count:
comment_list = comment_list[:max_count - len(result)]
if callback: # 如果有回调函数,就执行回调函数 if callback: # 如果有回调函数,就执行回调函数
await callback(note_id, comment_list) await callback(note_id, comment_list)
await asyncio.sleep(crawl_interval) await asyncio.sleep(crawl_interval)

View File

@ -206,7 +206,8 @@ class WeiboCrawler(AbstractCrawler):
await self.wb_client.get_note_all_comments( await self.wb_client.get_note_all_comments(
note_id=note_id, note_id=note_id,
crawl_interval=random.randint(1,3), # 微博对API的限流比较严重所以延时提高一些 crawl_interval=random.randint(1,3), # 微博对API的限流比较严重所以延时提高一些
callback=weibo_store.batch_update_weibo_note_comments callback=weibo_store.batch_update_weibo_note_comments,
max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES
) )
except DataFetchError as ex: except DataFetchError as ex:
utils.logger.error(f"[WeiboCrawler.get_note_comments] get note_id: {note_id} comment error: {ex}") utils.logger.error(f"[WeiboCrawler.get_note_comments] get note_id: {note_id} comment error: {ex}")

View File

@ -297,7 +297,7 @@ class XiaoHongShuClient(AbstractApiClient):
note_id: 笔记ID note_id: 笔记ID
crawl_interval: 爬取一次笔记的延迟单位 crawl_interval: 爬取一次笔记的延迟单位
callback: 一次笔记爬取结束后 callback: 一次笔记爬取结束后
max_count: 一次笔记爬取的最大评论数量
Returns: Returns:
""" """