All_platform_comments_restrict
This commit is contained in:
parent
19269c66fd
commit
7e53c4acfc
|
@ -11,7 +11,7 @@
|
||||||
|
|
||||||
# 基础配置
|
# 基础配置
|
||||||
PLATFORM = "xhs"
|
PLATFORM = "xhs"
|
||||||
KEYWORDS = "编程副业,编程兼职" # 关键词搜索配置,以英文逗号分隔
|
KEYWORDS = "城市" # 关键词搜索配置,以英文逗号分隔
|
||||||
LOGIN_TYPE = "qrcode" # qrcode or phone or cookie
|
LOGIN_TYPE = "qrcode" # qrcode or phone or cookie
|
||||||
COOKIES = ""
|
COOKIES = ""
|
||||||
# 具体值参见media_platform.xxx.field下的枚举值,暂时只支持小红书
|
# 具体值参见media_platform.xxx.field下的枚举值,暂时只支持小红书
|
||||||
|
@ -48,7 +48,7 @@ USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name
|
||||||
START_PAGE = 1
|
START_PAGE = 1
|
||||||
|
|
||||||
# 爬取视频/帖子的数量控制
|
# 爬取视频/帖子的数量控制
|
||||||
CRAWLER_MAX_NOTES_COUNT = 200
|
CRAWLER_MAX_NOTES_COUNT = 10
|
||||||
|
|
||||||
# 并发爬虫数量控制
|
# 并发爬虫数量控制
|
||||||
MAX_CONCURRENCY_NUM = 1
|
MAX_CONCURRENCY_NUM = 1
|
||||||
|
|
|
@ -222,20 +222,23 @@ class BilibiliClient(AbstractApiClient):
|
||||||
return await self.get(uri, post_data)
|
return await self.get(uri, post_data)
|
||||||
|
|
||||||
async def get_video_all_comments(self, video_id: str, crawl_interval: float = 1.0, is_fetch_sub_comments=False,
|
async def get_video_all_comments(self, video_id: str, crawl_interval: float = 1.0, is_fetch_sub_comments=False,
|
||||||
callback: Optional[Callable] = None, ):
|
callback: Optional[Callable] = None,
|
||||||
|
max_count: int = 10,):
|
||||||
"""
|
"""
|
||||||
get video all comments include sub comments
|
get video all comments include sub comments
|
||||||
:param video_id:
|
:param video_id:
|
||||||
:param crawl_interval:
|
:param crawl_interval:
|
||||||
:param is_fetch_sub_comments:
|
:param is_fetch_sub_comments:
|
||||||
:param callback:
|
:param callback:
|
||||||
|
:param max_count: 一次视频爬取的最大评论数量
|
||||||
|
|
||||||
:return:
|
:return:
|
||||||
"""
|
"""
|
||||||
|
|
||||||
result = []
|
result = []
|
||||||
is_end = False
|
is_end = False
|
||||||
next_page = 0
|
next_page = 0
|
||||||
while not is_end:
|
while not is_end and len(result) < max_count:
|
||||||
comments_res = await self.get_video_comments(video_id, CommentOrderType.DEFAULT, next_page)
|
comments_res = await self.get_video_comments(video_id, CommentOrderType.DEFAULT, next_page)
|
||||||
cursor_info: Dict = comments_res.get("cursor")
|
cursor_info: Dict = comments_res.get("cursor")
|
||||||
comment_list: List[Dict] = comments_res.get("replies", [])
|
comment_list: List[Dict] = comments_res.get("replies", [])
|
||||||
|
@ -249,6 +252,8 @@ class BilibiliClient(AbstractApiClient):
|
||||||
await self.get_video_all_level_two_comments(
|
await self.get_video_all_level_two_comments(
|
||||||
video_id, comment_id, CommentOrderType.DEFAULT, 10, crawl_interval, callback)
|
video_id, comment_id, CommentOrderType.DEFAULT, 10, crawl_interval, callback)
|
||||||
}
|
}
|
||||||
|
if len(result) + len(comment_list) > max_count:
|
||||||
|
comment_list = comment_list[:max_count - len(result)]
|
||||||
if callback: # 如果有回调函数,就执行回调函数
|
if callback: # 如果有回调函数,就执行回调函数
|
||||||
await callback(video_id, comment_list)
|
await callback(video_id, comment_list)
|
||||||
await asyncio.sleep(crawl_interval)
|
await asyncio.sleep(crawl_interval)
|
||||||
|
|
|
@ -182,7 +182,8 @@ class BilibiliCrawler(AbstractCrawler):
|
||||||
video_id=video_id,
|
video_id=video_id,
|
||||||
crawl_interval=random.random(),
|
crawl_interval=random.random(),
|
||||||
is_fetch_sub_comments=config.ENABLE_GET_SUB_COMMENTS,
|
is_fetch_sub_comments=config.ENABLE_GET_SUB_COMMENTS,
|
||||||
callback=bilibili_store.batch_update_bilibili_video_comments
|
callback=bilibili_store.batch_update_bilibili_video_comments,
|
||||||
|
max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
|
||||||
)
|
)
|
||||||
|
|
||||||
except DataFetchError as ex:
|
except DataFetchError as ex:
|
||||||
|
|
|
@ -230,6 +230,7 @@ class DOUYINClient(AbstractApiClient):
|
||||||
crawl_interval: float = 1.0,
|
crawl_interval: float = 1.0,
|
||||||
is_fetch_sub_comments=False,
|
is_fetch_sub_comments=False,
|
||||||
callback: Optional[Callable] = None,
|
callback: Optional[Callable] = None,
|
||||||
|
max_count: int = 10,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
获取帖子的所有评论,包括子评论
|
获取帖子的所有评论,包括子评论
|
||||||
|
@ -237,18 +238,21 @@ class DOUYINClient(AbstractApiClient):
|
||||||
:param crawl_interval: 抓取间隔
|
:param crawl_interval: 抓取间隔
|
||||||
:param is_fetch_sub_comments: 是否抓取子评论
|
:param is_fetch_sub_comments: 是否抓取子评论
|
||||||
:param callback: 回调函数,用于处理抓取到的评论
|
:param callback: 回调函数,用于处理抓取到的评论
|
||||||
|
:param max_count: 一次帖子爬取的最大评论数量
|
||||||
:return: 评论列表
|
:return: 评论列表
|
||||||
"""
|
"""
|
||||||
result = []
|
result = []
|
||||||
comments_has_more = 1
|
comments_has_more = 1
|
||||||
comments_cursor = 0
|
comments_cursor = 0
|
||||||
while comments_has_more:
|
while comments_has_more and len(result) < max_count:
|
||||||
comments_res = await self.get_aweme_comments(aweme_id, comments_cursor)
|
comments_res = await self.get_aweme_comments(aweme_id, comments_cursor)
|
||||||
comments_has_more = comments_res.get("has_more", 0)
|
comments_has_more = comments_res.get("has_more", 0)
|
||||||
comments_cursor = comments_res.get("cursor", 0)
|
comments_cursor = comments_res.get("cursor", 0)
|
||||||
comments = comments_res.get("comments", [])
|
comments = comments_res.get("comments", [])
|
||||||
if not comments:
|
if not comments:
|
||||||
continue
|
continue
|
||||||
|
if len(result) + len(comments) > max_count:
|
||||||
|
comments = comments[:max_count - len(result)]
|
||||||
result.extend(comments)
|
result.extend(comments)
|
||||||
if callback: # 如果有回调函数,就执行回调函数
|
if callback: # 如果有回调函数,就执行回调函数
|
||||||
await callback(aweme_id, comments)
|
await callback(aweme_id, comments)
|
||||||
|
|
|
@ -179,7 +179,8 @@ class DouYinCrawler(AbstractCrawler):
|
||||||
aweme_id=aweme_id,
|
aweme_id=aweme_id,
|
||||||
crawl_interval=random.random(),
|
crawl_interval=random.random(),
|
||||||
is_fetch_sub_comments=config.ENABLE_GET_SUB_COMMENTS,
|
is_fetch_sub_comments=config.ENABLE_GET_SUB_COMMENTS,
|
||||||
callback=douyin_store.batch_update_dy_aweme_comments
|
callback=douyin_store.batch_update_dy_aweme_comments,
|
||||||
|
max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES
|
||||||
)
|
)
|
||||||
utils.logger.info(
|
utils.logger.info(
|
||||||
f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} comments have all been obtained and filtered ...")
|
f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} comments have all been obtained and filtered ...")
|
||||||
|
|
|
@ -189,27 +189,29 @@ class KuaiShouClient(AbstractApiClient):
|
||||||
photo_id: str,
|
photo_id: str,
|
||||||
crawl_interval: float = 1.0,
|
crawl_interval: float = 1.0,
|
||||||
callback: Optional[Callable] = None,
|
callback: Optional[Callable] = None,
|
||||||
|
max_count: int = 10,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
get video all comments include sub comments
|
get video all comments include sub comments
|
||||||
:param photo_id:
|
:param photo_id:
|
||||||
:param crawl_interval:
|
:param crawl_interval:
|
||||||
:param callback:
|
:param callback:
|
||||||
|
:param max_count:
|
||||||
:return:
|
:return:
|
||||||
"""
|
"""
|
||||||
|
|
||||||
result = []
|
result = []
|
||||||
pcursor = ""
|
pcursor = ""
|
||||||
|
|
||||||
while pcursor != "no_more":
|
while pcursor != "no_more" and len(result) < max_count:
|
||||||
comments_res = await self.get_video_comments(photo_id, pcursor)
|
comments_res = await self.get_video_comments(photo_id, pcursor)
|
||||||
vision_commen_list = comments_res.get("visionCommentList", {})
|
vision_commen_list = comments_res.get("visionCommentList", {})
|
||||||
pcursor = vision_commen_list.get("pcursor", "")
|
pcursor = vision_commen_list.get("pcursor", "")
|
||||||
comments = vision_commen_list.get("rootComments", [])
|
comments = vision_commen_list.get("rootComments", [])
|
||||||
|
if len(result) + len(comments) > max_count:
|
||||||
|
comments = comments[:max_count - len(result)]
|
||||||
if callback: # 如果有回调函数,就执行回调函数
|
if callback: # 如果有回调函数,就执行回调函数
|
||||||
await callback(photo_id, comments)
|
await callback(photo_id, comments)
|
||||||
|
|
||||||
result.extend(comments)
|
result.extend(comments)
|
||||||
await asyncio.sleep(crawl_interval)
|
await asyncio.sleep(crawl_interval)
|
||||||
sub_comments = await self.get_comments_all_sub_comments(
|
sub_comments = await self.get_comments_all_sub_comments(
|
||||||
|
|
|
@ -186,7 +186,8 @@ class KuaishouCrawler(AbstractCrawler):
|
||||||
await self.ks_client.get_video_all_comments(
|
await self.ks_client.get_video_all_comments(
|
||||||
photo_id=video_id,
|
photo_id=video_id,
|
||||||
crawl_interval=random.random(),
|
crawl_interval=random.random(),
|
||||||
callback=kuaishou_store.batch_update_ks_video_comments
|
callback=kuaishou_store.batch_update_ks_video_comments,
|
||||||
|
max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES
|
||||||
)
|
)
|
||||||
except DataFetchError as ex:
|
except DataFetchError as ex:
|
||||||
utils.logger.error(f"[KuaishouCrawler.get_comments] get video_id: {video_id} comment error: {ex}")
|
utils.logger.error(f"[KuaishouCrawler.get_comments] get video_id: {video_id} comment error: {ex}")
|
||||||
|
|
|
@ -204,21 +204,23 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||||
return self._page_extractor.extract_note_detail(page_content)
|
return self._page_extractor.extract_note_detail(page_content)
|
||||||
|
|
||||||
async def get_note_all_comments(self, note_detail: TiebaNote, crawl_interval: float = 1.0,
|
async def get_note_all_comments(self, note_detail: TiebaNote, crawl_interval: float = 1.0,
|
||||||
callback: Optional[Callable] = None) -> List[TiebaComment]:
|
callback: Optional[Callable] = None,
|
||||||
|
max_count: int = 10,
|
||||||
|
) -> List[TiebaComment]:
|
||||||
"""
|
"""
|
||||||
获取指定帖子下的所有一级评论,该方法会一直查找一个帖子下的所有评论信息
|
获取指定帖子下的所有一级评论,该方法会一直查找一个帖子下的所有评论信息
|
||||||
Args:
|
Args:
|
||||||
note_detail: 帖子详情对象
|
note_detail: 帖子详情对象
|
||||||
crawl_interval: 爬取一次笔记的延迟单位(秒)
|
crawl_interval: 爬取一次笔记的延迟单位(秒)
|
||||||
callback: 一次笔记爬取结束后
|
callback: 一次笔记爬取结束后
|
||||||
|
max_count: 一次帖子爬取的最大评论数量
|
||||||
Returns:
|
Returns:
|
||||||
|
|
||||||
"""
|
"""
|
||||||
uri = f"/p/{note_detail.note_id}"
|
uri = f"/p/{note_detail.note_id}"
|
||||||
result: List[TiebaComment] = []
|
result: List[TiebaComment] = []
|
||||||
current_page = 1
|
current_page = 1
|
||||||
while note_detail.total_replay_page >= current_page:
|
while note_detail.total_replay_page >= current_page and len(result) < max_count:
|
||||||
params = {
|
params = {
|
||||||
"pn": current_page
|
"pn": current_page
|
||||||
}
|
}
|
||||||
|
@ -227,6 +229,8 @@ class BaiduTieBaClient(AbstractApiClient):
|
||||||
note_id=note_detail.note_id)
|
note_id=note_detail.note_id)
|
||||||
if not comments:
|
if not comments:
|
||||||
break
|
break
|
||||||
|
if len(result) + len(comments) > max_count:
|
||||||
|
comments = comments[:max_count - len(result)]
|
||||||
if callback:
|
if callback:
|
||||||
await callback(note_detail.note_id, comments)
|
await callback(note_detail.note_id, comments)
|
||||||
result.extend(comments)
|
result.extend(comments)
|
||||||
|
|
|
@ -226,7 +226,8 @@ class TieBaCrawler(AbstractCrawler):
|
||||||
await self.tieba_client.get_note_all_comments(
|
await self.tieba_client.get_note_all_comments(
|
||||||
note_detail=note_detail,
|
note_detail=note_detail,
|
||||||
crawl_interval=random.random(),
|
crawl_interval=random.random(),
|
||||||
callback=tieba_store.batch_update_tieba_note_comments
|
callback=tieba_store.batch_update_tieba_note_comments,
|
||||||
|
max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES
|
||||||
)
|
)
|
||||||
|
|
||||||
async def get_creators_and_notes(self) -> None:
|
async def get_creators_and_notes(self) -> None:
|
||||||
|
|
|
@ -149,23 +149,28 @@ class WeiboClient:
|
||||||
return await self.get(uri, params, headers=headers)
|
return await self.get(uri, params, headers=headers)
|
||||||
|
|
||||||
async def get_note_all_comments(self, note_id: str, crawl_interval: float = 1.0,
|
async def get_note_all_comments(self, note_id: str, crawl_interval: float = 1.0,
|
||||||
callback: Optional[Callable] = None, ):
|
callback: Optional[Callable] = None,
|
||||||
|
max_count: int = 10,
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
get note all comments include sub comments
|
get note all comments include sub comments
|
||||||
:param note_id:
|
:param note_id:
|
||||||
:param crawl_interval:
|
:param crawl_interval:
|
||||||
:param callback:
|
:param callback:
|
||||||
|
:param max_count:
|
||||||
:return:
|
:return:
|
||||||
"""
|
"""
|
||||||
|
|
||||||
result = []
|
result = []
|
||||||
is_end = False
|
is_end = False
|
||||||
max_id = -1
|
max_id = -1
|
||||||
while not is_end:
|
while not is_end and len(result) < max_count:
|
||||||
comments_res = await self.get_note_comments(note_id, max_id)
|
comments_res = await self.get_note_comments(note_id, max_id)
|
||||||
max_id: int = comments_res.get("max_id")
|
max_id: int = comments_res.get("max_id")
|
||||||
comment_list: List[Dict] = comments_res.get("data", [])
|
comment_list: List[Dict] = comments_res.get("data", [])
|
||||||
is_end = max_id == 0
|
is_end = max_id == 0
|
||||||
|
if len(result) + len(comment_list) > max_count:
|
||||||
|
comment_list = comment_list[:max_count - len(result)]
|
||||||
if callback: # 如果有回调函数,就执行回调函数
|
if callback: # 如果有回调函数,就执行回调函数
|
||||||
await callback(note_id, comment_list)
|
await callback(note_id, comment_list)
|
||||||
await asyncio.sleep(crawl_interval)
|
await asyncio.sleep(crawl_interval)
|
||||||
|
|
|
@ -206,7 +206,8 @@ class WeiboCrawler(AbstractCrawler):
|
||||||
await self.wb_client.get_note_all_comments(
|
await self.wb_client.get_note_all_comments(
|
||||||
note_id=note_id,
|
note_id=note_id,
|
||||||
crawl_interval=random.randint(1,3), # 微博对API的限流比较严重,所以延时提高一些
|
crawl_interval=random.randint(1,3), # 微博对API的限流比较严重,所以延时提高一些
|
||||||
callback=weibo_store.batch_update_weibo_note_comments
|
callback=weibo_store.batch_update_weibo_note_comments,
|
||||||
|
max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES
|
||||||
)
|
)
|
||||||
except DataFetchError as ex:
|
except DataFetchError as ex:
|
||||||
utils.logger.error(f"[WeiboCrawler.get_note_comments] get note_id: {note_id} comment error: {ex}")
|
utils.logger.error(f"[WeiboCrawler.get_note_comments] get note_id: {note_id} comment error: {ex}")
|
||||||
|
|
|
@ -296,7 +296,7 @@ class XiaoHongShuClient(AbstractApiClient):
|
||||||
note_id: 笔记ID
|
note_id: 笔记ID
|
||||||
crawl_interval: 爬取一次笔记的延迟单位(秒)
|
crawl_interval: 爬取一次笔记的延迟单位(秒)
|
||||||
callback: 一次笔记爬取结束后
|
callback: 一次笔记爬取结束后
|
||||||
|
max_count: 一次笔记爬取的最大评论数量
|
||||||
Returns:
|
Returns:
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
Loading…
Reference in New Issue