Merge pull request #461 from FloRainRJY/xiaohongshu_comment_number_restrict
feat: xhs增加最大评论数量限制
This commit is contained in:
commit
fa2bcc4181
|
@ -59,6 +59,11 @@ ENABLE_GET_IMAGES = False
|
||||||
# 是否开启爬评论模式, 默认开启爬评论
|
# 是否开启爬评论模式, 默认开启爬评论
|
||||||
ENABLE_GET_COMMENTS = True
|
ENABLE_GET_COMMENTS = True
|
||||||
|
|
||||||
|
# Maximum number of first-level comments to crawl per video/post
|
||||||
|
CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = 10
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# 是否开启爬二级评论模式, 默认不开启爬二级评论
|
# 是否开启爬二级评论模式, 默认不开启爬二级评论
|
||||||
# 老版本项目使用了 db, 则需参考 schema/tables.sql line 287 增加表字段
|
# 老版本项目使用了 db, 则需参考 schema/tables.sql line 287 增加表字段
|
||||||
ENABLE_GET_SUB_COMMENTS = False
|
ENABLE_GET_SUB_COMMENTS = False
|
||||||
|
|
|
@ -288,7 +288,8 @@ class XiaoHongShuClient(AbstractApiClient):
|
||||||
return await self.get(uri, params)
|
return await self.get(uri, params)
|
||||||
|
|
||||||
async def get_note_all_comments(self, note_id: str, crawl_interval: float = 1.0,
|
async def get_note_all_comments(self, note_id: str, crawl_interval: float = 1.0,
|
||||||
callback: Optional[Callable] = None) -> List[Dict]:
|
callback: Optional[Callable] = None,
|
||||||
|
max_count: int = 10) -> List[Dict]:
|
||||||
"""
|
"""
|
||||||
获取指定笔记下的所有一级评论,该方法会一直查找一个帖子下的所有评论信息
|
获取指定笔记下的所有一级评论,该方法会一直查找一个帖子下的所有评论信息
|
||||||
Args:
|
Args:
|
||||||
|
@ -302,7 +303,7 @@ class XiaoHongShuClient(AbstractApiClient):
|
||||||
result = []
|
result = []
|
||||||
comments_has_more = True
|
comments_has_more = True
|
||||||
comments_cursor = ""
|
comments_cursor = ""
|
||||||
while comments_has_more:
|
while comments_has_more and len(result) < max_count:
|
||||||
comments_res = await self.get_note_comments(note_id, comments_cursor)
|
comments_res = await self.get_note_comments(note_id, comments_cursor)
|
||||||
comments_has_more = comments_res.get("has_more", False)
|
comments_has_more = comments_res.get("has_more", False)
|
||||||
comments_cursor = comments_res.get("cursor", "")
|
comments_cursor = comments_res.get("cursor", "")
|
||||||
|
@ -311,6 +312,8 @@ class XiaoHongShuClient(AbstractApiClient):
|
||||||
f"[XiaoHongShuClient.get_note_all_comments] No 'comments' key found in response: {comments_res}")
|
f"[XiaoHongShuClient.get_note_all_comments] No 'comments' key found in response: {comments_res}")
|
||||||
break
|
break
|
||||||
comments = comments_res["comments"]
|
comments = comments_res["comments"]
|
||||||
|
if len(result) + len(comments) > max_count:
|
||||||
|
comments = comments[:max_count - len(result)]
|
||||||
if callback:
|
if callback:
|
||||||
await callback(note_id, comments)
|
await callback(note_id, comments)
|
||||||
await asyncio.sleep(crawl_interval)
|
await asyncio.sleep(crawl_interval)
|
||||||
|
|
|
@ -21,6 +21,7 @@ from tenacity import RetryError
|
||||||
|
|
||||||
import config
|
import config
|
||||||
from base.base_crawler import AbstractCrawler
|
from base.base_crawler import AbstractCrawler
|
||||||
|
from config import CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES
|
||||||
from model.m_xiaohongshu import NoteUrlInfo
|
from model.m_xiaohongshu import NoteUrlInfo
|
||||||
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
|
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
|
||||||
from store import xhs as xhs_store
|
from store import xhs as xhs_store
|
||||||
|
@ -263,7 +264,8 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||||
await self.xhs_client.get_note_all_comments(
|
await self.xhs_client.get_note_all_comments(
|
||||||
note_id=note_id,
|
note_id=note_id,
|
||||||
crawl_interval=random.random(),
|
crawl_interval=random.random(),
|
||||||
callback=xhs_store.batch_update_xhs_note_comments
|
callback=xhs_store.batch_update_xhs_note_comments,
|
||||||
|
max_count=CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES
|
||||||
)
|
)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
|
Loading…
Reference in New Issue