feat: support bilibili creator
This commit is contained in:
parent
645ec729f6
commit
111e08602c
|
@ -22,7 +22,7 @@
|
|||
| 小红书 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| 抖音 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| 快手 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ |
|
||||
| B 站 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
|
||||
| B 站 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| 微博 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ |
|
||||
|
||||
|
||||
|
|
|
@ -3,7 +3,8 @@ PLATFORM = "xhs"
|
|||
KEYWORDS = "python,golang"
|
||||
LOGIN_TYPE = "qrcode" # qrcode or phone or cookie
|
||||
COOKIES = ""
|
||||
SORT_TYPE = "popularity_descending" # 具体值参见media_platform.xxx.field下的枚举值,展示只支持小红书
|
||||
# 具体值参见media_platform.xxx.field下的枚举值,暂时只支持小红书
|
||||
SORT_TYPE = "popularity_descending"
|
||||
CRAWLER_TYPE = "search" # 爬取类型,search(关键词搜索) | detail(帖子详情)| creator(创作者主页数据)
|
||||
|
||||
# 是否开启 IP 代理
|
||||
|
@ -95,3 +96,9 @@ DY_CREATOR_ID_LIST = [
|
|||
"MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE",
|
||||
# ........................
|
||||
]
|
||||
|
||||
# 指定bili创作者ID列表(mid)
|
||||
BILI_CREATOR_ID_LIST = [
|
||||
"20813884",
|
||||
# ........................
|
||||
]
|
||||
|
|
|
@ -208,7 +208,6 @@ class BilibiliClient(AbstractApiClient):
|
|||
if not is_fetch_sub_comments:
|
||||
result.extend(comment_list)
|
||||
continue
|
||||
# todo handle get sub comments
|
||||
return result
|
||||
|
||||
async def get_video_all_level_two_comments(self,
|
||||
|
@ -230,15 +229,15 @@ class BilibiliClient(AbstractApiClient):
|
|||
:return:
|
||||
"""
|
||||
|
||||
pn = 0
|
||||
pn = 1
|
||||
while True:
|
||||
result = await self.get_video_level_two_comments(
|
||||
video_id, level_one_comment_id, 0, ps, order_mode)
|
||||
video_id, level_one_comment_id, pn, ps, order_mode)
|
||||
comment_list: List[Dict] = result.get("replies", [])
|
||||
if callback: # 如果有回调函数,就执行回调函数
|
||||
await callback(video_id, comment_list)
|
||||
await asyncio.sleep(crawl_interval)
|
||||
if (int(result["page"]["count"]) <= (pn+1) * ps):
|
||||
if (int(result["page"]["count"]) <= pn * ps):
|
||||
break
|
||||
|
||||
pn += 1
|
||||
|
@ -268,3 +267,21 @@ class BilibiliClient(AbstractApiClient):
|
|||
}
|
||||
result = await self.get(uri, post_data)
|
||||
return result
|
||||
|
||||
async def get_creator_videos(self, creator_id: str, pn: int, ps: int = 30, order_mode: SearchOrderType = SearchOrderType.LAST_PUBLISH) -> Dict:
|
||||
"""get all videos for a creator
|
||||
:param creator_id: 创作者 ID
|
||||
:param pn: 页数
|
||||
:param ps: 一页视频数
|
||||
:param order_mode: 排序方式
|
||||
|
||||
:return:
|
||||
"""
|
||||
uri = "/x/space/wbi/arc/search"
|
||||
post_data = {
|
||||
"mid": creator_id,
|
||||
"pn": pn,
|
||||
"ps": ps,
|
||||
"order": order_mode,
|
||||
}
|
||||
return await self.get(uri, post_data)
|
||||
|
|
|
@ -75,7 +75,10 @@ class BilibiliCrawler(AbstractCrawler):
|
|||
await self.search()
|
||||
elif config.CRAWLER_TYPE == "detail":
|
||||
# Get the information and comments of the specified post
|
||||
await self.get_specified_videos()
|
||||
await self.get_specified_videos(config.BILI_SPECIFIED_ID_LIST)
|
||||
elif config.CRAWLER_TYPE == "creator":
|
||||
for creator_id in config.BILI_CREATOR_ID_LIST:
|
||||
await self.get_creator_videos(int(creator_id))
|
||||
else:
|
||||
pass
|
||||
utils.logger.info(
|
||||
|
@ -173,7 +176,25 @@ class BilibiliCrawler(AbstractCrawler):
|
|||
utils.logger.error(
|
||||
f"[BilibiliCrawler.get_comments] may be been blocked, err:{e}")
|
||||
|
||||
async def get_specified_videos(self):
|
||||
async def get_creator_videos(self, creator_id: int):
|
||||
"""
|
||||
get videos for a creator
|
||||
:return:
|
||||
"""
|
||||
ps = 30
|
||||
pn = 1
|
||||
video_bvids_list = []
|
||||
while True:
|
||||
result = await self.bili_client.get_creator_videos(creator_id, pn, ps)
|
||||
for video in result["list"]["vlist"]:
|
||||
video_bvids_list.append(video["bvid"])
|
||||
if (int(result["page"]["count"]) <= pn * ps):
|
||||
break
|
||||
await asyncio.sleep(random.random())
|
||||
pn += 1
|
||||
await self.get_specified_videos(video_bvids_list)
|
||||
|
||||
async def get_specified_videos(self, bvids_list: List[str]):
|
||||
"""
|
||||
get specified videos info
|
||||
:return:
|
||||
|
@ -181,7 +202,7 @@ class BilibiliCrawler(AbstractCrawler):
|
|||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||
task_list = [
|
||||
self.get_video_info_task(aid=0, bvid=video_id, semaphore=semaphore) for video_id in
|
||||
config.BILI_SPECIFIED_ID_LIST
|
||||
bvids_list
|
||||
]
|
||||
video_details = await asyncio.gather(*task_list)
|
||||
video_aids_list = []
|
||||
|
|
|
@ -311,4 +311,7 @@ ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
|
|||
ALTER TABLE `douyin_aweme_comment`
|
||||
ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
|
||||
|
||||
ALTER TABLE `bilibili_video_comment`
|
||||
ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
|
||||
|
||||
SET FOREIGN_KEY_CHECKS = 1;
|
||||
|
|
Loading…
Reference in New Issue