feat: support bilibili creator

This commit is contained in:
nelzomal 2024-06-12 15:18:55 +08:00
parent 645ec729f6
commit 111e08602c
5 changed files with 57 additions and 9 deletions

View File

@ -22,7 +22,7 @@
| 小红书 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | 小红书 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| 抖音 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | 抖音 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| 快手 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | | 快手 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ |
| B 站 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | B 站 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| 微博 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | | 微博 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ |

View File

@ -3,7 +3,8 @@ PLATFORM = "xhs"
KEYWORDS = "python,golang" KEYWORDS = "python,golang"
LOGIN_TYPE = "qrcode" # qrcode or phone or cookie LOGIN_TYPE = "qrcode" # qrcode or phone or cookie
COOKIES = "" COOKIES = ""
SORT_TYPE = "popularity_descending" # 具体值参见media_platform.xxx.field下的枚举值,暂时只支持小红书 # 具体值参见media_platform.xxx.field下的枚举值,暂时只支持小红书
SORT_TYPE = "popularity_descending"
CRAWLER_TYPE = "search" # 爬取类型search(关键词搜索) | detail(帖子详情)| creator(创作者主页数据) CRAWLER_TYPE = "search" # 爬取类型search(关键词搜索) | detail(帖子详情)| creator(创作者主页数据)
# 是否开启 IP 代理 # 是否开启 IP 代理
@ -95,3 +96,9 @@ DY_CREATOR_ID_LIST = [
"MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE", "MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE",
# ........................ # ........................
] ]
# 指定bili创作者ID列表(mid)
BILI_CREATOR_ID_LIST = [
"20813884",
# ........................
]

View File

@ -208,7 +208,6 @@ class BilibiliClient(AbstractApiClient):
if not is_fetch_sub_comments: if not is_fetch_sub_comments:
result.extend(comment_list) result.extend(comment_list)
continue continue
# todo handle get sub comments
return result return result
async def get_video_all_level_two_comments(self, async def get_video_all_level_two_comments(self,
@ -230,15 +229,15 @@ class BilibiliClient(AbstractApiClient):
:return: :return:
""" """
pn = 0 pn = 1
while True: while True:
result = await self.get_video_level_two_comments( result = await self.get_video_level_two_comments(
video_id, level_one_comment_id, 0, ps, order_mode) video_id, level_one_comment_id, pn, ps, order_mode)
comment_list: List[Dict] = result.get("replies", []) comment_list: List[Dict] = result.get("replies", [])
if callback: # 如果有回调函数,就执行回调函数 if callback: # 如果有回调函数,就执行回调函数
await callback(video_id, comment_list) await callback(video_id, comment_list)
await asyncio.sleep(crawl_interval) await asyncio.sleep(crawl_interval)
if (int(result["page"]["count"]) <= (pn+1) * ps): if (int(result["page"]["count"]) <= pn * ps):
break break
pn += 1 pn += 1
@ -268,3 +267,21 @@ class BilibiliClient(AbstractApiClient):
} }
result = await self.get(uri, post_data) result = await self.get(uri, post_data)
return result return result
async def get_creator_videos(
        self,
        creator_id: str,
        pn: int,
        ps: int = 30,
        order_mode: SearchOrderType = SearchOrderType.LAST_PUBLISH,
) -> Dict:
    """Fetch one page of a creator's video list.

    Sends a GET request to ``/x/space/wbi/arc/search``. Callers that want
    every video must advance ``pn`` themselves; this method requests a
    single page only.

    :param creator_id: creator ID, sent as the ``mid`` query parameter
    :param pn: page number to request
    :param ps: number of videos per page
    :param order_mode: sort order, sent as the ``order`` query parameter
    :return: the result of ``self.get`` for this request
    """
    # Same keys, same insertion order as the request has always used.
    query = dict(mid=creator_id, pn=pn, ps=ps, order=order_mode)
    return await self.get("/x/space/wbi/arc/search", query)

View File

@ -75,7 +75,10 @@ class BilibiliCrawler(AbstractCrawler):
await self.search() await self.search()
elif config.CRAWLER_TYPE == "detail": elif config.CRAWLER_TYPE == "detail":
# Get the information and comments of the specified post # Get the information and comments of the specified post
await self.get_specified_videos() await self.get_specified_videos(config.BILI_SPECIFIED_ID_LIST)
elif config.CRAWLER_TYPE == "creator":
for creator_id in config.BILI_CREATOR_ID_LIST:
await self.get_creator_videos(int(creator_id))
else: else:
pass pass
utils.logger.info( utils.logger.info(
@ -173,7 +176,25 @@ class BilibiliCrawler(AbstractCrawler):
utils.logger.error( utils.logger.error(
f"[BilibiliCrawler.get_comments] may be been blocked, err:{e}") f"[BilibiliCrawler.get_comments] may be been blocked, err:{e}")
async def get_specified_videos(self): async def get_creator_videos(self, creator_id: int):
"""
Collect the bvid of every video listed for one creator, page by page,
then hand the collected bvids to get_specified_videos.
:param creator_id: creator ID forwarded to bili_client.get_creator_videos
:return:
"""
# page size requested from the client on every call
ps = 30
# paging starts at page 1
pn = 1
video_bvids_list = []
while True:
result = await self.bili_client.get_creator_videos(creator_id, pn, ps)
for video in result["list"]["vlist"]:
video_bvids_list.append(video["bvid"])
# stop once the pages requested so far cover the reported total count
if (int(result["page"]["count"]) <= pn * ps):
break
# random pause of under one second before requesting the next page
await asyncio.sleep(random.random())
pn += 1
await self.get_specified_videos(video_bvids_list)
async def get_specified_videos(self, bvids_list: List[str]):
""" """
get specified videos info get specified videos info
:return: :return:
@ -181,7 +202,7 @@ class BilibiliCrawler(AbstractCrawler):
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list = [ task_list = [
self.get_video_info_task(aid=0, bvid=video_id, semaphore=semaphore) for video_id in self.get_video_info_task(aid=0, bvid=video_id, semaphore=semaphore) for video_id in
config.BILI_SPECIFIED_ID_LIST bvids_list
] ]
video_details = await asyncio.gather(*task_list) video_details = await asyncio.gather(*task_list)
video_aids_list = [] video_aids_list = []

View File

@ -311,4 +311,7 @@ ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
ALTER TABLE `douyin_aweme_comment` ALTER TABLE `douyin_aweme_comment`
ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID'; ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
ALTER TABLE `bilibili_video_comment`
ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
SET FOREIGN_KEY_CHECKS = 1; SET FOREIGN_KEY_CHECKS = 1;