feat: support bilibili creator
This commit is contained in:
parent
645ec729f6
commit
111e08602c
|
@ -22,7 +22,7 @@
|
||||||
| 小红书 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
| 小红书 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||||
| 抖音 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
| 抖音 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||||
| 快手 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ |
|
| 快手 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ |
|
||||||
| B 站 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
|
| B 站 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||||
| 微博 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ |
|
| 微博 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ |
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,8 @@ PLATFORM = "xhs"
|
||||||
KEYWORDS = "python,golang"
|
KEYWORDS = "python,golang"
|
||||||
LOGIN_TYPE = "qrcode" # qrcode or phone or cookie
|
LOGIN_TYPE = "qrcode" # qrcode or phone or cookie
|
||||||
COOKIES = ""
|
COOKIES = ""
|
||||||
SORT_TYPE = "popularity_descending" # 具体值参见media_platform.xxx.field下的枚举值,展示只支持小红书
|
# See the enum values under media_platform.xxx.field for the accepted values; currently only Xiaohongshu (xhs) is supported
|
||||||
|
SORT_TYPE = "popularity_descending"
|
||||||
CRAWLER_TYPE = "search" # 爬取类型,search(关键词搜索) | detail(帖子详情)| creator(创作者主页数据)
|
CRAWLER_TYPE = "search" # 爬取类型,search(关键词搜索) | detail(帖子详情)| creator(创作者主页数据)
|
||||||
|
|
||||||
# 是否开启 IP 代理
|
# 是否开启 IP 代理
|
||||||
|
@ -95,3 +96,9 @@ DY_CREATOR_ID_LIST = [
|
||||||
"MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE",
|
"MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE",
|
||||||
# ........................
|
# ........................
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# List of Bilibili creator IDs to crawl (the creator's numeric UID, sent to the API as "mid")
|
||||||
|
BILI_CREATOR_ID_LIST = [
|
||||||
|
"20813884",
|
||||||
|
# ........................
|
||||||
|
]
|
||||||
|
|
|
@ -208,7 +208,6 @@ class BilibiliClient(AbstractApiClient):
|
||||||
if not is_fetch_sub_comments:
|
if not is_fetch_sub_comments:
|
||||||
result.extend(comment_list)
|
result.extend(comment_list)
|
||||||
continue
|
continue
|
||||||
# todo handle get sub comments
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
async def get_video_all_level_two_comments(self,
|
async def get_video_all_level_two_comments(self,
|
||||||
|
@ -230,15 +229,15 @@ class BilibiliClient(AbstractApiClient):
|
||||||
:return:
|
:return:
|
||||||
"""
|
"""
|
||||||
|
|
||||||
pn = 0
|
pn = 1
|
||||||
while True:
|
while True:
|
||||||
result = await self.get_video_level_two_comments(
|
result = await self.get_video_level_two_comments(
|
||||||
video_id, level_one_comment_id, 0, ps, order_mode)
|
video_id, level_one_comment_id, pn, ps, order_mode)
|
||||||
comment_list: List[Dict] = result.get("replies", [])
|
comment_list: List[Dict] = result.get("replies", [])
|
||||||
if callback: # 如果有回调函数,就执行回调函数
|
if callback: # 如果有回调函数,就执行回调函数
|
||||||
await callback(video_id, comment_list)
|
await callback(video_id, comment_list)
|
||||||
await asyncio.sleep(crawl_interval)
|
await asyncio.sleep(crawl_interval)
|
||||||
if (int(result["page"]["count"]) <= (pn+1) * ps):
|
if (int(result["page"]["count"]) <= pn * ps):
|
||||||
break
|
break
|
||||||
|
|
||||||
pn += 1
|
pn += 1
|
||||||
|
@ -268,3 +267,21 @@ class BilibiliClient(AbstractApiClient):
|
||||||
}
|
}
|
||||||
result = await self.get(uri, post_data)
|
result = await self.get(uri, post_data)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
async def get_creator_videos(self, creator_id: str, pn: int, ps: int = 30,
                             order_mode: SearchOrderType = SearchOrderType.LAST_PUBLISH) -> Dict:
    """Fetch one page of a creator's videos.

    :param creator_id: creator ID, sent as the ``mid`` query parameter
    :param pn: page number
    :param ps: number of videos per page
    :param order_mode: sort order; either a ``SearchOrderType`` member or its raw value

    :return: whatever ``self.get`` returns for ``/x/space/wbi/arc/search``
    """
    uri = "/x/space/wbi/arc/search"
    query_params = {
        "mid": creator_id,
        "pn": pn,
        "ps": ps,
        # Send the enum's underlying value, not the member object: str() of a plain
        # Enum member is "SearchOrderType.LAST_PUBLISH", which is not a usable query
        # value. Objects without ``.value`` (e.g. a raw string) pass through unchanged.
        "order": getattr(order_mode, "value", order_mode),
    }
    return await self.get(uri, query_params)
|
||||||
|
|
|
@ -75,7 +75,10 @@ class BilibiliCrawler(AbstractCrawler):
|
||||||
await self.search()
|
await self.search()
|
||||||
elif config.CRAWLER_TYPE == "detail":
|
elif config.CRAWLER_TYPE == "detail":
|
||||||
# Get the information and comments of the specified post
|
# Get the information and comments of the specified post
|
||||||
await self.get_specified_videos()
|
await self.get_specified_videos(config.BILI_SPECIFIED_ID_LIST)
|
||||||
|
elif config.CRAWLER_TYPE == "creator":
|
||||||
|
for creator_id in config.BILI_CREATOR_ID_LIST:
|
||||||
|
await self.get_creator_videos(int(creator_id))
|
||||||
else:
|
else:
|
||||||
pass
|
pass
|
||||||
utils.logger.info(
|
utils.logger.info(
|
||||||
|
@ -173,7 +176,25 @@ class BilibiliCrawler(AbstractCrawler):
|
||||||
utils.logger.error(
|
utils.logger.error(
|
||||||
f"[BilibiliCrawler.get_comments] may be been blocked, err:{e}")
|
f"[BilibiliCrawler.get_comments] may be been blocked, err:{e}")
|
||||||
|
|
||||||
async def get_specified_videos(self):
|
async def get_creator_videos(self, creator_id: int):
    """
    Collect all videos of one creator and crawl them as specified videos.

    Requests the creator's video listing page by page from ``self.bili_client``
    (30 per page, starting at page 1), gathers each video's ``bvid`` and then
    hands the complete list to ``self.get_specified_videos``.

    :param creator_id: creator ID forwarded to ``self.bili_client.get_creator_videos``
    :return: None
    """
    ps = 30  # page size requested from the client
    pn = 1   # first page requested
    video_bvids_list = []
    while True:
        result = await self.bili_client.get_creator_videos(creator_id, pn, ps) or {}
        # Tolerate a missing or null "list"/"vlist" instead of raising KeyError/TypeError.
        page_videos = (result.get("list") or {}).get("vlist") or []
        video_bvids_list.extend(video["bvid"] for video in page_videos)
        total = int((result.get("page") or {}).get("count") or 0)
        # Also stop on an empty page: if the reported total is never covered,
        # the count check alone would keep requesting pages forever.
        if not page_videos or total <= pn * ps:
            break
        await asyncio.sleep(random.random())  # random sub-second pause between page requests
        pn += 1
    await self.get_specified_videos(video_bvids_list)
|
||||||
|
|
||||||
|
async def get_specified_videos(self, bvids_list: List[str]):
|
||||||
"""
|
"""
|
||||||
get specified videos info
|
get specified videos info
|
||||||
:return:
|
:return:
|
||||||
|
@ -181,7 +202,7 @@ class BilibiliCrawler(AbstractCrawler):
|
||||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||||
task_list = [
|
task_list = [
|
||||||
self.get_video_info_task(aid=0, bvid=video_id, semaphore=semaphore) for video_id in
|
self.get_video_info_task(aid=0, bvid=video_id, semaphore=semaphore) for video_id in
|
||||||
config.BILI_SPECIFIED_ID_LIST
|
bvids_list
|
||||||
]
|
]
|
||||||
video_details = await asyncio.gather(*task_list)
|
video_details = await asyncio.gather(*task_list)
|
||||||
video_aids_list = []
|
video_aids_list = []
|
||||||
|
|
|
@ -311,4 +311,7 @@ ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
|
||||||
ALTER TABLE `douyin_aweme_comment`
|
ALTER TABLE `douyin_aweme_comment`
|
||||||
ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
|
ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
|
||||||
|
|
||||||
|
ALTER TABLE `bilibili_video_comment`
|
||||||
|
ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
|
||||||
|
|
||||||
SET FOREIGN_KEY_CHECKS = 1;
|
SET FOREIGN_KEY_CHECKS = 1;
|
||||||
|
|
Loading…
Reference in New Issue