新增B站创作者(UP主)信息爬取

This commit is contained in:
ZhouXSh 2024-07-18 20:11:51 +08:00
parent 548271e537
commit 3b2cc44750
6 changed files with 129 additions and 1 deletions

View File

@ -48,7 +48,7 @@ class AbstractStore(ABC):
# TODO: support all platforms.
# The @abstractmethod decorator below is active, so a concrete store class
# has to override this coroutine before it can be instantiated.
# NOTE(review): an earlier remark here said only xhs was supported and the
# decorator was disabled for that reason - confirm that every store
# subclass now provides store_creator.
@abstractmethod
async def store_creator(self, creator: Dict):
    """Persist one creator record (interface declaration only).

    Args:
        creator: creator item dict.

    Returns:
        None; the base declaration performs no work.
    """

View File

@ -127,6 +127,7 @@ class BilibiliCrawler(AbstractCrawler):
if video_item:
video_id_list.append(video_item.get("View").get("aid"))
await bilibili_store.update_bilibili_video(video_item)
await bilibili_store.update_up_info(video_item)
await self.get_bilibili_video(video_item, semaphore)
page += 1
await self.batch_get_video_comments(video_id_list)

View File

@ -46,6 +46,25 @@ CREATE TABLE `bilibili_video_comment` (
KEY `idx_bilibili_vi_video_i_f22873` (`video_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='B 站视频评论';
-- ----------------------------
-- Table structure for bilibili_up_info
-- ----------------------------
-- Recreates the table from scratch: an existing bilibili_up_info table
-- (and its rows) is dropped before the CREATE below runs.
DROP TABLE IF EXISTS `bilibili_up_info`;
-- Bilibili uploader ("UP") profile records.
CREATE TABLE `bilibili_up_info` (
-- Auto-increment surrogate primary key.
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
-- User ID of the uploader (stored as text, nullable).
`user_id` varchar(64) DEFAULT NULL COMMENT '用户ID',
-- User nickname.
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
-- Avatar image URL.
`avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址',
-- Timestamp at which the record was added (required).
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
-- Timestamp of the last modification of the record (required).
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
-- Follower count.
`total_fans` bigint DEFAULT NULL COMMENT '粉丝数',
-- Total number of likes received.
`total_liked` bigint DEFAULT NULL COMMENT '总获赞数',
-- User level.
`user_rank` int DEFAULT NULL COMMENT '用户等级',
-- Whether the account is an official one (integer flag/code).
`is_official` int DEFAULT NULL COMMENT '是否官号',
PRIMARY KEY (`id`),
-- Plain (non-unique) index on user_id for lookups by uploader.
KEY `idx_bilibili_vi_user_123456` (`user_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='B 站UP主信息';
-- ----------------------------
-- Table structure for douyin_aweme
-- ----------------------------

View File

@ -53,6 +53,24 @@ async def update_bilibili_video(video_item: Dict):
await BiliStoreFactory.create_store().store_content(content_item=save_content_item)
async def update_up_info(video_item: Dict):
    """Extract the uploader (UP) profile from a video detail item and store it.

    The profile is read from ``video_item["Card"]`` (``like_num``) and from
    ``video_item["Card"]["card"]`` (``mid``, ``name``, ``face``, ``fans``,
    ``level_info.current_level``, ``official_verify.type``), then handed to
    the configured store's ``store_creator``.

    Args:
        video_item: video detail dict that is expected to carry a "Card" entry.

    Returns:
        None. Nothing is stored when the item carries no uploader card.
    """
    # Every level of the payload may be missing or null; fall back to empty
    # dicts so a partial response does not raise AttributeError on None.
    card_wrapper: Dict = video_item.get("Card") or {}
    up_card: Dict = card_wrapper.get("card") or {}
    if not up_card:
        # Without the card there is no user id to key the record on, so
        # skip this item instead of storing a row full of None values.
        utils.logger.warning(
            "[store.bilibili.update_up_info] no uploader card in video item, skip")
        return

    level_info: Dict = up_card.get("level_info") or {}
    official_verify: Dict = up_card.get("official_verify") or {}

    saver_up_info = {
        "user_id": str(up_card.get("mid")),
        "nickname": up_card.get("name"),
        "avatar": up_card.get("face"),
        "last_modify_ts": utils.get_current_timestamp(),
        "total_fans": up_card.get("fans"),
        # like_num sits next to "card" on the outer "Card" object.
        "total_liked": card_wrapper.get("like_num"),
        "user_rank": level_info.get("current_level"),
        "is_official": official_verify.get("type"),
    }
    utils.logger.info(
        f"[store.bilibili.update_up_info] bilibili user_id:{up_card.get('mid')}")
    await BiliStoreFactory.create_store().store_creator(creator=saver_up_info)
async def batch_update_bilibili_video_comments(video_id: str, comments: List[Dict]):
if not comments:
return

View File

@ -85,6 +85,17 @@ class BiliCsvStoreImplement(AbstractStore):
"""
await self.save_data_to_csv(save_item=comment_item, store_type="comments")
async def store_creator(self, creator: Dict):
    """
    Write one Bilibili creator record through the CSV saver.
    Args:
        creator: creator item dict

    Returns:
        None
    """
    # Creator rows are saved under the "creators" store type.
    target_store = "creators"
    await self.save_data_to_csv(save_item=creator, store_type=target_store)
class BiliDbStoreImplement(AbstractStore):
async def store_content(self, content_item: Dict):
@ -129,6 +140,27 @@ class BiliDbStoreImplement(AbstractStore):
else:
await update_comment_by_comment_id(comment_id, comment_item=comment_item)
async def store_creator(self, creator: Dict):
    """
    Insert or update one Bilibili creator record in the database.

    The record is looked up by its "user_id"; an existing row is updated,
    otherwise "add_ts" is stamped on the dict and a new row is inserted.
    Args:
        creator: creator item dict (gains an "add_ts" key on first insert)

    Returns:
        None
    """
    from .bilibili_store_sql import (add_new_creator,
                                     query_creator_by_creator_id,
                                     update_creator_by_creator_id)

    up_id = creator.get("user_id")
    existing_row: Dict = await query_creator_by_creator_id(creator_id=up_id)

    # Known creator: refresh the stored row and stop here.
    if existing_row:
        await update_creator_by_creator_id(up_id, creator_item=creator)
        return

    # First sighting: record when the row was added, then insert it.
    creator["add_ts"] = utils.get_current_timestamp()
    await add_new_creator(creator)
class BiliJsonStoreImplement(AbstractStore):
json_store_path: str = "data/bilibili/json"
@ -204,3 +236,14 @@ class BiliJsonStoreImplement(AbstractStore):
"""
await self.save_data_to_json(comment_item, "comments")
async def store_creator(self, creator: Dict):
    """
    Write one Bilibili creator record through the JSON saver.
    Args:
        creator: creator item dict

    Returns:
        None
    """
    # Creator records are saved under the "creators" store type.
    target_store = "creators"
    await self.save_data_to_json(creator, target_store)

View File

@ -100,3 +100,50 @@ async def update_comment_by_comment_id(comment_id: str, comment_item: Dict) -> i
async_db_conn: AsyncMysqlDB = media_crawler_db_var.get()
effect_row: int = await async_db_conn.update_table("bilibili_video_comment", comment_item, "comment_id", comment_id)
return effect_row
async def query_creator_by_creator_id(creator_id: str) -> Dict:
    """
    Look up one uploader (UP) record by user id.
    Args:
        creator_id: value matched against bilibili_up_info.user_id

    Returns:
        The first matching row as a dict, or an empty dict when none matches.
    """
    async_db_conn: AsyncMysqlDB = media_crawler_db_var.get()
    # Bind the id as a query parameter instead of formatting it into the
    # SQL text: the id comes from scraped data, and a quote in it would
    # otherwise break the statement or allow SQL injection.
    # NOTE(review): relies on AsyncMysqlDB.query forwarding extra positional
    # arguments to the driver's execute() - confirm against its definition.
    sql: str = "select * from bilibili_up_info where user_id = %s"
    rows: List[Dict] = await async_db_conn.query(sql, creator_id)
    if rows:
        return rows[0]
    return dict()
async def add_new_creator(creator_item: Dict) -> int:
    """
    Insert a new uploader (UP) record into bilibili_up_info.
    Args:
        creator_item: dict handed to item_to_table as the row to insert

    Returns:
        Whatever item_to_table returns (annotated int; presumably the id of
        the inserted row - confirm against AsyncMysqlDB).
    """
    db: AsyncMysqlDB = media_crawler_db_var.get()
    return await db.item_to_table("bilibili_up_info", creator_item)
async def update_creator_by_creator_id(creator_id: str, creator_item: Dict) -> int:
    """
    Update the uploader (UP) record whose user_id equals creator_id.
    Args:
        creator_id: value matched against the user_id column
        creator_item: dict handed to update_table as the new field values

    Returns:
        Whatever update_table returns (annotated int; presumably the number
        of affected rows - confirm against AsyncMysqlDB).
    """
    db: AsyncMysqlDB = media_crawler_db_var.get()
    table_name, key_column = "bilibili_up_info", "user_id"
    return await db.update_table(table_name, creator_item, key_column, creator_id)