From 97d7a0c38bfef92628844aa048c08561e3050a68 Mon Sep 17 00:00:00 2001 From: Relakkes Date: Sat, 9 Dec 2023 21:10:01 +0800 Subject: [PATCH] feat: Bilibili comment done --- README.md | 10 ++-- config/base_config.py | 25 ++++---- media_platform/bilibili/client.py | 32 ++++++----- media_platform/bilibili/core.py | 82 ++++++++++++++++++++++++-- media_platform/bilibili/help.py | 5 +- media_platform/douyin/core.py | 2 +- media_platform/kuaishou/core.py | 2 +- media_platform/xhs/core.py | 2 +- models/__init__.py | 1 + models/bilibili.py | 96 +++++++++++++++++++++++++------ 10 files changed, 199 insertions(+), 58 deletions(-) diff --git a/README.md b/README.md index 15b5697..ed50e92 100644 --- a/README.md +++ b/README.md @@ -20,11 +20,11 @@ ## 功能列表 | 平台 | Cookie 登录 | 二维码登录 | 手机号登录 | 关键词搜索 | 指定视频/帖子 ID 爬取 | 登录状态缓存 | 数据保存 | IP 代理池 | 滑块验证码 | |:---:|:---------:|:-----:|:-----:|:-----:|:-------------:|:------:|:----:|:------:|:-----:| -| 小红书 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✕ | -| 抖音 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| 快手 | ✅ | ✅ | ✕ | ✅ | ✅ | ✅ | ✅ | ✅ | ✕ | -| B 站 | ✅ | ✅ | ✕ | ✅ | ✕ | ✅ | ✅ | ✕ | ✕ | -| 微博 | ✕ | ✕ | ✕ | ✕ | ✕ | ✕ | ✕ | ✕ | ✕ | +| 小红书 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✕ | +| 抖音 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| 快手 | ✅ | ✅ | ✕ | ✅ | ✅ | ✅ | ✅ | ✅ | ✕ | +| B 站 | ✅ | ✅ | ✕ | ✅ | ✅ | ✅ | ✅ | ✅ | ✕ | +| 微博 | ✕ | ✕ | ✕ | ✕ | ✕ | ✕ | ✕ | ✕ | ✕ | ## 使用方法 diff --git a/config/base_config.py b/config/base_config.py index 1493d15..4da697c 100644 --- a/config/base_config.py +++ b/config/base_config.py @@ -40,22 +40,25 @@ DY_COMMENT_KEYWORDS = [ # 指定小红书需要爬虫的笔记ID列表 XHS_SPECIFIED_ID_LIST = [ -"6422c2750000000027000d88", -"64ca1b73000000000b028dd2", -"630d5b85000000001203ab41", -# ........................ + "6422c2750000000027000d88", + "64ca1b73000000000b028dd2", + "630d5b85000000001203ab41", + # ........................ ] - # 指定抖音需要爬取的ID列表 DY_SPECIFIED_ID_LIST = [ -"7280854932641664319", -"7202432992642387233" -# ........................ + "7280854932641664319", + "7202432992642387233" + # ........................ ] - # 指定快手平台需要爬取的ID列表 -KS_SPECIFIED_ID_LIST = [ +KS_SPECIFIED_ID_LIST = [] -] \ No newline at end of file +# 指定B站平台需要爬取的视频ID列表 +BILI_SPECIFIED_ID_LIST = [ + "416252543", + "976148468" + # ........................ +] diff --git a/media_platform/bilibili/client.py b/media_platform/bilibili/client.py index d2d7e89..48cb058 100644 --- a/media_platform/bilibili/client.py +++ b/media_platform/bilibili/client.py @@ -77,9 +77,10 @@ class BilibiliClient: sub_key = sub_url.rsplit('/', 1)[1].split('.')[0] return img_key, sub_key - async def get(self, uri: str, params=None) -> Dict: + async def get(self, uri: str, params=None, enable_params_sign: bool = True) -> Dict: final_uri = uri - params = await self.pre_request_data(params) + if enable_params_sign: + params = await self.pre_request_data(params) if isinstance(params, dict): final_uri = (f"{uri}?" f"{urlencode(params)}") @@ -102,7 +103,7 @@ class BilibiliClient: utils.logger.info("use cache login state get web interface successfull!") ping_flag = True except Exception as e: - utils.logger.error(f"Pong kuaishou failed: {e}, and try to login again...") + utils.logger.error(f"Pong bilibili failed: {e}, and try to login again...") ping_flag = False return ping_flag @@ -133,23 +134,25 @@ class BilibiliClient: async def get_video_info(self, video_id: str) -> Dict: """ - Kuaishou web video detail api + Bilibli web video detail api :param video_id: :return: """ - post_data = { + uri = "/x/web-interface/view/detail" + params = { + "aid": video_id } - return await self.post("", post_data) + return await self.get(uri, params, enable_params_sign=False) async def get_video_comments(self, video_id: str, order_mode: CommentOrderType = CommentOrderType.DEFAULT, - pagination_str: str = '{"offset":""}' + next: int = 0 ) -> Dict: """get video comments :param video_id: 视频 ID :param order_mode: 排序方式 - :param pagination_str: 分页字符串,由 api 返回下一个评论请求的分页信息 + :param next: 评论页选择 :return: """ uri = "/x/v2/reply/wbi/main" @@ -157,9 +160,10 @@ class BilibiliClient: "oid": video_id, "mode": order_mode.value, "type": 1, - "pagination_str": pagination_str + "ps": 20, + "next": next } - return await self.post(uri, post_data) + return await self.get(uri, post_data) async def get_video_all_comments(self, video_id: str, crawl_interval: float = 1.0, is_fetch_sub_comments=False, callback: Optional[Callable] = None, ): @@ -174,13 +178,13 @@ class BilibiliClient: result = [] is_end = False - pagination_str = '{"offset":""}' + next_page =0 while not is_end: - comments_res = await self.get_video_comments(video_id, CommentOrderType.DEFAULT, pagination_str) - curson_info: Dict = comments_res.get("data").get("cursor") + comments_res = await self.get_video_comments(video_id, CommentOrderType.DEFAULT, next_page) + curson_info: Dict = comments_res.get("cursor") comment_list: List[Dict] = comments_res.get("replies", []) is_end = curson_info.get("is_end") - pagination_str = curson_info.get("pagination_reply").get("next_offset") + next_page = curson_info.get("next") if callback: # 如果有回调函数,就执行回调函数 await callback(video_id, comment_list) await asyncio.sleep(crawl_interval) diff --git a/media_platform/bilibili/core.py b/media_platform/bilibili/core.py index 149aa33..ac837c6 100644 --- a/media_platform/bilibili/core.py +++ b/media_platform/bilibili/core.py @@ -16,11 +16,12 @@ from playwright.async_api import (BrowserContext, BrowserType, Page, import config from base.base_crawler import AbstractCrawler from models import bilibili -from proxy.proxy_ip_pool import create_ip_pool, IpInfoModel +from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool from tools import utils from var import comment_tasks_var, crawler_type_var from .client import BilibiliClient +from .exception import DataFetchError from .field import SearchOrderType from .login import BilibiliLogin @@ -107,17 +108,88 @@ class BilibiliCrawler(AbstractCrawler): order=SearchOrderType.DEFAULT, ) video_list: List[Dict] = videos_res.get("result") - for video_item in video_list: - video_id_list.append(video_item.get("id")) - await bilibili.update_bilibili_video(video_item) + + semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) + task_list = [ + self.get_video_info_task(video_item.get("aid"), semaphore) + for video_item in video_list + ] + video_items = await asyncio.gather(*task_list) + for video_item in video_items: + if video_item: + video_id_list.append(video_item.get("View").get("aid")) + await bilibili.update_bilibili_video(video_item) + page += 1 + await self.batch_get_video_comments(video_id_list) + + async def batch_get_video_comments(self, video_id_list: List[str]): + """ + batch get video comments + :param video_id_list: + :return: + """ + utils.logger.info(f"[batch_get_video_comments] video ids:{video_id_list}") + semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) + task_list: List[Task] = [] + for video_id in video_id_list: + task = asyncio.create_task(self.get_comments(video_id, semaphore), name=video_id) + task_list.append(task) + await asyncio.gather(*task_list) + + async def get_comments(self, video_id: str, semaphore: asyncio.Semaphore): + """ + get comment for video id + :param video_id: + :param semaphore: + :return: + """ + async with semaphore: + try: + utils.logger.info(f"[get_comments] bengin get video_id: {video_id} comments ...") + await self.bili_client.get_video_all_comments( + video_id=video_id, + crawl_interval=random.random(), + callback=bilibili.batch_update_bilibili_video_comments + ) + except DataFetchError as ex: + utils.logger.error(f"[get_comments] get video_id: {video_id} comment error: {ex}") + except Exception as e: + utils.logger.error(f"[get_comments] may be been blocked, err:", e) + async def get_specified_videos(self): """ get specified videos info :return: """ - pass + semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) + task_list = [ + self.get_video_info_task(video_id=video_id, semaphore=semaphore) for video_id in config.BILI_SPECIFIED_ID_LIST + ] + video_details = await asyncio.gather(*task_list) + for video_detail in video_details: + if video_detail is not None: + await bilibili.update_bilibili_video(video_detail) + await self.batch_get_video_comments(config.BILI_SPECIFIED_ID_LIST) + + async def get_video_info_task(self, video_id: str, semaphore: asyncio.Semaphore) -> Optional[Dict]: + """ + Get video detail task + :param video_id: + :param semaphore: + :return: + """ + async with semaphore: + try: + result = await self.bili_client.get_video_info(video_id) + return result + except DataFetchError as ex: + utils.logger.error(f"Get video detail error: {ex}") + return None + except KeyError as ex: + utils.logger.error(f"have not fund note detail video_id:{video_id}, err: {ex}") + return None async def create_bilibili_client(self, httpx_proxy: Optional[str]) -> BilibiliClient: """Create xhs client""" diff --git a/media_platform/bilibili/help.py b/media_platform/bilibili/help.py index 6978bfb..c5c52e6 100644 --- a/media_platform/bilibili/help.py +++ b/media_platform/bilibili/help.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- + # -*- coding: utf-8 -*- # @Author : relakkes@gmail.com # @Time : 2023/12/2 23:26 # @Desc : bilibili 请求参数签名 @@ -52,7 +52,6 @@ class BilibiliSign: salt = self.get_salt() wbi_sign = md5((query + salt).encode()).hexdigest() # 计算 w_rid req_data['w_rid'] = wbi_sign - print(urllib.parse.urlencode(req_data)) return req_data @@ -67,5 +66,5 @@ if __name__ == '__main__': value = kvalues[1] _req_data[key] = value print("pre req_data", _req_data) - _req_data = BilibiliSign(img_key=_img_key, sub_key=_sub_key).sign(req_data=_req_data) + _req_data = BilibiliSign(img_key=_img_key, sub_key=_sub_key).sign(req_data={"aid":170001}) print(_req_data) diff --git a/media_platform/douyin/core.py b/media_platform/douyin/core.py index e047d39..0a2432a 100644 --- a/media_platform/douyin/core.py +++ b/media_platform/douyin/core.py @@ -9,7 +9,7 @@ from playwright.async_api import (BrowserContext, BrowserType, Page, import config from base.base_crawler import AbstractCrawler from models import douyin -from proxy.proxy_ip_pool import create_ip_pool, IpInfoModel +from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool from tools import utils from var import crawler_type_var diff --git a/media_platform/kuaishou/core.py b/media_platform/kuaishou/core.py index 78b9f6f..f3f55a2 100644 --- a/media_platform/kuaishou/core.py +++ b/media_platform/kuaishou/core.py @@ -11,7 +11,7 @@ from playwright.async_api import (BrowserContext, BrowserType, Page, import config from base.base_crawler import AbstractCrawler from models import kuaishou -from proxy.proxy_ip_pool import create_ip_pool, IpInfoModel +from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool from tools import utils from var import comment_tasks_var, crawler_type_var diff --git a/media_platform/xhs/core.py b/media_platform/xhs/core.py index c724ae1..bf0246f 100644 --- a/media_platform/xhs/core.py +++ b/media_platform/xhs/core.py @@ -10,7 +10,7 @@ from playwright.async_api import (BrowserContext, BrowserType, Page, import config from base.base_crawler import AbstractCrawler from models import xiaohongshu as xhs_model -from proxy.proxy_ip_pool import create_ip_pool, IpInfoModel +from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool from tools import utils from var import crawler_type_var diff --git a/models/__init__.py b/models/__init__.py index cb9970b..cd278e6 100644 --- a/models/__init__.py +++ b/models/__init__.py @@ -1,3 +1,4 @@ +from .bilibili import * from .douyin import * from .kuaishou import * from .xiaohongshu import * diff --git a/models/bilibili.py b/models/bilibili.py index a4757f3..15cfafb 100644 --- a/models/bilibili.py +++ b/models/bilibili.py @@ -48,28 +48,44 @@ class BilibiliVideo(BilibiliBaseModel): return f"{self.video_id} - {self.title}" +class BilibiliComment(BilibiliBaseModel): + comment_id = fields.CharField(max_length=64, index=True, description="评论ID") + video_id = fields.CharField(max_length=64, index=True, description="视频ID") + content = fields.TextField(null=True, description="评论内容") + create_time = fields.BigIntField(description="评论时间戳") + sub_comment_count = fields.CharField(max_length=16, description="评论回复数") + + class Meta: + table = "bilibili_video_comment" + table_description = "B 站视频评论" + + def __str__(self): + return f"{self.comment_id} - {self.content}" + + async def update_bilibili_video(video_item: Dict): - video_id = video_item.get("id") - if video_item.get("type") != "video": - return + video_item_view: Dict = video_item.get("View") + video_user_info: Dict = video_item_view.get("owner") + video_item_stat: Dict = video_item_view.get("stat") + video_id = str(video_item_view.get("aid")) local_db_item = { "video_id": video_id, - "video_type": str(video_item.get("type")), - "title": video_item.get("title", "")[:500], - "desc": video_item.get("description", "")[:500], - "create_time": video_item.get("pubdate"), - "user_id": video_item.get("mid"), - "nickname": video_item.get("author"), - "avatar": video_item.get("upic", ""), - "liked_count": str(video_item.get("like", "")), - "video_play_count": str(video_item.get("play", "")), - "video_danmaku": str(video_item.get("danmaku", "")), - "video_comment": str(video_item.get("review", "")), + "video_type": "video", + "title": video_item_view.get("title", "")[:500], + "desc": video_item_view.get("desc", "")[:500], + "create_time": video_item_view.get("pubdate"), + "user_id": str(video_user_info.get("mid")), + "nickname": video_user_info.get("name"), + "avatar": video_user_info.get("face", ""), + "liked_count": str(video_item_stat.get("like", "")), + "video_play_count": str(video_item_stat.get("view", "")), + "video_danmaku": str(video_item_stat.get("danmaku", "")), + "video_comment": str(video_item_stat.get("reply", "")), "last_modify_ts": utils.get_current_timestamp(), "video_url": f"https://www.bilibili.com/video/av{video_id}", - "video_cover_url": video_item.get("pic", ""), + "video_cover_url": video_item_view.get("pic", ""), } - print(f"bilibili video id:{video_id}, title:{local_db_item.get('title')}") + utils.logger.info(f"bilibili video id:{video_id}, title:{local_db_item.get('title')}") if config.IS_SAVED_DATABASED: if not await BilibiliVideo.filter(video_id=video_id).exists(): local_db_item["add_ts"] = utils.get_current_timestamp() @@ -91,4 +107,50 @@ async def update_bilibili_video(video_item: Dict): writer = csv.writer(f) if f.tell() == 0: writer.writerow(local_db_item.keys()) - writer.writerow(local_db_item.values()) \ No newline at end of file + writer.writerow(local_db_item.values()) + + +async def batch_update_bilibili_video_comments(video_id: str, comments: List[Dict]): + if not comments: + return + for comment_item in comments: + await update_bilibili_video_comment(video_id, comment_item) + +async def update_bilibili_video_comment(video_id: str, comment_item: Dict): + comment_id = str(comment_item.get("rpid")) + content: Dict = comment_item.get("content") + user_info: Dict = comment_item.get("member") + local_db_item = { + "comment_id": comment_id, + "create_time": comment_item.get("ctime"), + "video_id": video_id, + "content": content.get("message"), + "user_id": user_info.get("mid"), + "nickname": user_info.get("uname"), + "avatar": user_info.get("avatar"), + "sub_comment_count": str(comment_item.get("rcount", 0)), + "last_modify_ts": utils.get_current_timestamp(), + } + utils.logger.info(f"Bilibili video comment: {comment_id}, content: {local_db_item.get('content')}") + if config.IS_SAVED_DATABASED: + if not await BilibiliComment.filter(comment_id=comment_id).exists(): + local_db_item["add_ts"] = utils.get_current_timestamp() + comment_pydantic = pydantic_model_creator(BilibiliComment, name='BilibiliVideoCommentCreate', + exclude=('id',)) + comment_data = comment_pydantic(**local_db_item) + comment_pydantic.validate(comment_data) + await BilibiliComment.create(**comment_data.dict()) + else: + comment_pydantic = pydantic_model_creator(BilibiliComment, name='BilibiliVideoCommentUpdate', + exclude=('id', 'add_ts')) + comment_data = comment_pydantic(**local_db_item) + comment_pydantic.validate(comment_data) + await BilibiliComment.filter(comment_id=comment_id).update(**comment_data.dict()) + else: + pathlib.Path(f"data/bilibili").mkdir(parents=True, exist_ok=True) + save_file_name = f"data/bilibili/{crawler_type_var.get()}_comments_{utils.get_current_date()}.csv" + with open(save_file_name, mode='a+', encoding="utf-8-sig", newline="") as f: + writer = csv.writer(f) + if f.tell() == 0: + writer.writerow(local_db_item.keys()) + writer.writerow(local_db_item.values())