From d686d17f9b7fc598a3ccbf88f1492b4a08e1e154 Mon Sep 17 00:00:00 2001
From: helloteemo
Date: Fri, 12 Jul 2024 20:09:16 +0800
Subject: [PATCH] =?UTF-8?q?feat:=20=E6=94=AF=E6=8C=81bilibili=E8=A7=86?=
 =?UTF-8?q?=E9=A2=91=E4=B8=8B=E8=BD=BD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 media_platform/bilibili/client.py       | 30 ++++++++++++
 media_platform/bilibili/core.py         | 61 ++++++++++++++++++++++++-
 store/bilibili/__init__.py              | 14 +++++-
 store/bilibili/bilibilli_store_video.py | 53 +++++++++++++++++++++
 4 files changed, 156 insertions(+), 2 deletions(-)
 create mode 100644 store/bilibili/bilibilli_store_video.py

diff --git a/media_platform/bilibili/client.py b/media_platform/bilibili/client.py
index 5c13e03..76fd520 100644
--- a/media_platform/bilibili/client.py
+++ b/media_platform/bilibili/client.py
@@ -153,6 +153,36 @@ class BilibiliClient(AbstractApiClient):
             params.update({"bvid": bvid})
         return await self.get(uri, params, enable_params_sign=False)
 
+    async def get_video_play_url(self, aid: int, cid: int) -> Dict:
+        """
+        Request the bilibili web play-url API (/x/player/wbi/playurl) for one video.
+        :param aid: avid of the submission, sent as the "avid" query parameter
+        :param cid: cid of the video, sent as the "cid" query parameter
+        :return: result of self.get() with params signing enabled; raises ValueError if aid or cid is missing or <= 0
+        """
+        if not aid or not cid or aid <= 0 or cid <= 0:
+            raise ValueError("aid 和 cid 必须存在")
+        uri = "/x/player/wbi/playurl"
+        params = {
+            "avid": aid,
+            "cid": cid,
+            "qn": 80,
+            "fourk": 1,
+            "fnval": 1,
+            "platform": "pc",
+        }
+
+        return await self.get(uri, params, enable_params_sign=True)
+
+    async def get_video_media(self, url: str) -> Union[bytes, None]:
+        """Download url and return the response body, or None (after logging) on a non-200 status."""
+        async with httpx.AsyncClient(proxies=self.proxies) as client:
+            response = await client.request("GET", url, timeout=self.timeout, headers=self.headers)
+            if response.status_code != 200:  # the reason phrase is free text, so test the numeric code
+                utils.logger.error(f"[BilibiliClient.get_video_media] request {url} err, res:{response.text}")
+                return None
+            return response.content
+
     async def get_video_comments(self,
                                  video_id: str,
                                  order_mode: CommentOrderType = CommentOrderType.DEFAULT,
diff --git a/media_platform/bilibili/core.py b/media_platform/bilibili/core.py
index 1d08791..641a9d6 100644
--- a/media_platform/bilibili/core.py
+++ b/media_platform/bilibili/core.py
@@ -7,7 +7,7 @@ import asyncio
 import os
 import random
 from asyncio import Task
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Optional, Tuple, Union
 
 from playwright.async_api import (BrowserContext, BrowserType, Page,
                                   async_playwright)
@@ -127,6 +127,7 @@ class BilibiliCrawler(AbstractCrawler):
                     if video_item:
                         video_id_list.append(video_item.get("View").get("aid"))
                         await bilibili_store.update_bilibili_video(video_item)
+                        await self.get_bilibili_video(video_item, semaphore)
                 page += 1
                 await self.batch_get_video_comments(video_id_list)
 
@@ -213,6 +214,7 @@ class BilibiliCrawler(AbstractCrawler):
                 if video_aid:
                     video_aids_list.append(video_aid)
                 await bilibili_store.update_bilibili_video(video_detail)
+                await self.get_bilibili_video(video_detail, semaphore)
         await self.batch_get_video_comments(video_aids_list)
 
     async def get_video_info_task(self, aid: int, bvid: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
@@ -236,6 +238,27 @@ class BilibiliCrawler(AbstractCrawler):
                     f"[BilibiliCrawler.get_video_info_task] have not fund note detail video_id:{bvid}, err: {ex}")
                 return None
 
+    async def get_video_play_url_task(self, aid: int, cid: int, semaphore: asyncio.Semaphore) -> Union[Dict, None]:
+        """
+        Fetch the play-url payload for one video while holding the semaphore.
+        :param aid: video aid forwarded to bili_client.get_video_play_url
+        :param cid: video cid forwarded to bili_client.get_video_play_url
+        :param semaphore: held for the duration of the request
+        :return: the client result, or None when DataFetchError or KeyError is raised
+        """
+        async with semaphore:
+            try:
+                result = await self.bili_client.get_video_play_url(aid=aid, cid=cid)
+                return result
+            except DataFetchError as ex:
+                utils.logger.error(
+                    f"[BilibiliCrawler.get_video_play_url_task] Get video play url error: {ex}")
+                return None
+            except KeyError as ex:
+                utils.logger.error(
+                    f"[BilibiliCrawler.get_video_play_url_task] have not fund play url from :{aid}|{cid}, err: {ex}")
+                return None
+
     async def create_bilibili_client(self, httpx_proxy: Optional[str]) -> BilibiliClient:
         """Create xhs client"""
         utils.logger.info(
@@ -300,3 +323,39 @@ class BilibiliCrawler(AbstractCrawler):
                 user_agent=user_agent
             )
             return browser_context
+
+    async def get_bilibili_video(self, video_item: Dict, semaphore: asyncio.Semaphore):
+        """
+        Download the largest "durl" entry of a video and hand it to bilibili_store.store_video.
+        :param video_item: video detail dict; aid and cid are read from its "View" entry
+        :param semaphore: concurrency limiter passed on to get_video_play_url_task
+        :return: None; returns early when the feature is off or no play url / content is obtained
+        """
+        if not config.ENABLE_GET_IMAGES:
+            utils.logger.info(f"[BilibiliCrawler.get_bilibili_video] Crawling image mode is not enabled")
+            return
+        video_item_view: Dict = video_item.get("View")
+        aid = video_item_view.get("aid")
+        cid = video_item_view.get("cid")
+        result = await self.get_video_play_url_task(aid, cid, semaphore)
+        if result is None:
+            utils.logger.info("[BilibiliCrawler.get_bilibili_video] get video play url failed")
+            return
+        durl_list = result.get("durl") or []  # a missing "durl" falls through to the failure log below
+        max_size = -1
+        video_url = ""
+        for durl in durl_list:
+            size = durl.get("size") or 0  # a missing size must not break the comparison
+            if size > max_size:
+                max_size = size
+                video_url = durl.get("url")
+        if video_url == "":
+            utils.logger.info("[BilibiliCrawler.get_bilibili_video] get video url failed")
+            return
+
+        content = await self.bili_client.get_video_media(video_url)
+        if content is None:
+            return
+        extension_file_name = "video.mp4"
+        await bilibili_store.store_video(aid, content, extension_file_name)
+
diff --git a/store/bilibili/__init__.py b/store/bilibili/__init__.py
index a1fe0f4..9099bf6 100644
--- a/store/bilibili/__init__.py
+++ b/store/bilibili/__init__.py
@@ -8,7 +8,7 @@ from typing import List
 import config
 
 from .bilibili_store_impl import *
-
+from .bilibilli_store_video import *
 
 class BiliStoreFactory:
     STORES = {
@@ -80,3 +80,15 @@ async def update_bilibili_video_comment(video_id: str, comment_item: Dict):
     utils.logger.info(
         f"[store.bilibili.update_bilibili_video_comment] Bilibili video comment: {comment_id}, content: {save_comment_item.get('content')}")
     await BiliStoreFactory.create_store().store_comment(comment_item=save_comment_item)
+
+
+async def store_video(aid, video_content, extension_file_name):
+    """
+    Persist downloaded video content through BilibiliVideo.store_video.
+    Args:
+        aid: video aid, used by the store as the sub-directory name
+        video_content: content written to the file (bytes from the downloader)
+        extension_file_name: name of the file created inside the aid directory
+    """
+    await BilibiliVideo().store_video(
+        {"aid": aid, "video_content": video_content, "extension_file_name": extension_file_name})
diff --git a/store/bilibili/bilibilli_store_video.py b/store/bilibili/bilibilli_store_video.py
new file mode 100644
index 0000000..c2837e2
--- /dev/null
+++ b/store/bilibili/bilibilli_store_video.py
@@ -0,0 +1,53 @@
+# -*- coding: utf-8 -*-
+# @Author : helloteemo
+# @Time : 2024/7/12 20:01
+# @Desc : bilibili video storage
+import pathlib
+from typing import Dict
+
+import aiofiles
+
+from base.base_crawler import AbstractStoreImage
+from tools import utils
+
+
+class BilibiliVideo(AbstractStoreImage):
+    video_store_path: str = "data/bilibili/videos"
+
+    async def store_video(self, video_content_item: Dict):
+        """
+        Unpack a video content item and save it to disk.
+        Args:
+            video_content_item: dict with "aid", "video_content" and "extension_file_name"
+
+        Returns:
+            None
+        """
+        await self.save_video(video_content_item.get("aid"), video_content_item.get("video_content"),
+                              video_content_item.get("extension_file_name"))
+
+    def make_save_file_name(self, aid: str, extension_file_name: str) -> str:
+        """
+        Build the save path {video_store_path}/{aid}/{extension_file_name}.
+        Args:
+            aid: video aid, used as the sub-directory name
+            extension_file_name: file name placed inside that directory
+        Returns: the path as a string
+        """
+        return f"{self.video_store_path}/{aid}/{extension_file_name}"
+
+    async def save_video(self, aid: int, video_content: bytes, extension_file_name="mp4"):
+        """
+        Write video content to {video_store_path}/{aid}/{extension_file_name}.
+        Args:
+            aid: video aid; its directory is created if missing
+            video_content: raw bytes, written in binary mode
+            extension_file_name: file name inside the aid directory
+        Returns:
+            None
+        """
+        pathlib.Path(self.video_store_path, str(aid)).mkdir(parents=True, exist_ok=True)
+        save_file_name = self.make_save_file_name(str(aid), extension_file_name)
+        async with aiofiles.open(save_file_name, 'wb') as f:
+            await f.write(video_content)
+            utils.logger.info(f"[BilibiliVideoImplement.save_video] save save_video {save_file_name} success ...")