feat: 支持bilibili视频下载

This commit is contained in:
helloteemo 2024-07-12 20:09:16 +08:00
parent e26c2aaf2a
commit d686d17f9b
4 changed files with 156 additions and 2 deletions

View File

@ -153,6 +153,36 @@ class BilibiliClient(AbstractApiClient):
params.update({"bvid": bvid}) params.update({"bvid": bvid})
return await self.get(uri, params, enable_params_sign=False) return await self.get(uri, params, enable_params_sign=False)
async def get_video_play_url(self, aid: int, cid: int) -> Dict:
    """
    Bilibili web video play url api.

    :param aid: avid of the video submission
    :param cid: cid of the video
    :return: the result of the signed GET request against the playurl endpoint
    :raises ValueError: when ``aid`` or ``cid`` is missing or not positive
    """
    # Both ids must be present before they can be compared against zero.
    ids_present = aid and cid
    if not ids_present or aid <= 0 or cid <= 0:
        raise ValueError("aid 和 cid 必须存在")

    # NOTE(review): qn / fourk / fnval are fixed request options; their exact
    # meaning (quality, 4K permission, stream format) should be confirmed
    # against the bilibili playurl API notes.
    query = dict(avid=aid, cid=cid, qn=80, fourk=1, fnval=1, platform="pc")
    return await self.get("/x/player/wbi/playurl", query, enable_params_sign=True)
async def get_video_media(self, url: str) -> Union[bytes, None]:
    """
    Download the media behind ``url`` and return its raw bytes.

    A dedicated ``httpx.AsyncClient`` is opened for the request, using this
    client's proxies, timeout and headers.

    :param url: media url to fetch
    :return: the response body, or None when the response status is not 200
    """
    async with httpx.AsyncClient(proxies=self.proxies) as client:
        response = await client.request("GET", url, timeout=self.timeout, headers=self.headers)
        # Decide on the numeric status code. The reason phrase is free text
        # chosen by the server, so comparing it to "OK" can reject a valid
        # 200 response.
        if response.status_code != 200:
            utils.logger.error(f"[BilibiliClient.get_video_media] request {url} err, res:{response.text}")
            return None
        return response.content
async def get_video_comments(self, async def get_video_comments(self,
video_id: str, video_id: str,
order_mode: CommentOrderType = CommentOrderType.DEFAULT, order_mode: CommentOrderType = CommentOrderType.DEFAULT,

View File

@ -7,7 +7,7 @@ import asyncio
import os import os
import random import random
from asyncio import Task from asyncio import Task
from typing import Dict, List, Optional, Tuple from typing import Dict, List, Optional, Tuple, Union
from playwright.async_api import (BrowserContext, BrowserType, Page, from playwright.async_api import (BrowserContext, BrowserType, Page,
async_playwright) async_playwright)
@ -127,6 +127,7 @@ class BilibiliCrawler(AbstractCrawler):
if video_item: if video_item:
video_id_list.append(video_item.get("View").get("aid")) video_id_list.append(video_item.get("View").get("aid"))
await bilibili_store.update_bilibili_video(video_item) await bilibili_store.update_bilibili_video(video_item)
await self.get_bilibili_video(video_item, semaphore)
page += 1 page += 1
await self.batch_get_video_comments(video_id_list) await self.batch_get_video_comments(video_id_list)
@ -213,6 +214,7 @@ class BilibiliCrawler(AbstractCrawler):
if video_aid: if video_aid:
video_aids_list.append(video_aid) video_aids_list.append(video_aid)
await bilibili_store.update_bilibili_video(video_detail) await bilibili_store.update_bilibili_video(video_detail)
await self.get_bilibili_video(video_detail, semaphore)
await self.batch_get_video_comments(video_aids_list) await self.batch_get_video_comments(video_aids_list)
async def get_video_info_task(self, aid: int, bvid: str, semaphore: asyncio.Semaphore) -> Optional[Dict]: async def get_video_info_task(self, aid: int, bvid: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
@ -236,6 +238,27 @@ class BilibiliCrawler(AbstractCrawler):
f"[BilibiliCrawler.get_video_info_task] have not fund note detail video_id:{bvid}, err: {ex}") f"[BilibiliCrawler.get_video_info_task] have not fund note detail video_id:{bvid}, err: {ex}")
return None return None
async def get_video_play_url_task(self, aid: int, cid: int, semaphore: asyncio.Semaphore) -> Union[Dict, None]:
"""
Get video play url
:param aid:
:param cid:
:param semaphore:
:return:
"""
async with semaphore:
try:
result = await self.bili_client.get_video_play_url(aid=aid, cid=cid)
return result
except DataFetchError as ex:
utils.logger.error(
f"[BilibiliCrawler.get_video_play_url_task] Get video play url error: {ex}")
return None
except KeyError as ex:
utils.logger.error(
f"[BilibiliCrawler.get_video_play_url_task] have not fund play url from :{aid}|{cid}, err: {ex}")
return None
async def create_bilibili_client(self, httpx_proxy: Optional[str]) -> BilibiliClient: async def create_bilibili_client(self, httpx_proxy: Optional[str]) -> BilibiliClient:
"""Create xhs client""" """Create xhs client"""
utils.logger.info( utils.logger.info(
@ -300,3 +323,39 @@ class BilibiliCrawler(AbstractCrawler):
user_agent=user_agent user_agent=user_agent
) )
return browser_context return browser_context
async def get_bilibili_video(self, video_item: Dict, semaphore: asyncio.Semaphore):
    """
    Download the video described by ``video_item`` and pass it to the store.

    Does nothing unless ``config.ENABLE_GET_IMAGES`` is set. Every failure
    along the way (missing ids, no play url, no usable ``durl`` entry,
    failed download) is logged or skipped and the method returns without
    raising.

    :param video_item: video detail dict; the ids are read from its "View" entry
    :param semaphore: semaphore forwarded to the play url request
    :return: None
    """
    if not config.ENABLE_GET_IMAGES:
        utils.logger.info(f"[BilibiliCrawler.get_bilibili_video] Crawling image mode is not enabled")
        return

    # "View" may be absent; fall back to an empty dict so the lookups below
    # yield None instead of raising AttributeError.
    video_item_view: Dict = video_item.get("View") or {}
    aid = video_item_view.get("aid")
    cid = video_item_view.get("cid")
    if not aid or not cid:
        # The client raises ValueError for missing ids, which the play url
        # task does not catch, so stop here instead of aborting the crawl.
        utils.logger.info("[BilibiliCrawler.get_bilibili_video] video item has no aid or cid, skip download")
        return

    result = await self.get_video_play_url_task(aid, cid, semaphore)
    if result is None:
        utils.logger.info("[BilibiliCrawler.get_bilibili_video] get video play url failed")
        return

    # "durl" can be missing, and only entries that carry a url are usable.
    candidates = [durl for durl in (result.get("durl") or []) if durl.get("url")]
    if not candidates:
        utils.logger.info("[BilibiliCrawler.get_bilibili_video] get video url failed")
        return
    # Pick the largest entry; a missing size counts as 0 so the comparison
    # never mixes None with int. On ties max() keeps the first entry.
    video_url = max(candidates, key=lambda durl: durl.get("size") or 0).get("url")

    content = await self.bili_client.get_video_media(video_url)
    if content is None:
        return
    extension_file_name = "video.mp4"
    await bilibili_store.store_video(aid, content, extension_file_name)

View File

@ -8,7 +8,7 @@ from typing import List
import config import config
from .bilibili_store_impl import * from .bilibili_store_impl import *
from .bilibilli_store_video import *
class BiliStoreFactory: class BiliStoreFactory:
STORES = { STORES = {
@ -80,3 +80,15 @@ async def update_bilibili_video_comment(video_id: str, comment_item: Dict):
utils.logger.info( utils.logger.info(
f"[store.bilibili.update_bilibili_video_comment] Bilibili video comment: {comment_id}, content: {save_comment_item.get('content')}") f"[store.bilibili.update_bilibili_video_comment] Bilibili video comment: {comment_id}, content: {save_comment_item.get('content')}")
await BiliStoreFactory.create_store().store_comment(comment_item=save_comment_item) await BiliStoreFactory.create_store().store_comment(comment_item=save_comment_item)
async def store_video(aid, video_content, extension_file_name):
    """
    Persist one downloaded video through a ``BilibiliVideo`` store.

    Args:
        aid: aid of the video the content belongs to
        video_content: the downloaded video content
        extension_file_name: file name the content is saved under
    """
    video_store = BilibiliVideo()
    # The store receives the three values bundled in a single dict.
    video_content_item = {
        "aid": aid,
        "video_content": video_content,
        "extension_file_name": extension_file_name,
    }
    await video_store.store_video(video_content_item)

View File

@ -0,0 +1,53 @@
# -*- coding: utf-8 -*-
# @Author : helloteemo
# @Time : 2024/7/12 20:01
# @Desc : bilibili video storage
import pathlib
from typing import Dict
import aiofiles
from base.base_crawler import AbstractStoreImage
from tools import utils
class BilibiliVideo(AbstractStoreImage):
    """Saves downloaded bilibili videos to the local filesystem."""

    # Root directory; files end up at <video_store_path>/<aid>/<file name>.
    video_store_path: str = "data/bilibili/videos"

    async def store_video(self, video_content_item: Dict):
        """
        Save the video described by ``video_content_item``.

        Args:
            video_content_item: dict with the keys "aid", "video_content"
                and "extension_file_name"

        Returns:
            None
        """
        await self.save_video(video_content_item.get("aid"), video_content_item.get("video_content"),
                              video_content_item.get("extension_file_name"))

    def make_save_file_name(self, aid: str, extension_file_name: str) -> str:
        """
        Build the path a video file is saved to.

        Args:
            aid: aid of the video, used as the directory name
            extension_file_name: file name inside that directory

        Returns:
            "<video_store_path>/<aid>/<extension_file_name>"
        """
        return f"{self.video_store_path}/{aid}/{extension_file_name}"

    async def save_video(self, aid: int, video_content: bytes, extension_file_name="mp4"):
        """
        Write the video content to local disk.

        Args:
            aid: aid of the video
            video_content: raw bytes of the video; the file is opened in
                binary mode
            extension_file_name: file name inside the video's directory

        Returns:
            None
        """
        save_file_name = self.make_save_file_name(str(aid), extension_file_name)
        # Create the directory from the final file path so the two cannot
        # drift apart if the naming scheme changes.
        pathlib.Path(save_file_name).parent.mkdir(parents=True, exist_ok=True)
        async with aiofiles.open(save_file_name, 'wb') as f:
            await f.write(video_content)
            utils.logger.info(f"[BilibiliVideoImplement.save_video] save save_video {save_file_name} success ...")