Merge pull request #328 from helloteemo/feature/xiaohognshu_get_image
feature: 支持小红书图片、视频下载
This commit is contained in:
commit
8fe93dca23
|
@ -57,6 +57,7 @@ XHS_SPECIFIED_ID_LIST = [
|
|||
"6422c2750000000027000d88",
|
||||
"64ca1b73000000000b028dd2",
|
||||
"630d5b85000000001203ab41",
|
||||
"668fe13000000000030241fa", # 图文混合
|
||||
# ........................
|
||||
]
|
||||
|
||||
|
|
|
@ -129,6 +129,15 @@ class XiaoHongShuClient(AbstractApiClient):
|
|||
return await self.request(method="POST", url=f"{self._host}{uri}",
|
||||
data=json_str, headers=headers)
|
||||
|
||||
async def get_note_media(self, url: str) -> bytes | None:
    """
    Download a media resource (note image or video) from the given URL.

    Args:
        url: absolute URL of the media file to fetch.

    Returns:
        The raw response body as bytes on success, or None when the request
        does not come back with HTTP 200 (the failure is logged).
    """
    async with httpx.AsyncClient(proxies=self.proxies) as client:
        response = await client.request("GET", url, timeout=self.timeout)
        # Compare the numeric status code rather than the human-readable
        # reason phrase: servers may localise, change or omit the phrase
        # (HTTP/2 has no reason phrase at all), which made the old
        # `reason_phrase == "OK"` check unreliable.
        if response.status_code != 200:
            utils.logger.error(f"[XiaoHongShuClient.get_note_media] request {url} err, res:{response.text}")
            return None
        return response.content
|
||||
|
||||
async def pong(self) -> bool:
|
||||
"""
|
||||
用于检查登录态是否失效了
|
||||
|
|
|
@ -120,6 +120,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
|||
for note_detail in note_details:
|
||||
if note_detail is not None:
|
||||
await xhs_store.update_xhs_note(note_detail)
|
||||
await self.get_notice_media(note_detail)
|
||||
note_id_list.append(note_detail.get("note_id"))
|
||||
page += 1
|
||||
utils.logger.info(f"[XiaoHongShuCrawler.search] Note details: {note_details}")
|
||||
|
@ -171,6 +172,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
|||
for note_detail in note_details:
|
||||
if note_detail is not None:
|
||||
await xhs_store.update_xhs_note(note_detail)
|
||||
await self.get_notice_media(note_detail)
|
||||
await self.batch_get_note_comments(config.XHS_SPECIFIED_ID_LIST)
|
||||
|
||||
async def get_note_detail(self, note_id: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
|
||||
|
@ -276,4 +278,63 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
|||
async def close(self):
    """Close the browser context and log the shutdown once."""
    await self.browser_context.close()
    # The previous version emitted this message twice; log it once.
    utils.logger.info("[XiaoHongShuCrawler.close] Browser context closed ...")
|
||||
|
||||
async def get_notice_media(self, note_detail: Dict):
    """
    Fetch and persist all media (images, then video) attached to a note.

    Does nothing except log a notice when image crawling is disabled
    via ``config.ENABLE_GET_IMAGES``.

    :param note_detail: note detail dict returned by the XHS API
    """
    if config.ENABLE_GET_IMAGES:
        await self.get_note_images(note_detail)
        await self.get_notice_video(note_detail)
    else:
        utils.logger.info(f"[XiaoHongShuCrawler.get_notice_media] Crawling image mode is not enabled")
|
||||
|
||||
async def get_note_images(self, note_item: Dict):
    """
    Download every image of a note and persist it via the store layer.

    Prefer ``get_notice_media`` as the public entry point; this method is
    its image half.

    :param note_item: note detail dict; images are read from ``image_list``
    """
    if not config.ENABLE_GET_IMAGES:
        return
    note_id = note_item.get("note_id")
    image_list: List[Dict] = note_item.get("image_list", [])
    # Bail out before any per-image work (the old code normalised URLs first).
    if not image_list:
        return

    pic_num = 0
    for img in image_list:
        # Prefer `url_default` when present and non-empty, falling back to
        # `url`. Resolving this locally avoids mutating the caller's dicts,
        # which the previous version did as a side effect.
        url = img.get("url_default") or img.get("url")
        if not url:
            continue
        content = await self.xhs_client.get_note_media(url)
        if content is None:
            continue
        # NOTE(review): assumes the CDN serves JPEG for these URLs — confirm.
        extension_file_name = f"{pic_num}.jpg"
        pic_num += 1
        await xhs_store.update_xhs_note_image(note_id, content, extension_file_name)
|
||||
|
||||
async def get_notice_video(self, note_item: Dict):
    """
    Download the video of a note (if any) and persist it via the store layer.

    Prefer ``get_notice_media`` as the public entry point; this method is
    its video half. (The old docstring said "get note images" — copy-paste.)

    :param note_item: note detail dict; video URLs are resolved by
        ``xhs_store.get_video_url_arr``
    """
    if not config.ENABLE_GET_IMAGES:
        return
    note_id = note_item.get("note_id")

    videos = xhs_store.get_video_url_arr(note_item)
    if not videos:
        return

    video_num = 0
    for url in videos:
        content = await self.xhs_client.get_note_media(url)
        if content is None:
            continue
        extension_file_name = f"{video_num}.mp4"
        video_num += 1
        # NOTE(review): reuses the image store helper, which just writes raw
        # bytes to disk, so videos land in the same per-note folder — confirm
        # the naming is intentional.
        await xhs_store.update_xhs_note_image(note_id, content, extension_file_name)
|
||||
|
|
|
@ -8,6 +8,7 @@ import config
|
|||
|
||||
from . import xhs_store_impl
|
||||
from .xhs_store_impl import *
|
||||
from .xhs_store_image import *
|
||||
|
||||
|
||||
class XhsStoreFactory:
|
||||
|
@ -25,6 +26,25 @@ class XhsStoreFactory:
|
|||
return store_class()
|
||||
|
||||
|
||||
def get_video_url_arr(note_item: Dict) -> List:
    """
    Resolve the downloadable video URL(s) for a note.

    Resolution order:
      1. ``consumer.origin_video_key`` (snake_case) — watermark-free source.
      2. ``consumer.originVideoKey`` (legacy camelCase key) — watermark-free.
      3. Fallback to the h264 stream master URLs (降级有水印 — watermarked).

    :param note_item: note detail dict from the XHS API
    :return: list of video URLs; empty when the note is not a video or no
        URL can be resolved
    """
    if note_item.get('type') != 'video':
        return []

    video = note_item.get('video') or {}
    consumer = video.get('consumer') or {}
    # Treat both missing (None) and empty-string keys as "not present".
    # The old code only compared against '' so a missing key slipped through
    # and produced "http://sns-video-bd.xhscdn.com/None"; it also raised
    # AttributeError when the 'consumer' section was absent.
    origin_video_key = consumer.get('origin_video_key') or consumer.get('originVideoKey')
    if origin_video_key:
        return [f"http://sns-video-bd.xhscdn.com/{origin_video_key}"]

    streams = ((video.get('media') or {}).get('stream') or {}).get('h264')
    if isinstance(streams, list):
        return [v.get('master_url') for v in streams]
    return []
|
||||
|
||||
|
||||
async def update_xhs_note(note_item: Dict):
|
||||
note_id = note_item.get("note_id")
|
||||
user_info = note_item.get("user", {})
|
||||
|
@ -36,11 +56,7 @@ async def update_xhs_note(note_item: Dict):
|
|||
if img.get('url_default') != '':
|
||||
img.update({'url': img.get('url_default')})
|
||||
|
||||
video_url = ''
|
||||
if note_item.get('type') == 'video':
|
||||
videos = note_item.get('video').get('media').get('stream').get('h264')
|
||||
if type(videos).__name__ == 'list':
|
||||
video_url = ','.join([v.get('master_url') for v in videos])
|
||||
video_url = ','.join(get_video_url_arr(note_item))
|
||||
|
||||
local_db_item = {
|
||||
"note_id": note_item.get("note_id"),
|
||||
|
@ -127,3 +143,8 @@ async def save_creator(user_id: str, creator: Dict):
|
|||
}
|
||||
utils.logger.info(f"[store.xhs.save_creator] creator:{local_db_item}")
|
||||
await XhsStoreFactory.create_store().store_creator(local_db_item)
|
||||
|
||||
|
||||
async def update_xhs_note_image(note_id, pic_content, extension_file_name):
    """
    Persist one media file (raw image or video bytes) for a note.

    Thin wrapper that forwards to the local-disk store implementation.

    :param note_id: id of the note the media belongs to
    :param pic_content: raw media bytes to write
    :param extension_file_name: target file name inside the note's folder,
        e.g. ``"0.jpg"`` or ``"0.mp4"``
    """
    await XiaoHongShuImage().store_image(
        {"notice_id": note_id, "pic_content": pic_content, "extension_file_name": extension_file_name})
|
||||
|
|
|
@ -0,0 +1,55 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# @Author : helloteemo
|
||||
# @Time : 2024/7/11 22:35
|
||||
# @Desc : 小红书图片保存
|
||||
import pathlib
|
||||
from typing import Dict
|
||||
|
||||
import aiofiles
|
||||
|
||||
from base.base_crawler import AbstractStoreImage
|
||||
from tools import utils
|
||||
|
||||
|
||||
class XiaoHongShuImage(AbstractStoreImage):
    """Local-disk store for XiaoHongShu note media (images and videos)."""

    # Root directory (relative to the CWD) under which one folder per note
    # is created.
    image_store_path: str = "data/xhs/images"

    async def store_image(self, image_content_item: Dict):
        """
        Store one media item on the local disk.

        Args:
            image_content_item: dict with keys ``notice_id`` (note id),
                ``pic_content`` (raw media bytes) and ``extension_file_name``
                (target file name, e.g. ``"0.jpg"``).
        """
        await self.save_image(image_content_item.get("notice_id"),
                              image_content_item.get("pic_content"),
                              image_content_item.get("extension_file_name"))

    def make_save_file_name(self, notice_id: str, extension_file_name: str) -> str:
        """
        Build the relative path for a media file of a note.

        Args:
            notice_id: id of the note the media belongs to
            extension_file_name: file name inside the note's folder

        Returns:
            Path of the form ``<image_store_path>/<notice_id>/<file name>``.
        """
        return f"{self.image_store_path}/{notice_id}/{extension_file_name}"

    async def save_image(self, notice_id: str, pic_content: bytes, extension_file_name="jpg"):
        """
        Write media bytes to disk, creating the note's folder on demand.

        Args:
            notice_id: id of the note the media belongs to
            pic_content: raw media bytes to write (the old annotation said
                ``str``, but callers pass the HTTP response body as bytes)
            extension_file_name: target file name inside the note's folder
        """
        # Path(a, b) joins components portably instead of string "+" concat.
        pathlib.Path(self.image_store_path, notice_id).mkdir(parents=True, exist_ok=True)
        save_file_name = self.make_save_file_name(notice_id, extension_file_name)
        async with aiofiles.open(save_file_name, 'wb') as f:
            await f.write(pic_content)
            utils.logger.info(f"[XiaoHongShuImageStoreImplement.save_image] save image {save_file_name} success ...")
|
Loading…
Reference in New Issue