新增对微博博客内照片获取的支持 文件存放路径data/weibo/images
This commit is contained in:
parent
5c409c6f0c
commit
16413c3074
|
@ -56,6 +56,12 @@ class AbstractStore(ABC):
|
||||||
async def store_creator(self, creator: Dict):
|
async def store_creator(self, creator: Dict):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
class AbstractStoreImage(ABC):
    """Base class for stores that persist downloaded image content."""

    # TODO: support all platforms; only weibo is supported for now.
    # NOTE(review): the @abstractmethod decorator is commented out in the
    # original, so this class can be instantiated and the default
    # implementation below is a no-op — confirm whether that is intended.
    async def store_image(self, image_content_item: Dict):
        """
        Persist one image item. The base implementation does nothing.

        Args:
            image_content_item: dict describing the image to store

        Returns:
            None
        """
        return None
|
||||||
|
|
||||||
class AbstactApiClient(ABC):
|
class AbstactApiClient(ABC):
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
|
|
|
@ -33,6 +33,9 @@ CRAWLER_MAX_NOTES_COUNT = 20
|
||||||
# 并发爬虫数量控制
|
# 并发爬虫数量控制
|
||||||
MAX_CONCURRENCY_NUM = 4
|
MAX_CONCURRENCY_NUM = 4
|
||||||
|
|
||||||
|
# 是否开启爬图片模式, 当前默认开启爬图片
|
||||||
|
ENABLE_GET_IMAGES = True
|
||||||
|
|
||||||
# 是否开启爬评论模式, 默认不开启爬评论
|
# 是否开启爬评论模式, 默认不开启爬评论
|
||||||
ENABLE_GET_COMMENTS = False
|
ENABLE_GET_COMMENTS = False
|
||||||
|
|
||||||
|
|
|
@ -35,6 +35,7 @@ class WeiboClient:
|
||||||
self._host = "https://m.weibo.cn"
|
self._host = "https://m.weibo.cn"
|
||||||
self.playwright_page = playwright_page
|
self.playwright_page = playwright_page
|
||||||
self.cookie_dict = cookie_dict
|
self.cookie_dict = cookie_dict
|
||||||
|
self._image_agent_host = "https://i1.wp.com/"
|
||||||
|
|
||||||
async def request(self, method, url, **kwargs) -> Any:
|
async def request(self, method, url, **kwargs) -> Any:
|
||||||
async with httpx.AsyncClient(proxies=self.proxies) as client:
|
async with httpx.AsyncClient(proxies=self.proxies) as client:
|
||||||
|
@ -181,3 +182,25 @@ class WeiboClient:
|
||||||
else:
|
else:
|
||||||
utils.logger.info(f"[WeiboClient.get_note_info_by_id] 未找到$render_data的值")
|
utils.logger.info(f"[WeiboClient.get_note_info_by_id] 未找到$render_data的值")
|
||||||
return dict()
|
return dict()
|
||||||
|
|
||||||
|
async def get_note_image(self, image_url: str) -> bytes:
    """
    Download one Weibo picture in its "large" rendition through the image proxy.

    The scheme is stripped from ``image_url``, the second path segment (the
    size variant) is replaced with ``large``, and the result is appended to
    ``self._image_agent_host`` and fetched with a GET request.

    :param image_url: picture URL taken from the note payload
    :return: raw response body, or None when the proxy does not answer 200
    """
    # Drop the scheme whatever it is; a fixed-width slice only works for "https://".
    _, scheme_sep, remainder = image_url.partition("://")
    if not scheme_sep:
        # Scheme-less or protocol-relative ("//host/...") URL.
        remainder = image_url.lstrip("/")

    segments = remainder.split("/")
    # Expected layout is host/<size>/.../<file>: always request the large variant.
    # URLs with fewer than three segments have no size segment and are left as-is
    # so the file name is never dropped.
    if len(segments) >= 3:
        segments[1] = "large"
    proxied_path = "/".join(segments)

    # The Weibo image host applies hotlink protection, so the picture is
    # fetched through the proxy host instead of directly.
    final_uri = f"{self._image_agent_host}{proxied_path}"
    async with httpx.AsyncClient(proxies=self.proxies) as client:
        response = await client.request("GET", final_uri, timeout=self.timeout)
        # Judge success by the numeric status code rather than the textual
        # reason phrase, which the server is free to vary.
        if response.status_code != 200:
            utils.logger.error(f"[WeiboClient.get_note_image] request {final_uri} err, res:{response.text}")
            return None
        return response.content
|
|
@ -121,8 +121,10 @@ class WeiboCrawler(AbstractCrawler):
|
||||||
for note_item in note_list:
|
for note_item in note_list:
|
||||||
if note_item:
|
if note_item:
|
||||||
mblog: Dict = note_item.get("mblog")
|
mblog: Dict = note_item.get("mblog")
|
||||||
note_id_list.append(mblog.get("id"))
|
if mblog:
|
||||||
await weibo_store.update_weibo_note(note_item)
|
note_id_list.append(mblog.get("id"))
|
||||||
|
await weibo_store.update_weibo_note(note_item)
|
||||||
|
await self.get_note_images(mblog)
|
||||||
|
|
||||||
page += 1
|
page += 1
|
||||||
await self.batch_get_notes_comments(note_id_list)
|
await self.batch_get_notes_comments(note_id_list)
|
||||||
|
@ -200,6 +202,28 @@ class WeiboCrawler(AbstractCrawler):
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
utils.logger.error(f"[WeiboCrawler.get_note_comments] may be been blocked, err:{e}")
|
utils.logger.error(f"[WeiboCrawler.get_note_comments] may be been blocked, err:{e}")
|
||||||
|
|
||||||
|
async def get_note_images(self, mblog: Dict):
    """
    Download and store every picture attached to one Weibo note.

    Does nothing (apart from an info log) when ``config.ENABLE_GET_IMAGES``
    is falsy, or when the note carries no ``pics`` entry.

    :param mblog: note payload; its "pics" entry is iterated and each
        item's "url" and "pid" are read
    :return: None
    """
    if not config.ENABLE_GET_IMAGES:
        utils.logger.info(f"[WeiboCrawler.get_note_images] Crawling image mode is not enabled")
        return

    # "pics" is iterated as a sequence of dicts, one per picture.
    pics: List[Dict] = mblog.get("pics") or []
    for pic in pics:
        url = pic.get("url")
        pic_id = pic.get("pid")
        # Skip incomplete entries before spending a download on them.
        if not url or not pic_id:
            continue
        content = await self.wb_client.get_note_image(url)
        if content is None:
            continue
        # Take the extension from the path only, ignoring any query string.
        extension_file_name = url.split("?", 1)[0].rsplit(".", 1)[-1]
        await weibo_store.update_weibo_note_image(pic_id, content, extension_file_name)
|
||||||
|
|
||||||
async def create_weibo_client(self, httpx_proxy: Optional[str]) -> WeiboClient:
|
async def create_weibo_client(self, httpx_proxy: Optional[str]) -> WeiboClient:
|
||||||
"""Create xhs client"""
|
"""Create xhs client"""
|
||||||
utils.logger.info("[WeiboCrawler.create_weibo_client] Begin create weibo API client ...")
|
utils.logger.info("[WeiboCrawler.create_weibo_client] Begin create weibo API client ...")
|
||||||
|
|
|
@ -8,13 +8,14 @@ from typing import List
|
||||||
import config
|
import config
|
||||||
|
|
||||||
from .weibo_store_impl import *
|
from .weibo_store_impl import *
|
||||||
|
from .weibo_store_image import *
|
||||||
|
|
||||||
|
|
||||||
class WeibostoreFactory:
|
class WeibostoreFactory:
|
||||||
STORES = {
|
STORES = {
|
||||||
"csv": WeiboCsvStoreImplement,
|
"csv": WeiboCsvStoreImplement,
|
||||||
"db": WeiboDbStoreImplement,
|
"db": WeiboDbStoreImplement,
|
||||||
"json": WeiboJsonStoreImplement
|
"json": WeiboJsonStoreImplement,
|
||||||
}
|
}
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
@ -86,3 +87,6 @@ async def update_weibo_note_comment(note_id: str, comment_item: Dict):
|
||||||
utils.logger.info(
|
utils.logger.info(
|
||||||
f"[store.weibo.update_weibo_note_comment] Weibo note comment: {comment_id}, content: {save_comment_item.get('content', '')[:24]} ...")
|
f"[store.weibo.update_weibo_note_comment] Weibo note comment: {comment_id}, content: {save_comment_item.get('content', '')[:24]} ...")
|
||||||
await WeibostoreFactory.create_store().store_comment(comment_item=save_comment_item)
|
await WeibostoreFactory.create_store().store_comment(comment_item=save_comment_item)
|
||||||
|
|
||||||
|
async def update_weibo_note_image(picid: str, pic_content, extension_file_name):
    """
    Hand one downloaded picture to WeiboStoreImage for saving.

    Args:
        picid: picture id, passed on under the "pic_id" key
        pic_content: picture content, passed on under the "pic_content" key
        extension_file_name: file extension, passed on under the
            "extension_file_name" key
    """
    image_item = {
        "pic_id": picid,
        "pic_content": pic_content,
        "extension_file_name": extension_file_name,
    }
    image_store = WeiboStoreImage()
    await image_store.store_image(image_item)
|
|
@ -0,0 +1,51 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# @Author : Erm
|
||||||
|
# @Time : 2024/4/9 17:35
|
||||||
|
# @Desc : 微博保存图片类
|
||||||
|
import pathlib
|
||||||
|
|
||||||
|
from tools import utils
|
||||||
|
from base.base_crawler import AbstractStoreImage
|
||||||
|
import aiofiles
|
||||||
|
from typing import Dict
|
||||||
|
|
||||||
|
class WeiboStoreImage(AbstractStoreImage):
    """Saves downloaded Weibo pictures as files under ``image_store_path``."""

    # Directory the image files are written to.
    image_store_path: str = "data/weibo/images"

    async def store_image(self, image_content_item: Dict):
        """
        Save the image described by ``image_content_item``.

        Args:
            image_content_item: dict with the keys "pic_id", "pic_content"
                and, optionally, "extension_file_name"

        Returns:
            None
        """
        await self.save_image(
            image_content_item.get("pic_id"),
            image_content_item.get("pic_content"),
            # Fall back to save_image's default instead of passing None through,
            # which would name the file "<pic_id>.None".
            image_content_item.get("extension_file_name") or "jpg",
        )

    def make_save_file_name(self, picid: str, extension_file_name: str) -> str:
        """
        Build the path of the file an image is saved to.

        Args:
            picid: image id, used as the file stem
            extension_file_name: file extension without the leading dot

        Returns:
            "<image_store_path>/<picid>.<extension_file_name>"
        """
        return f"{self.image_store_path}/{picid}.{extension_file_name}"

    async def save_image(self, picid: str, pic_content: bytes, extension_file_name="jpg"):
        """
        Write the image content to a local file.

        Args:
            picid: image id
            pic_content: image content; written in binary mode
            extension_file_name: file extension without the leading dot

        Returns:
            None
        """
        # Create the target directory on demand so the first save does not fail.
        pathlib.Path(self.image_store_path).mkdir(parents=True, exist_ok=True)
        save_file_name = self.make_save_file_name(picid, extension_file_name)
        async with aiofiles.open(save_file_name, 'wb') as f:
            await f.write(pic_content)
            utils.logger.info(f"[WeiboImageStoreImplement.save_image] save image {save_file_name} success ...")
|
Loading…
Reference in New Issue