diff --git a/base/base_crawler.py b/base/base_crawler.py
index 24dd83f..2a5b69f 100644
--- a/base/base_crawler.py
+++ b/base/base_crawler.py
@@ -56,6 +56,13 @@ class AbstractStore(ABC):
     async def store_creator(self, creator: Dict):
         pass
 
+class AbstractStoreImage(ABC):
+    """Interface for stores that persist image content."""
+
+    # TODO: support all platforms; only weibo is supported for now.
+    # @abstractmethod
+    async def store_image(self, image_content_item: Dict):
+        pass
 
 class AbstactApiClient(ABC):
     @abstractmethod
diff --git a/config/base_config.py b/config/base_config.py
index e6ccd03..f85a5a7 100644
--- a/config/base_config.py
+++ b/config/base_config.py
@@ -33,6 +33,9 @@ CRAWLER_MAX_NOTES_COUNT = 20
 # 并发爬虫数量控制
 MAX_CONCURRENCY_NUM = 4
 
+# Whether to crawl images; image crawling is disabled by default
+ENABLE_GET_IMAGES = False
+
 # 是否开启爬评论模式, 默认不开启爬评论
 ENABLE_GET_COMMENTS = False
 
diff --git a/media_platform/weibo/client.py b/media_platform/weibo/client.py
index e3c975c..89b3ee1 100644
--- a/media_platform/weibo/client.py
+++ b/media_platform/weibo/client.py
@@ -35,6 +35,7 @@ class WeiboClient:
         self._host = "https://m.weibo.cn"
         self.playwright_page = playwright_page
         self.cookie_dict = cookie_dict
+        self._image_agent_host = "https://i1.wp.com/"
 
     async def request(self, method, url, **kwargs) -> Any:
         async with httpx.AsyncClient(proxies=self.proxies) as client:
@@ -181,3 +182,27 @@ class WeiboClient:
             else:
                 utils.logger.info(f"[WeiboClient.get_note_info_by_id] 未找到$render_data的值")
                 return dict()
+
+    async def get_note_image(self, image_url: str) -> bytes:
+        """
+        Download one image through the image proxy host, requesting the "large" size.
+        :param image_url: image url; the first path segment after the host is replaced with "large"
+        :return: image bytes, or None if the response status is not 200
+        """
+        # Strip the scheme generically so that http:// urls work as well as https://.
+        image_path = image_url.split("://", 1)[-1]
+        segments = image_path.split("/")
+        if len(segments) > 1:
+            segments[1] = "large"  # always fetch the high-resolution picture
+        image_path = "/".join(segments)
+        # Weibo's image host has hotlink protection, so the picture is fetched
+        # through the i1.wp.com proxy by prefixing the proxy host to the path.
+        final_uri = f"{self._image_agent_host}{image_path}"
+        async with httpx.AsyncClient(proxies=self.proxies) as client:
+            response = await client.request("GET", final_uri, timeout=self.timeout)
+            # The reason phrase is free text chosen by the server; the status
+            # code is the reliable success signal.
+            if response.status_code != 200:
+                utils.logger.error(f"[WeiboClient.get_note_image] request {final_uri} err, res:{response.text}")
+                return None
+            return response.content
\ No newline at end of file
diff --git a/media_platform/weibo/core.py b/media_platform/weibo/core.py
index 8aecd1e..1a382ca 100644
--- a/media_platform/weibo/core.py
+++ b/media_platform/weibo/core.py
@@ -121,8 +121,10 @@ class WeiboCrawler(AbstractCrawler):
                 for note_item in note_list:
                     if note_item:
                         mblog: Dict = note_item.get("mblog")
-                        note_id_list.append(mblog.get("id"))
-                        await weibo_store.update_weibo_note(note_item)
+                        if mblog:
+                            note_id_list.append(mblog.get("id"))
+                            await weibo_store.update_weibo_note(note_item)
+                            await self.get_note_images(mblog)
 
                 page += 1
                 await self.batch_get_notes_comments(note_id_list)
@@ -200,6 +202,31 @@ class WeiboCrawler(AbstractCrawler):
             except Exception as e:
                 utils.logger.error(f"[WeiboCrawler.get_note_comments] may be been blocked, err:{e}")
 
+    async def get_note_images(self, mblog: Dict):
+        """
+        Download and store every picture listed under the note's "pics" entry.
+        :param mblog: note payload; each item of "pics" is read for "url" and "pid"
+        :return:
+        """
+        if not config.ENABLE_GET_IMAGES:
+            utils.logger.info(f"[WeiboCrawler.get_note_images] Crawling image mode is not enabled")
+            return
+
+        pics = mblog.get("pics")
+        if not pics:
+            return
+        for pic in pics:
+            url = pic.get("url")
+            pic_id = pic.get("pid")
+            # Without a url there is nothing to fetch; without a pid there is
+            # no name to store the file under, so skip before downloading.
+            if not url or not pic_id:
+                continue
+            content = await self.wb_client.get_note_image(url)
+            if content is not None:
+                extension_file_name = url.split(".")[-1]
+                await weibo_store.update_weibo_note_image(pic_id, content, extension_file_name)
+
     async def create_weibo_client(self, httpx_proxy: Optional[str]) -> WeiboClient:
         """Create xhs client"""
         utils.logger.info("[WeiboCrawler.create_weibo_client] Begin create weibo API client ...")
diff --git a/store/weibo/__init__.py b/store/weibo/__init__.py
index 3a43e52..533296f 100644
--- a/store/weibo/__init__.py
+++ b/store/weibo/__init__.py
@@ -8,13 +8,14 @@ from typing import List
 import config
 
 from .weibo_store_impl import *
+from .weibo_store_image import *
 
 
 class WeibostoreFactory:
     STORES = {
         "csv": WeiboCsvStoreImplement,
         "db": WeiboDbStoreImplement,
-        "json": WeiboJsonStoreImplement
+        "json": WeiboJsonStoreImplement,
     }
 
     @staticmethod
@@ -86,3 +87,6 @@ async def update_weibo_note_comment(note_id: str, comment_item: Dict):
     utils.logger.info(
         f"[store.weibo.update_weibo_note_comment] Weibo note comment: {comment_id}, content: {save_comment_item.get('content', '')[:24]} ...")
     await WeibostoreFactory.create_store().store_comment(comment_item=save_comment_item)
+
+async def update_weibo_note_image(picid: str, pic_content, extension_file_name):
+    await WeiboStoreImage().store_image({"pic_id": picid, "pic_content": pic_content, "extension_file_name": extension_file_name})
\ No newline at end of file
diff --git a/store/weibo/weibo_store_image.py b/store/weibo/weibo_store_image.py
new file mode 100644
index 0000000..1243b9d
--- /dev/null
+++ b/store/weibo/weibo_store_image.py
@@ -0,0 +1,53 @@
+# -*- coding: utf-8 -*-
+# @Author : Erm
+# @Time : 2024/4/9 17:35
+# @Desc : Weibo image store class
+import pathlib
+
+from tools import utils
+from base.base_crawler import AbstractStoreImage
+import aiofiles
+from typing import Dict
+
+class WeiboStoreImage(AbstractStoreImage):
+    image_store_path: str = "data/weibo/images"
+
+    async def store_image(self, image_content_item: Dict):
+        """
+        Save one image described by a dict to the local image directory.
+        Args:
+            image_content_item: dict read for "pic_id", "pic_content" and "extension_file_name"
+
+        Returns:
+
+        """
+        await self.save_image(image_content_item.get("pic_id"), image_content_item.get("pic_content"), image_content_item.get("extension_file_name"))
+
+    def make_save_file_name(self, picid: str, extension_file_name: str) -> str:
+        """
+        Build the file path of an image under image_store_path.
+        Args:
+            picid: image id
+            extension_file_name: file extension, without the leading dot
+
+        Returns:
+
+        """
+        return f"{self.image_store_path}/{picid}.{extension_file_name}"
+
+    async def save_image(self, picid: str, pic_content: bytes, extension_file_name="jpg"):
+        """
+        Write the image bytes to a local file, creating the directory first if needed.
+        Args:
+            picid: image id
+            pic_content: image content, written in binary mode
+            extension_file_name: file extension, without the leading dot
+
+        Returns:
+
+        """
+        pathlib.Path(self.image_store_path).mkdir(parents=True, exist_ok=True)
+        save_file_name = self.make_save_file_name(picid, extension_file_name)
+        async with aiofiles.open(save_file_name, 'wb') as f:
+            await f.write(pic_content)
+            utils.logger.info(f"[WeiboStoreImage.save_image] save image {save_file_name} success ...")
\ No newline at end of file