MediaCrawler/base/base_crawler.py

86 lines
1.9 KiB
Python
Raw Normal View History

from abc import ABC, abstractmethod
from typing import Dict, Optional
from playwright.async_api import BrowserContext, BrowserType
class AbstractCrawler(ABC):
    """Interface for crawler implementations.

    Every method is an abstract coroutine, so a subclass has to override
    all of them before it can be instantiated.
    """

    @abstractmethod
    async def start(self):
        """
        start crawler
        """

    @abstractmethod
    async def search(self):
        """
        search
        """

    @abstractmethod
    async def launch_browser(
        self,
        chromium: BrowserType,
        playwright_proxy: Optional[Dict],
        user_agent: Optional[str],
        headless: bool = True,
    ) -> BrowserContext:
        """
        launch browser

        :param chromium: chromium browser
        :param playwright_proxy: playwright proxy
        :param user_agent: user agent
        :param headless: headless mode
        :return: browser context
        """
class AbstractLogin(ABC):
    """Interface for login implementations.

    All four methods are abstract coroutines with no default behaviour;
    a concrete subclass must override each one.
    """

    @abstractmethod
    async def begin(self):
        """Abstract coroutine; presumably the entry point of the login flow."""

    @abstractmethod
    async def login_by_qrcode(self):
        """Abstract coroutine for the QR-code login variant."""

    @abstractmethod
    async def login_by_mobile(self):
        """Abstract coroutine for the mobile login variant."""

    @abstractmethod
    async def login_by_cookies(self):
        """Abstract coroutine for the cookie-based login variant."""
class AbstractStore(ABC):
    """Interface for storage back ends.

    All three methods are abstract coroutines; a concrete subclass must
    override each one.
    """

    @abstractmethod
    async def store_content(self, content_item: Dict):
        """Abstract coroutine that receives one content item as a dict."""

    @abstractmethod
    async def store_comment(self, comment_item: Dict):
        """Abstract coroutine that receives one comment item as a dict."""

    # TODO support all platform
    # NOTE(review): an earlier note here said only xhs was supported and that
    # @abstractmethod was therefore commented out. The decorator is active,
    # so every concrete store must implement store_creator.
    @abstractmethod
    async def store_creator(self, creator: Dict):
        """Abstract coroutine that receives one creator record as a dict."""
class AbstractStoreImage(ABC):
    """Optional interface for image storage.

    The class has no abstract methods, so it can be instantiated directly
    and subclasses may leave ``store_image`` untouched.
    """

    # TODO: support all platform
    # Deliberately NOT decorated with @abstractmethod: only weibo is
    # supported, so the default implementation is a no-op.
    async def store_image(self, image_content_item: Dict):
        """Do nothing by default and return ``None``."""
        return None
2024-04-13 12:18:04 +00:00
class AbstractApiClient(ABC):
    """Interface for API clients.

    Both methods are abstract coroutines; a concrete subclass must
    override each one.
    """

    @abstractmethod
    async def request(self, method, url, **kwargs):
        """Abstract coroutine taking a method, a url and extra keyword options."""

    @abstractmethod
    async def update_cookies(self, browser_context: BrowserContext):
        """Abstract coroutine that receives a browser context.

        Presumably refreshes the client's cookies from that context;
        the actual behaviour is defined by subclasses.
        """