From d6fb255bdff66cd1cb99ecee4b4822ac21a43432 Mon Sep 17 00:00:00 2001 From: Relakkes Date: Mon, 2 Sep 2024 21:45:12 +0800 Subject: [PATCH] fix: xhs note detail error --- base/base_crawler.py | 14 ++++++++++++++ main.py | 1 + media_platform/bilibili/core.py | 21 ++++++++++++++++++--- media_platform/xhs/core.py | 2 +- 4 files changed, 34 insertions(+), 4 deletions(-) diff --git a/base/base_crawler.py b/base/base_crawler.py index 6c78492..4951e2e 100644 --- a/base/base_crawler.py +++ b/base/base_crawler.py @@ -7,15 +7,29 @@ from playwright.async_api import BrowserContext, BrowserType class AbstractCrawler(ABC): @abstractmethod async def start(self): + """ + start crawler + """ pass @abstractmethod async def search(self): + """ + search + """ pass @abstractmethod async def launch_browser(self, chromium: BrowserType, playwright_proxy: Optional[Dict], user_agent: Optional[str], headless: bool = True) -> BrowserContext: + """ + launch browser + :param chromium: chromium browser + :param playwright_proxy: playwright proxy + :param user_agent: user agent + :param headless: headless mode + :return: browser context + """ pass diff --git a/main.py b/main.py index e051b5e..15aa36d 100644 --- a/main.py +++ b/main.py @@ -45,6 +45,7 @@ async def main(): if config.SAVE_DATA_OPTION == "db": await db.close() + if __name__ == '__main__': try: diff --git a/media_platform/bilibili/core.py b/media_platform/bilibili/core.py index 334a487..d26bd92 100644 --- a/media_platform/bilibili/core.py +++ b/media_platform/bilibili/core.py @@ -262,7 +262,11 @@ class BilibiliCrawler(AbstractCrawler): return None async def create_bilibili_client(self, httpx_proxy: Optional[str]) -> BilibiliClient: - """Create xhs client""" + """ + create bilibili client + :param httpx_proxy: httpx proxy + :return: bilibili client + """ utils.logger.info( "[BilibiliCrawler.create_bilibili_client] Begin create bilibili API client ...") cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies()) @@ -282,7 +286,11 @@ class BilibiliCrawler(AbstractCrawler): @staticmethod def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]: - """format proxy info for playwright and httpx""" + """ + format proxy info for playwright and httpx + :param ip_proxy_info: ip proxy info + :return: playwright proxy, httpx proxy + """ playwright_proxy = { "server": f"{ip_proxy_info.protocol}{ip_proxy_info.ip}:{ip_proxy_info.port}", "username": ip_proxy_info.user, @@ -300,7 +308,14 @@ class BilibiliCrawler(AbstractCrawler): user_agent: Optional[str], headless: bool = True ) -> BrowserContext: - """Launch browser and create browser context""" + """ + launch browser and create browser context + :param chromium: chromium browser + :param playwright_proxy: playwright proxy + :param user_agent: user agent + :param headless: headless mode + :return: browser context + """ utils.logger.info( "[BilibiliCrawler.launch_browser] Begin create browser context ...") if config.SAVE_LOGIN_STATE: diff --git a/media_platform/xhs/core.py b/media_platform/xhs/core.py index 68e55b8..3334f06 100644 --- a/media_platform/xhs/core.py +++ b/media_platform/xhs/core.py @@ -220,7 +220,7 @@ class XiaoHongShuCrawler(AbstractCrawler): """Get note detail""" async with semaphore: try: - note_detail: Dict = await self.xhs_client.get_note_by_id(note_id, xsec_source, xsec_token) + note_detail: Dict = await self.xhs_client.get_note_by_id_from_html(note_id) if not note_detail: utils.logger.error( f"[XiaoHongShuCrawler.get_note_detail_async_task] Get note detail error, note_id: {note_id}")