From 79a02963125ef09c6bc9719fbba449f82b2c3b42 Mon Sep 17 00:00:00 2001 From: ubuntu Date: Sat, 17 Jun 2023 15:14:58 +0800 Subject: [PATCH] handby&catch exception --- config.py | 3 ++- main.py | 6 ++++-- media_platform/xhs/core.py | 25 +++++++++++++++++++------ 3 files changed, 25 insertions(+), 9 deletions(-) diff --git a/config.py b/config.py index 7cf43d9..a0d4d6d 100644 --- a/config.py +++ b/config.py @@ -2,8 +2,9 @@ platform = "xhs" keyword = "健身" -login_type = "qrcode" # qrcode or phone +login_type = "handby" # qrcode or phone login_phone = "13812345678" # your login phone +login_webSession="040069b5f35b1cfef9787551bd364b86f4d839" # redis config redis_db_host = "redis://127.0.0.1" diff --git a/main.py b/main.py index c01c0f3..092bad4 100644 --- a/main.py +++ b/main.py @@ -23,7 +23,8 @@ async def main(): parser = argparse.ArgumentParser(description='Media crawler program.') parser.add_argument('--platform', type=str, help='Media platform select (xhs|dy)...', default=config.platform) parser.add_argument('--keywords', type=str, help='Search note/page keywords...', default=config.keyword) - parser.add_argument('--lt', type=str, help='Login type (qrcode | phone)', default=config.login_type) + parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | handby)', default=config.login_type) + parser.add_argument('--web_session', type=str, help='cookies to keep log in', default=config.login_webSession) parser.add_argument('--phone', type=str, help='Login phone', default=config.login_phone) args = parser.parse_args() @@ -31,7 +32,8 @@ async def main(): crawler.init_config( keywords=args.keywords, login_phone=args.phone, - login_type=args.lt + login_type=args.lt, + web_session=args.web_session ) await crawler.start() diff --git a/media_platform/xhs/core.py b/media_platform/xhs/core.py index 787c00d..6298fb4 100644 --- a/media_platform/xhs/core.py +++ b/media_platform/xhs/core.py @@ -21,6 +21,7 @@ import config from .client import XHSClient from base_crawler import Crawler from models import xhs as xhs_model +from .exception import * class XiaoHongShuCrawler(Crawler): @@ -28,6 +29,7 @@ class XiaoHongShuCrawler(Crawler): self.login_phone = None self.login_type = None self.keywords = None + self.web_session = None self.cookies: Optional[List[Cookie]] = None self.browser_context: Optional[BrowserContext] = None self.context_page: Optional[Page] = None @@ -37,9 +39,8 @@ class XiaoHongShuCrawler(Crawler): self.index_url = "https://www.xiaohongshu.com" def init_config(self, **kwargs): - self.keywords = kwargs.get("keywords") - self.login_type = kwargs.get("login_type") - self.login_phone = kwargs.get("login_phone") + for key in kwargs.keys(): + setattr(self, key, kwargs[key]) async def update_cookies(self): self.cookies = await self.browser_context.cookies() @@ -48,7 +49,7 @@ class XiaoHongShuCrawler(Crawler): async with async_playwright() as playwright: # launch browser and create single browser context chromium = playwright.chromium - browser = await chromium.launch(headless=False) + browser = await chromium.launch(headless=True) self.browser_context = await browser.new_context( viewport={"width": 1920, "height": 1080}, user_agent=self.user_agent, @@ -90,13 +91,22 @@ class XiaoHongShuCrawler(Crawler): # There are two ways to log in: # 1. Semi-automatic: Log in by scanning the QR code. # 2. Fully automatic: Log in using forwarded text message notifications + # 3. handby automatic: Log in using preset cookie # which includes mobile phone number and verification code. if self.login_type == "qrcode": await self.login_by_qrcode() elif self.login_type == "phone": await self.login_by_mobile() - else: + elif self.login_type == "handby": + await self.browser_context.add_cookies([{ + 'name': 'web_session', + 'value': self.web_session, + 'domain': ".xiaohongshu.com", + 'path': "/" + }]) + else: pass + async def login_by_mobile(self): print("Start executing mobile phone number + verification code login on Xiaohongshu. ...") @@ -203,7 +213,10 @@ class XiaoHongShuCrawler(Crawler): for post_item in posts_res.get("items"): max_note_len -= 1 note_id = post_item.get("id") - note_detail = await self.xhs_client.get_note_by_id(note_id) + try: + note_detail = await self.xhs_client.get_note_by_id(note_id) + except DataFetchError as ex: + continue await xhs_model.update_xhs_note(note_detail) await asyncio.sleep(0.05) note_list.append(note_id)