From 1085a2a769abbf32bcb547de2ea280d3de597589 Mon Sep 17 00:00:00 2001 From: Relakkes Date: Thu, 22 Jun 2023 22:43:26 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20=E5=A2=9E=E5=8A=A0=E5=B0=8F=E7=BA=A2?= =?UTF-8?q?=E4=B9=A6=E7=99=BB=E5=BD=95=E4=B8=A4=E7=A7=8D=E5=BD=A2=E6=80=81?= =?UTF-8?q?=E4=B8=8B=E5=BC=B9=E7=AA=97=E7=9A=84=E5=85=BC=E5=AE=B9=E4=BB=A3?= =?UTF-8?q?=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- config.py | 9 +++-- main.py | 6 ++-- media_platform/xhs/core.py | 67 ++++++++++++++++++++++++++++---------- utils.py | 16 +++++++++ 4 files changed, 75 insertions(+), 23 deletions(-) diff --git a/config.py b/config.py index a0d4d6d..dad4eb2 100644 --- a/config.py +++ b/config.py @@ -2,9 +2,12 @@ platform = "xhs" keyword = "健身" -login_type = "handby" # qrcode or phone -login_phone = "13812345678" # your login phone -login_webSession="040069b5f35b1cfef9787551bd364b86f4d839" +login_type = "cookie" # qrcode or phone or cookie +login_phone = "" # your login phone + +# If it's on the Xiaohongshu platform, only the web_session cookie will be kept. +# web_session=040069b2acxxxxxxxxxxxxxxxxxxxx; +cookies = "" # redis config redis_db_host = "redis://127.0.0.1" diff --git a/main.py b/main.py index 092bad4..7cad9bd 100644 --- a/main.py +++ b/main.py @@ -23,9 +23,9 @@ async def main(): parser = argparse.ArgumentParser(description='Media crawler program.') parser.add_argument('--platform', type=str, help='Media platform select (xhs|dy)...', default=config.platform) parser.add_argument('--keywords', type=str, help='Search note/page keywords...', default=config.keyword) - parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | handby)', default=config.login_type) - parser.add_argument('--web_session', type=str, help='cookies to keep log in', default=config.login_webSession) + parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)', default=config.login_type) parser.add_argument('--phone', type=str, help='Login phone', default=config.login_phone) + parser.add_argument('--cookies', type=str, help='cookies to keep log in', default=config.cookies) args = parser.parse_args() crawler = CrawlerFactory().create_crawler(platform=args.platform) @@ -33,7 +33,7 @@ async def main(): keywords=args.keywords, login_phone=args.phone, login_type=args.lt, - web_session=args.web_session + cookie_str=args.cookies ) await crawler.start() diff --git a/media_platform/xhs/core.py b/media_platform/xhs/core.py index 6298fb4..b549ef4 100644 --- a/media_platform/xhs/core.py +++ b/media_platform/xhs/core.py @@ -30,7 +30,8 @@ class XiaoHongShuCrawler(Crawler): self.login_type = None self.keywords = None self.web_session = None - self.cookies: Optional[List[Cookie]] = None + self.cookies: Optional[List[Cookie]] = None # cookies from browser context + self.cookie_str: Optional[str] = None # cookie string from config or command line self.browser_context: Optional[BrowserContext] = None self.context_page: Optional[Page] = None self.proxy: Optional[Dict] = None @@ -88,28 +89,51 @@ class XiaoHongShuCrawler(Crawler): async def login(self): """login xiaohongshu website and keep webdriver login state""" - # There are two ways to log in: + # There are three ways to log in: # 1. Semi-automatic: Log in by scanning the QR code. # 2. Fully automatic: Log in using forwarded text message notifications - # 3. handby automatic: Log in using preset cookie - # which includes mobile phone number and verification code. + # 3. Semi-automatic: Log in using preset cookie if self.login_type == "qrcode": await self.login_by_qrcode() elif self.login_type == "phone": await self.login_by_mobile() - elif self.login_type == "handby": - await self.browser_context.add_cookies([{ - 'name': 'web_session', - 'value': self.web_session, - 'domain': ".xiaohongshu.com", - 'path': "/" - }]) - else: + elif self.login_type == "cookie": + # cookie str convert to cookie dict + for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items(): + await self.browser_context.add_cookies([{ + 'name': key, + 'value': value, + 'domain': ".xiaohongshu.com", + 'path': "/" + }]) + else: pass - async def login_by_mobile(self): print("Start executing mobile phone number + verification code login on Xiaohongshu. ...") + + await asyncio.sleep(1) + try: + # After entering the main page of Xiaohongshu, + # the login window may not pop up automatically and you need to manually click the login button. + login_button_ele = await self.context_page.wait_for_selector( + selector="xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button", + timeout=5000 + ) + await login_button_ele.click() + + # There are also two types of login dialog boxes for pop-ups. + # One type directly shows the phone number and verification code. + # Another type requires clicking to switch to mobile login. + element = await self.context_page.wait_for_selector( + selector='xpath=//div[@class="login-container"]//div[@class="other-method"]/div[1]', + timeout=5000 + ) + await element.click() + except: + print("have not found mobile button icon and keep going ...") + await asyncio.sleep(1) + login_container_ele = await self.context_page.wait_for_selector("div.login-container") # Fill login phone input_ele = await login_container_ele.query_selector("label.phone > input") @@ -158,16 +182,25 @@ class XiaoHongShuCrawler(Crawler): async def login_by_qrcode(self): """login xiaohongshu website and keep webdriver login state""" print("Start scanning QR code to log in to Xiaohongshu. ...") + qrcode_img_selector = "xpath=//img[@class='qrcode-img']" # find login qrcode base64_qrcode_img = await utils.find_login_qrcode( self.context_page, - selector="div.login-container > div.left > div.qrcode > img" + selector=qrcode_img_selector ) if not base64_qrcode_img: - # todo ...if this website does not automatically popup login dialog box, we will manual click login button - print("login failed , have not found qrcode please check ....") - sys.exit() + print("have not found qrcode and try again get it ....") + # if this website does not automatically popup login dialog box, we will manual click login button + login_button_ele = self.context_page.locator("xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button") + await login_button_ele.click() + base64_qrcode_img = await utils.find_login_qrcode( + self.context_page, + selector=qrcode_img_selector + ) + if not base64_qrcode_img: + print("login failed , program exit ...") + sys.exit() # get not logged session current_cookie = await self.browser_context.cookies() diff --git a/utils.py b/utils.py index 1236460..7cddea4 100644 --- a/utils.py +++ b/utils.py @@ -61,6 +61,22 @@ def convert_cookies(cookies: Optional[List[Cookie]]) -> Tuple[str, Dict]: return cookies_str, cookie_dict +def convert_str_cookie_to_dict(cookie_str: str) -> Dict: + cookie_dict = dict() + if not cookie_str: + return cookie_dict + for cookie in cookie_str.split(";"): + cookie = cookie.strip() + if not cookie: + continue + cookie = cookie.split("=") + cookie_value = cookie[1] + if isinstance(cookie_value, list): + cookie_value = "".join(cookie_value) + cookie_dict[cookie[0]] = cookie_value + return cookie_dict + + def get_current_timestamp(): return int(time.time() * 1000)