From 705b810269a7e392d0e4672c910d31f089eb5788 Mon Sep 17 00:00:00 2001 From: lyx0727 <1324938402@qq.com> Date: Fri, 1 Nov 2024 22:08:46 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20xhs=E9=AA=8C=E8=AF=81=E7=A0=81=E5=A4=84?= =?UTF-8?q?=E7=90=86=EF=BC=8C=E8=B7=B3=E8=BD=AC=E9=93=BE=E6=8E=A5=E4=BF=AE?= =?UTF-8?q?=E5=A4=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- media_platform/xhs/client.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/media_platform/xhs/client.py b/media_platform/xhs/client.py index d3055f9..f61323e 100644 --- a/media_platform/xhs/client.py +++ b/media_platform/xhs/client.py @@ -22,6 +22,7 @@ from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_result import config from base.base_crawler import AbstractApiClient from tools import utils +from html import unescape from .exception import DataFetchError, IPBlockError from .field import SearchNoteType, SearchSortType @@ -518,12 +519,18 @@ class XiaoHongShuClient(AbstractApiClient): return get_note_dict(html) except: href = re.findall(r'href="(.*?)"', html)[0] + href = unescape(href) + utils.logger.info( f"[XiaoHongShuClient.get_note_by_id_from_html] 出现验证码: {href}, 请手动验证" ) await self.playwright_page.goto(href) # 等待用户完成操作页面重定向 if await self.check_redirect(): + utils.logger.info( + f"[XiaoHongShuClient.get_note_by_id_from_html] 用户完成验证, 重定向到笔记详情页" + ) + html = await self.playwright_page.content() return get_note_dict(html) else: @@ -535,7 +542,7 @@ class XiaoHongShuClient(AbstractApiClient): retry=retry_if_result(lambda value: value is False), ) async def check_redirect(self): - url = await self.playwright_page.url() + url = self.playwright_page.url if url.startswith("https://www.xiaohongshu.com/explore"): return True return False