fix: xhs验证码处理,跳转链接修复

This commit is contained in:
lyx0727 2024-11-01 22:08:46 +08:00
parent 1a37df4d5e
commit 705b810269
1 changed file with 8 additions and 1 deletion

View File

@@ -22,6 +22,7 @@ from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_result
import config import config
from base.base_crawler import AbstractApiClient from base.base_crawler import AbstractApiClient
from tools import utils from tools import utils
from html import unescape
from .exception import DataFetchError, IPBlockError from .exception import DataFetchError, IPBlockError
from .field import SearchNoteType, SearchSortType from .field import SearchNoteType, SearchSortType
@@ -518,12 +519,18 @@ class XiaoHongShuClient(AbstractApiClient):
return get_note_dict(html) return get_note_dict(html)
except: except:
href = re.findall(r'href="(.*?)"', html)[0] href = re.findall(r'href="(.*?)"', html)[0]
href = unescape(href)
utils.logger.info( utils.logger.info(
f"[XiaoHongShuClient.get_note_by_id_from_html] 出现验证码: {href}, 请手动验证" f"[XiaoHongShuClient.get_note_by_id_from_html] 出现验证码: {href}, 请手动验证"
) )
await self.playwright_page.goto(href) await self.playwright_page.goto(href)
# 等待用户完成操作页面重定向 # 等待用户完成操作页面重定向
if await self.check_redirect(): if await self.check_redirect():
utils.logger.info(
f"[XiaoHongShuClient.get_note_by_id_from_html] 用户完成验证, 重定向到笔记详情页"
)
html = await self.playwright_page.content() html = await self.playwright_page.content()
return get_note_dict(html) return get_note_dict(html)
else: else:
@@ -535,7 +542,7 @@ class XiaoHongShuClient(AbstractApiClient):
retry=retry_if_result(lambda value: value is False), retry=retry_if_result(lambda value: value is False),
) )
async def check_redirect(self): async def check_redirect(self):
url = await self.playwright_page.url() url = self.playwright_page.url
if url.startswith("https://www.xiaohongshu.com/explore"): if url.startswith("https://www.xiaohongshu.com/explore"):
return True return True
return False return False