fix: xhs note detail error

This commit is contained in:
Relakkes 2024-09-02 21:45:12 +08:00
parent 00b12ce482
commit d6fb255bdf
4 changed files with 34 additions and 4 deletions

View File

@ -7,15 +7,29 @@ from playwright.async_api import BrowserContext, BrowserType
class AbstractCrawler(ABC):
    """Abstract interface that platform crawlers implement.

    Concrete subclasses must override every abstract coroutine below;
    the class itself cannot be instantiated.
    """

    @abstractmethod
    async def start(self):
        """
        Start the crawler.
        """

    @abstractmethod
    async def search(self):
        """
        Run the crawler's search step.
        """

    @abstractmethod
    async def launch_browser(self, chromium: BrowserType, playwright_proxy: Optional[Dict], user_agent: Optional[str],
                             headless: bool = True) -> BrowserContext:
        """
        Launch a browser and return its context.

        :param chromium: chromium browser type used to launch the browser
        :param playwright_proxy: playwright proxy settings, or None
        :param user_agent: user agent string, or None
        :param headless: whether to run in headless mode (defaults to True)
        :return: browser context
        """

View File

@ -46,6 +46,7 @@ async def main():
await db.close()
if __name__ == '__main__':
try:
# asyncio.run(main())

View File

@ -262,7 +262,11 @@ class BilibiliCrawler(AbstractCrawler):
return None
async def create_bilibili_client(self, httpx_proxy: Optional[str]) -> BilibiliClient:
"""Create xhs client"""
"""
create bilibili client
:param httpx_proxy: httpx proxy
:return: bilibili client
"""
utils.logger.info(
"[BilibiliCrawler.create_bilibili_client] Begin create bilibili API client ...")
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
@ -282,7 +286,11 @@ class BilibiliCrawler(AbstractCrawler):
@staticmethod
def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]:
"""format proxy info for playwright and httpx"""
"""
format proxy info for playwright and httpx
:param ip_proxy_info: ip proxy info
:return: playwright proxy, httpx proxy
"""
playwright_proxy = {
"server": f"{ip_proxy_info.protocol}{ip_proxy_info.ip}:{ip_proxy_info.port}",
"username": ip_proxy_info.user,
@ -300,7 +308,14 @@ class BilibiliCrawler(AbstractCrawler):
user_agent: Optional[str],
headless: bool = True
) -> BrowserContext:
"""Launch browser and create browser context"""
"""
launch browser and create browser context
:param chromium: chromium browser
:param playwright_proxy: playwright proxy
:param user_agent: user agent
:param headless: headless mode
:return: browser context
"""
utils.logger.info(
"[BilibiliCrawler.launch_browser] Begin create browser context ...")
if config.SAVE_LOGIN_STATE:

View File

@ -220,7 +220,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
"""Get note detail"""
async with semaphore:
try:
note_detail: Dict = await self.xhs_client.get_note_by_id(note_id, xsec_source, xsec_token)
note_detail: Dict = await self.xhs_client.get_note_by_id_from_html(note_id)
if not note_detail:
utils.logger.error(
f"[XiaoHongShuCrawler.get_note_detail_async_task] Get note detail error, note_id: {note_id}")