fix: xhs note detail error

This commit is contained in:
Relakkes 2024-09-02 21:45:12 +08:00
parent 00b12ce482
commit d6fb255bdf
4 changed files with 34 additions and 4 deletions

View File

@ -7,15 +7,29 @@ from playwright.async_api import BrowserContext, BrowserType
class AbstractCrawler(ABC): class AbstractCrawler(ABC):
@abstractmethod @abstractmethod
async def start(self): async def start(self):
"""
start crawler
"""
pass pass
@abstractmethod @abstractmethod
async def search(self): async def search(self):
"""
search
"""
pass pass
@abstractmethod @abstractmethod
async def launch_browser(self, chromium: BrowserType, playwright_proxy: Optional[Dict], user_agent: Optional[str], async def launch_browser(self, chromium: BrowserType, playwright_proxy: Optional[Dict], user_agent: Optional[str],
headless: bool = True) -> BrowserContext: headless: bool = True) -> BrowserContext:
"""
launch browser
:param chromium: chromium browser
:param playwright_proxy: playwright proxy
:param user_agent: user agent
:param headless: headless mode
:return: browser context
"""
pass pass

View File

@ -45,6 +45,7 @@ async def main():
if config.SAVE_DATA_OPTION == "db": if config.SAVE_DATA_OPTION == "db":
await db.close() await db.close()
if __name__ == '__main__': if __name__ == '__main__':
try: try:

View File

@ -262,7 +262,11 @@ class BilibiliCrawler(AbstractCrawler):
return None return None
async def create_bilibili_client(self, httpx_proxy: Optional[str]) -> BilibiliClient: async def create_bilibili_client(self, httpx_proxy: Optional[str]) -> BilibiliClient:
"""Create xhs client""" """
create bilibili client
:param httpx_proxy: httpx proxy
:return: bilibili client
"""
utils.logger.info( utils.logger.info(
"[BilibiliCrawler.create_bilibili_client] Begin create bilibili API client ...") "[BilibiliCrawler.create_bilibili_client] Begin create bilibili API client ...")
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies()) cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
@ -282,7 +286,11 @@ class BilibiliCrawler(AbstractCrawler):
@staticmethod @staticmethod
def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]: def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]:
"""format proxy info for playwright and httpx""" """
format proxy info for playwright and httpx
:param ip_proxy_info: ip proxy info
:return: playwright proxy, httpx proxy
"""
playwright_proxy = { playwright_proxy = {
"server": f"{ip_proxy_info.protocol}{ip_proxy_info.ip}:{ip_proxy_info.port}", "server": f"{ip_proxy_info.protocol}{ip_proxy_info.ip}:{ip_proxy_info.port}",
"username": ip_proxy_info.user, "username": ip_proxy_info.user,
@ -300,7 +308,14 @@ class BilibiliCrawler(AbstractCrawler):
user_agent: Optional[str], user_agent: Optional[str],
headless: bool = True headless: bool = True
) -> BrowserContext: ) -> BrowserContext:
"""Launch browser and create browser context""" """
launch browser and create browser context
:param chromium: chromium browser
:param playwright_proxy: playwright proxy
:param user_agent: user agent
:param headless: headless mode
:return: browser context
"""
utils.logger.info( utils.logger.info(
"[BilibiliCrawler.launch_browser] Begin create browser context ...") "[BilibiliCrawler.launch_browser] Begin create browser context ...")
if config.SAVE_LOGIN_STATE: if config.SAVE_LOGIN_STATE:

View File

@ -220,7 +220,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
"""Get note detail""" """Get note detail"""
async with semaphore: async with semaphore:
try: try:
note_detail: Dict = await self.xhs_client.get_note_by_id(note_id, xsec_source, xsec_token) note_detail: Dict = await self.xhs_client.get_note_by_id_from_html(note_id)
if not note_detail: if not note_detail:
utils.logger.error( utils.logger.error(
f"[XiaoHongShuCrawler.get_note_detail_async_task] Get note detail error, note_id: {note_id}") f"[XiaoHongShuCrawler.get_note_detail_async_task] Get note detail error, note_id: {note_id}")