fix: xhs note detail error
This commit is contained in:
parent
00b12ce482
commit
d6fb255bdf
|
@ -7,15 +7,29 @@ from playwright.async_api import BrowserContext, BrowserType
|
||||||
class AbstractCrawler(ABC):
|
class AbstractCrawler(ABC):
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
async def start(self):
|
async def start(self):
|
||||||
|
"""
|
||||||
|
start crawler
|
||||||
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
async def search(self):
|
async def search(self):
|
||||||
|
"""
|
||||||
|
search
|
||||||
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
async def launch_browser(self, chromium: BrowserType, playwright_proxy: Optional[Dict], user_agent: Optional[str],
|
async def launch_browser(self, chromium: BrowserType, playwright_proxy: Optional[Dict], user_agent: Optional[str],
|
||||||
headless: bool = True) -> BrowserContext:
|
headless: bool = True) -> BrowserContext:
|
||||||
|
"""
|
||||||
|
launch browser
|
||||||
|
:param chromium: chromium browser
|
||||||
|
:param playwright_proxy: playwright proxy
|
||||||
|
:param user_agent: user agent
|
||||||
|
:param headless: headless mode
|
||||||
|
:return: browser context
|
||||||
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
|
1
main.py
1
main.py
|
@ -45,6 +45,7 @@ async def main():
|
||||||
if config.SAVE_DATA_OPTION == "db":
|
if config.SAVE_DATA_OPTION == "db":
|
||||||
await db.close()
|
await db.close()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
try:
|
try:
|
||||||
|
|
|
@ -262,7 +262,11 @@ class BilibiliCrawler(AbstractCrawler):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
async def create_bilibili_client(self, httpx_proxy: Optional[str]) -> BilibiliClient:
|
async def create_bilibili_client(self, httpx_proxy: Optional[str]) -> BilibiliClient:
|
||||||
"""Create xhs client"""
|
"""
|
||||||
|
create bilibili client
|
||||||
|
:param httpx_proxy: httpx proxy
|
||||||
|
:return: bilibili client
|
||||||
|
"""
|
||||||
utils.logger.info(
|
utils.logger.info(
|
||||||
"[BilibiliCrawler.create_bilibili_client] Begin create bilibili API client ...")
|
"[BilibiliCrawler.create_bilibili_client] Begin create bilibili API client ...")
|
||||||
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
|
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
|
||||||
|
@ -282,7 +286,11 @@ class BilibiliCrawler(AbstractCrawler):
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]:
|
def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]:
|
||||||
"""format proxy info for playwright and httpx"""
|
"""
|
||||||
|
format proxy info for playwright and httpx
|
||||||
|
:param ip_proxy_info: ip proxy info
|
||||||
|
:return: playwright proxy, httpx proxy
|
||||||
|
"""
|
||||||
playwright_proxy = {
|
playwright_proxy = {
|
||||||
"server": f"{ip_proxy_info.protocol}{ip_proxy_info.ip}:{ip_proxy_info.port}",
|
"server": f"{ip_proxy_info.protocol}{ip_proxy_info.ip}:{ip_proxy_info.port}",
|
||||||
"username": ip_proxy_info.user,
|
"username": ip_proxy_info.user,
|
||||||
|
@ -300,7 +308,14 @@ class BilibiliCrawler(AbstractCrawler):
|
||||||
user_agent: Optional[str],
|
user_agent: Optional[str],
|
||||||
headless: bool = True
|
headless: bool = True
|
||||||
) -> BrowserContext:
|
) -> BrowserContext:
|
||||||
"""Launch browser and create browser context"""
|
"""
|
||||||
|
launch browser and create browser context
|
||||||
|
:param chromium: chromium browser
|
||||||
|
:param playwright_proxy: playwright proxy
|
||||||
|
:param user_agent: user agent
|
||||||
|
:param headless: headless mode
|
||||||
|
:return: browser context
|
||||||
|
"""
|
||||||
utils.logger.info(
|
utils.logger.info(
|
||||||
"[BilibiliCrawler.launch_browser] Begin create browser context ...")
|
"[BilibiliCrawler.launch_browser] Begin create browser context ...")
|
||||||
if config.SAVE_LOGIN_STATE:
|
if config.SAVE_LOGIN_STATE:
|
||||||
|
|
|
@ -220,7 +220,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||||
"""Get note detail"""
|
"""Get note detail"""
|
||||||
async with semaphore:
|
async with semaphore:
|
||||||
try:
|
try:
|
||||||
note_detail: Dict = await self.xhs_client.get_note_by_id(note_id, xsec_source, xsec_token)
|
note_detail: Dict = await self.xhs_client.get_note_by_id_from_html(note_id)
|
||||||
if not note_detail:
|
if not note_detail:
|
||||||
utils.logger.error(
|
utils.logger.error(
|
||||||
f"[XiaoHongShuCrawler.get_note_detail_async_task] Get note detail error, note_id: {note_id}")
|
f"[XiaoHongShuCrawler.get_note_detail_async_task] Get note detail error, note_id: {note_id}")
|
||||||
|
|
Loading…
Reference in New Issue