diff --git a/config/base_config.py b/config/base_config.py index cefc711..690dbab 100644 --- a/config/base_config.py +++ b/config/base_config.py @@ -55,9 +55,6 @@ ENABLE_GET_SUB_COMMENTS = False # 指定小红书需要爬虫的笔记ID列表 XHS_SPECIFIED_ID_LIST = [ "6422c2750000000027000d88", - "64ca1b73000000000b028dd2", - "630d5b85000000001203ab41", - "668fe13000000000030241fa", # 图文混合 # ........................ ] diff --git a/media_platform/xhs/client.py b/media_platform/xhs/client.py index 62afdcf..2b1379d 100644 --- a/media_platform/xhs/client.py +++ b/media_platform/xhs/client.py @@ -6,6 +6,7 @@ from urllib.parse import urlencode import httpx from playwright.async_api import BrowserContext, Page +from tenacity import retry, stop_after_attempt, wait_fixed import config from base.base_crawler import AbstractApiClient @@ -66,6 +67,7 @@ class XiaoHongShuClient(AbstractApiClient): self.headers.update(headers) return self.headers + @retry(stop=stop_after_attempt(3), wait=wait_fixed(1)) async def request(self, method, url, **kwargs) -> Union[str, Any]: """ 封装httpx的公共请求方法,对请求响应做一些处理 @@ -88,7 +90,6 @@ class XiaoHongShuClient(AbstractApiClient): if return_response: return response.text - data: Dict = response.json() if data["success"]: return data.get("data", data.get("success", {})) @@ -114,7 +115,7 @@ class XiaoHongShuClient(AbstractApiClient): headers = await self._pre_headers(final_uri) return await self.request(method="GET", url=f"{self._host}{final_uri}", headers=headers) - async def post(self, uri: str, data: dict) -> Dict: + async def post(self, uri: str, data: dict, **kwargs) -> Dict: """ POST请求,对请求头签名 Args: @@ -127,7 +128,7 @@ class XiaoHongShuClient(AbstractApiClient): headers = await self._pre_headers(uri, data) json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False) return await self.request(method="POST", url=f"{self._host}{uri}", - data=json_str, headers=headers) + data=json_str, headers=headers, **kwargs) async def get_note_media(self, url: str) -> Union[bytes, None]: async with httpx.AsyncClient(proxies=self.proxies) as client: @@ -425,3 +426,60 @@ class XiaoHongShuClient(AbstractApiClient): await asyncio.sleep(crawl_interval) result.extend(notes) return result + + async def get_note_short_url(self, note_id: str) -> Dict: + """ + 获取笔记的短链接 + Args: + note_id: 笔记ID + + Returns: + + """ + uri = f"/api/sns/web/short_url" + data = { + "original_url": f"{self._domain}/discovery/item/{note_id}" + } + return await self.post(uri, data=data, return_response=True) + + async def get_note_by_id_from_html(self, note_id: str): + """ + 通过解析网页版的笔记详情页HTML,获取笔记详情 + copy from https://github.com/ReaJason/xhs/blob/eb1c5a0213f6fbb592f0a2897ee552847c69ea2d/xhs/core.py#L217-L259 + thanks for ReaJason + Args: + note_id: + + Returns: + + """ + def camel_to_underscore(key): + return re.sub(r"(?", html)[0].replace("undefined", '""') + if state != "{}": + note_dict = transform_json_keys(state) + return note_dict["note"]["note_detail_map"][note_id]["note"] + raise DataFetchError(html) diff --git a/media_platform/xhs/core.py b/media_platform/xhs/core.py index cd51d9a..3c39938 100644 --- a/media_platform/xhs/core.py +++ b/media_platform/xhs/core.py @@ -116,7 +116,7 @@ class XiaoHongShuCrawler(AbstractCrawler): break semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) task_list = [ - self.get_note_detail( + self.get_note_detail_async_task( note_id=post_item.get("id"), xsec_source=post_item.get("xsec_source"), xsec_token=post_item.get("xsec_token"), @@ -163,7 +163,7 @@ class XiaoHongShuCrawler(AbstractCrawler): """ semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) task_list = [ - self.get_note_detail( + self.get_note_detail_async_task( note_id=post_item.get("note_id"), xsec_source=post_item.get("xsec_source"), xsec_token=post_item.get("xsec_token"), @@ -179,20 +179,41 @@ class XiaoHongShuCrawler(AbstractCrawler): async def get_specified_notes(self): """Get the information and comments of the specified post""" - semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) - fixed_xsec_token = "ABtXiOIX98byLlu-ju5dDq3tIc6uikcJrd3t7OYyqUbE4" - task_list = [ - self.get_note_detail(note_id=note_id, xsec_source="pc_search", xsec_token=fixed_xsec_token, - semaphore=semaphore) for note_id in config.XHS_SPECIFIED_ID_LIST - ] - note_details = await asyncio.gather(*task_list) - for note_detail in note_details: - if note_detail is not None: - await xhs_store.update_xhs_note(note_detail) - await self.get_notice_media(note_detail) - await self.batch_get_note_comments(config.XHS_SPECIFIED_ID_LIST) - async def get_note_detail(self, note_id: str, xsec_source: str, xsec_token: str, semaphore: asyncio.Semaphore) -> \ + async def get_note_detail_from_html_task(note_id: str, semaphore: asyncio.Semaphore) -> Dict: + async with semaphore: + try: + _note_detail: Dict = await self.xhs_client.get_note_by_id_from_html(note_id) + print("------------------------") + print(_note_detail) + print("------------------------") + if not _note_detail: + utils.logger.error( + f"[XiaoHongShuCrawler.get_note_detail_from_html] Get note detail error, note_id: {note_id}") + return {} + return _note_detail + except DataFetchError as ex: + utils.logger.error(f"[XiaoHongShuCrawler.get_note_detail_from_html] Get note detail error: {ex}") + return {} + except KeyError as ex: + utils.logger.error( + f"[XiaoHongShuCrawler.get_note_detail_from_html] have not fund note detail note_id:{note_id}, err: {ex}") + return {} + + get_note_detail_task_list = [ + get_note_detail_from_html_task(note_id=note_id, semaphore=asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)) for + note_id in config.XHS_SPECIFIED_ID_LIST + ] + + need_get_comment_note_ids = [] + note_details = await asyncio.gather(*get_note_detail_task_list) + for note_detail in note_details: + if note_detail: + need_get_comment_note_ids.append(note_detail.get("note_id")) + await xhs_store.update_xhs_note(note_detail) + await self.batch_get_note_comments(need_get_comment_note_ids) + + async def get_note_detail_async_task(self, note_id: str, xsec_source: str, xsec_token: str, semaphore: asyncio.Semaphore) -> \ Optional[Dict]: """Get note detail""" async with semaphore: @@ -200,16 +221,16 @@ class XiaoHongShuCrawler(AbstractCrawler): note_detail: Dict = await self.xhs_client.get_note_by_id(note_id, xsec_source, xsec_token) if not note_detail: utils.logger.error( - f"[XiaoHongShuCrawler.get_note_detail] Get note detail error, note_id: {note_id}") + f"[XiaoHongShuCrawler.get_note_detail_async_task] Get note detail error, note_id: {note_id}") return None note_detail.update({"xsec_token": xsec_token, "xsec_source": xsec_source}) return note_detail except DataFetchError as ex: - utils.logger.error(f"[XiaoHongShuCrawler.get_note_detail] Get note detail error: {ex}") + utils.logger.error(f"[XiaoHongShuCrawler.get_note_detail_async_task] Get note detail error: {ex}") return None except KeyError as ex: utils.logger.error( - f"[XiaoHongShuCrawler.get_note_detail] have not fund note detail note_id:{note_id}, err: {ex}") + f"[XiaoHongShuCrawler.get_note_detail_async_task] have not fund note detail note_id:{note_id}, err: {ex}") return None async def batch_get_note_comments(self, note_list: List[str]):