From 61ba8c5cc73f034d5a9b06aab74982092a1de47b Mon Sep 17 00:00:00 2001
From: "jayeeliu@gmail.com"
Date: Sat, 2 Mar 2024 01:49:42 +0800
Subject: [PATCH] feat: Xiaohongshu (XHS) supports crawling notes and comments
 by creator ID; when XHS type=search, the sort order used to fetch notes is
 configurable; XHS notes gain video URL and tag fields
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 base/base_crawler.py            |  6 +++
 config/base_config.py           |  9 +++++
 main.py                         |  4 +-
 media_platform/xhs/client.py    | 58 +++++++++++++++++++++++++++++
 media_platform/xhs/core.py      | 66 +++++++++++++++++++++++++++++++++
 store/xhs/__init__.py           | 38 +++++++++++++++++++
 store/xhs/xhs_store_db_types.py | 18 +++++++++
 store/xhs/xhs_store_impl.py     | 47 +++++++++++++++++++++++
 8 files changed, 244 insertions(+), 2 deletions(-)

diff --git a/base/base_crawler.py b/base/base_crawler.py
index a7ab64b..5a59a3d 100644
--- a/base/base_crawler.py
+++ b/base/base_crawler.py
@@ -49,3 +49,9 @@ class AbstractStore(ABC):
     @abstractmethod
     async def store_comment(self, comment_item: Dict):
         pass
+
+    # TODO: support all platforms
+    # only xhs is supported for now, so @abstractmethod is commented out
+    # @abstractmethod
+    async def store_creator(self, creator: Dict):
+        pass
diff --git a/config/base_config.py b/config/base_config.py
index df3dc79..af58561 100644
--- a/config/base_config.py
+++ b/config/base_config.py
@@ -3,6 +3,7 @@ PLATFORM = "xhs"
 KEYWORDS = "python,golang"
 LOGIN_TYPE = "qrcode"  # qrcode or phone or cookie
 COOKIES = ""
+SORT_TYPE = "popularity_descending"  # for valid values see the enums under media_platform.xxx.field; currently only supported for XHS
 CRAWLER_TYPE = "search"
 
 # whether to enable IP proxy
@@ -70,3 +71,11 @@ WEIBO_SPECIFIED_ID_LIST = [
     "4982041758140155",
     # ........................
 ]
+
+# list of specified XHS creator IDs
+XHS_CREATOR_ID_LIST = [
+    "59d8cb33de5fb4696bf17217",
+    "61b87386000000001000b18b",
+    "5e8558100000000001005bc5",
+    # ........................
+]
\ No newline at end of file
diff --git a/main.py b/main.py
index 73a1cfc..7c5902a 100644
--- a/main.py
+++ b/main.py
@@ -36,8 +36,8 @@ async def main():
                         choices=["xhs", "dy", "ks", "bili", "wb"], default=config.PLATFORM)
     parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)',
                         choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE)
-    parser.add_argument('--type', type=str, help='crawler type (search | detail)',
-                        choices=["search", "detail"], default=config.CRAWLER_TYPE)
+    parser.add_argument('--type', type=str, help='crawler type (search | detail | creator)',
+                        choices=["search", "detail", "creator"], default=config.CRAWLER_TYPE)
 
     # init db
     if config.SAVE_DATA_OPTION == "db":
diff --git a/media_platform/xhs/client.py b/media_platform/xhs/client.py
index 1ea48c1..219c35a 100644
--- a/media_platform/xhs/client.py
+++ b/media_platform/xhs/client.py
@@ -1,5 +1,6 @@
 import asyncio
 import json
+import re
 from typing import Callable, Dict, List, Optional
 from urllib.parse import urlencode
 
@@ -73,11 +74,18 @@ class XHSClient:
         Returns:
 
         """
+        # when return_response is True, return the raw response text instead of parsed JSON
+        return_response = kwargs.pop('return_response', False)
+
         async with httpx.AsyncClient(proxies=self.proxies) as client:
             response = await client.request(
                 method, url, timeout=self.timeout,
                 **kwargs
             )
+
+        if return_response:
+            return response.text
+
         data: Dict = response.json()
         if data["success"]:
             return data.get("data", data.get("success", {}))
@@ -178,6 +186,56 @@ class XHSClient:
         }
         return await self.post(uri, data)
 
+    async def get_creator_info_and_notes(self, creator: str) -> Dict:
+        """
+        Get the creator's profile info and the first page of their notes
+        Args:
+            creator: creator (blogger) ID
+        Returns:
+            {"creator":{}, "notes":[]}
+        """
+        path = '/user/profile/'+creator
+        content = await self.request(method="GET", url=f"https://www.xiaohongshu.com{path}", return_response=True)
+        match = re.search(r'