diff --git a/base/base_crawler.py b/base/base_crawler.py index a7ab64b..5a59a3d 100644 --- a/base/base_crawler.py +++ b/base/base_crawler.py @@ -49,3 +49,9 @@ class AbstractStore(ABC): @abstractmethod async def store_comment(self, comment_item: Dict): pass + + # TODO: support all platforms + # only xhs is supported for now, so @abstractmethod is commented out + # @abstractmethod + async def store_creator(self, creator: Dict): + pass diff --git a/config/base_config.py b/config/base_config.py index df3dc79..af58561 100644 --- a/config/base_config.py +++ b/config/base_config.py @@ -3,6 +3,7 @@ PLATFORM = "xhs" KEYWORDS = "python,golang" LOGIN_TYPE = "qrcode" # qrcode or phone or cookie COOKIES = "" +SORT_TYPE="popularity_descending" # 具体值参见media_platform.xxx.field下的枚举值,暂时只支持小红书 CRAWLER_TYPE = "search" # 是否开启 IP 代理 @@ -70,3 +71,11 @@ WEIBO_SPECIFIED_ID_LIST = [ "4982041758140155", # ........................ ] + +# 指定小红书创作者ID列表 +XHS_CREATOR_ID_LIST = [ + "59d8cb33de5fb4696bf17217", + "61b87386000000001000b18b", + "5e8558100000000001005bc5", + # ........................ 
+] \ No newline at end of file diff --git a/main.py b/main.py index 73a1cfc..7c5902a 100644 --- a/main.py +++ b/main.py @@ -36,8 +36,8 @@ async def main(): choices=["xhs", "dy", "ks", "bili", "wb"], default=config.PLATFORM) parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)', choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE) - parser.add_argument('--type', type=str, help='crawler type (search | detail)', - choices=["search", "detail"], default=config.CRAWLER_TYPE) + parser.add_argument('--type', type=str, help='crawler type (search | detail | creator)', + choices=["search", "detail", "creator"], default=config.CRAWLER_TYPE) # init db if config.SAVE_DATA_OPTION == "db": diff --git a/media_platform/xhs/client.py b/media_platform/xhs/client.py index 1ea48c1..219c35a 100644 --- a/media_platform/xhs/client.py +++ b/media_platform/xhs/client.py @@ -1,5 +1,6 @@ import asyncio import json +import re from typing import Callable, Dict, List, Optional from urllib.parse import urlencode @@ -73,11 +74,18 @@ class XHSClient: Returns: """ + # return response.text + return_response = kwargs.pop('return_response', False) + async with httpx.AsyncClient(proxies=self.proxies) as client: response = await client.request( method, url, timeout=self.timeout, **kwargs ) + + if return_response: + return response.text + data: Dict = response.json() if data["success"]: return data.get("data", data.get("success", {})) @@ -178,6 +186,56 @@ class XHSClient: } return await self.post(uri, data) + async def get_creator_info_and_notes(self, creator: str) -> Dict: + """ + 获取博主的信息和第一页的笔记 + Args: + creator: 博主ID + Returns: + {"creator":{}, "notes":[]} + """ + path = '/user/profile/'+creator + content = await self.request(method="GET", url=f"https://www.xiaohongshu.com{path}", return_response=True) + match = re.search(r'