diff --git a/media_platform/kuaishou/client.py b/media_platform/kuaishou/client.py index 9346fd4..9cfab0a 100644 --- a/media_platform/kuaishou/client.py +++ b/media_platform/kuaishou/client.py @@ -9,10 +9,11 @@ from playwright.async_api import BrowserContext, Page from tools import utils +from .graphql import KuaiShouGraphQL from .exception import DataFetchError, IPBlockError -class KuaishouClient: +class KuaiShouClient: def __init__( self, timeout=10, @@ -25,12 +26,17 @@ class KuaishouClient: self.proxies = proxies self.timeout = timeout self.headers = headers - self._host = "https://www.kuaishou.com" + self._host = "https://www.kuaishou.com/graphql" self.playwright_page = playwright_page self.cookie_dict = cookie_dict + self.graphql = KuaiShouGraphQL() async def _pre_headers(self, url: str, data=None): - pass + self.headers = { + "cookie":"kpf=PC_WEB; clientid=3; did=web_6e79b79fdeac627f5cf52f08cab4e6bd; kpn=KUAISHOU_VISION", + "content-type":"application/json" + } + return self.headers async def request(self, method, url, **kwargs) -> Dict: async with httpx.AsyncClient(proxies=self.proxies) as client: @@ -39,10 +45,10 @@ class KuaishouClient: **kwargs ) data: Dict = response.json() - if data["success"]: - return data.get("data", data.get("success", {})) + if data.get("errors"): + raise DataFetchError(data.get("errors", "unknown error")) else: - raise DataFetchError(data.get("msg", None)) + return data.get("data", {}) async def get(self, uri: str, params=None) -> Dict: final_uri = uri @@ -53,10 +59,9 @@ class KuaishouClient: return await self.request(method="GET", url=f"{self._host}{final_uri}", headers=headers) async def post(self, uri: str, data: dict) -> Dict: - headers = await self._pre_headers(uri, data) json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False) return await self.request(method="POST", url=f"{self._host}{uri}", - data=json_str, headers=headers) + data=json_str, headers=self.headers) async def pong(self) -> bool: """get a note to 
check if login state is ok""" @@ -72,4 +77,26 @@ class KuaishouClient: async def update_cookies(self, browser_context: BrowserContext): cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies()) self.headers["Cookie"] = cookie_str - self.cookie_dict = cookie_dict \ No newline at end of file + self.cookie_dict = cookie_dict + + async def search_info_by_keyword(self, keyword: str, pcursor: str): + """ + KuaiShou Web Search API + :param keyword: search keyword + :param pcursor: limit page cursor + :return: + """ + params = { + "operationName": "visionSearchPhoto", + "variables": { + "keyword": keyword, + "pcursor": pcursor, + "page": "search" + }, + "query": self.graphql.get("search_query") + } + return await self.post("", params) + + + async def get_video_info(self, video_id: str) -> Dict: + pass \ No newline at end of file diff --git a/media_platform/kuaishou/core.py b/media_platform/kuaishou/core.py index 67d60a6..b7afbbc 100644 --- a/media_platform/kuaishou/core.py +++ b/media_platform/kuaishou/core.py @@ -11,8 +11,9 @@ from base.proxy_account_pool import AccountPool from tools import utils from var import crawler_type_var -from .client import KuaishouClient +from .client import KuaiShouClient from .login import KuaishouLogin +from .exception import DataFetchError class KuaishouCrawler(AbstractCrawler): @@ -20,7 +21,7 @@ class KuaishouCrawler(AbstractCrawler): login_type: str crawler_type: str context_page: Page - ks_client: KuaishouClient + ks_client: KuaiShouClient account_pool: AccountPool browser_context: BrowserContext @@ -76,12 +77,58 @@ class KuaishouCrawler(AbstractCrawler): utils.logger.info("Kuaishou Crawler finished ...") async def search(self): - await asyncio.Event().wait() + utils.logger.info("Begin search kuaishou keywords") + ks_limit_count = 20 # kuaishou limit page fixed value + for keyword in config.KEYWORDS.split(","): + utils.logger.info(f"Current search keyword: {keyword}") + page = 1 + while page * ks_limit_count <= 
config.CRAWLER_MAX_NOTES_COUNT: + video_id_list: List[str] = [] + videos_res = await self.ks_client.search_info_by_keyword( + keyword=keyword, + pcursor=str(page), + ) + if not videos_res: + utils.logger.error(f"search info by keyword:{keyword} not found data") + break + vision_search_photo = videos_res.get("visionSearchPhoto") + utils.logger.info(f"videos_res:{videos_res}") + semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) + task_list = [ + self.get_video_detail(feed_item.get("photo", {}).get("id"), semaphore) + for feed_item in vision_search_photo.get("feeds", {}) + ] + video_details = await asyncio.gather(*task_list) + for video_detail in video_details: + if video_detail is not None: + video_id_list.append(video_detail.get("id")) + page += 1 + utils.logger.info(f"Video details: {video_details}") + await self.batch_get_note_comments(video_id_list) + await asyncio.Event().wait() async def get_specified_notes(self): pass + async def batch_get_note_comments(self, video_id_list: List[str]): + pass + + async def get_video_detail(self, ): + pass + + async def get_video_detail(self, video_id: str, semaphore: asyncio.Semaphore) -> Optional[Dict]: + """Get note detail""" + async with semaphore: + try: + return await self.ks_client.get_video_info(video_id) + except DataFetchError as ex: + utils.logger.error(f"Get note detail error: {ex}") + return None + except KeyError as ex: + utils.logger.error(f"have not found note detail video_id:{video_id}, err: {ex}") + return None + def create_proxy_info(self) -> Tuple[Optional[str], Optional[Dict], Optional[str]]: """Create proxy info for playwright and httpx""" # phone: 13012345671 ip_proxy: 111.122.xx.xx1:8888 @@ -97,11 +144,11 @@ class KuaishouCrawler(AbstractCrawler): httpx_proxy = f"{config.IP_PROXY_PROTOCOL}{config.IP_PROXY_USER}:{config.IP_PROXY_PASSWORD}@{ip_proxy}" return phone, playwright_proxy, httpx_proxy - async def create_ks_client(self, httpx_proxy: Optional[str]) -> KuaishouClient: + async def 
create_ks_client(self, httpx_proxy: Optional[str]) -> KuaiShouClient: """Create xhs client""" utils.logger.info("Begin create kuaishou API client ...") cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies()) - xhs_client_obj = KuaishouClient( + xhs_client_obj = KuaiShouClient( proxies=httpx_proxy, headers={ "User-Agent": self.user_agent, @@ -147,4 +194,4 @@ class KuaishouCrawler(AbstractCrawler): async def close(self): """Close browser context""" await self.browser_context.close() - utils.logger.info("Browser context closed ...") \ No newline at end of file + utils.logger.info("Browser context closed ...") diff --git a/media_platform/kuaishou/graphql.py b/media_platform/kuaishou/graphql.py new file mode 100644 index 0000000..c73e18d --- /dev/null +++ b/media_platform/kuaishou/graphql.py @@ -0,0 +1,22 @@ +# 快手的数据传输是基于GraphQL实现的 +# 这个类负责获取一些GraphQL的schema +import os + + +class KuaiShouGraphQL: + graphql_queries = {} + + def __init__(self): + self.graphql_dir = "media_platform/kuaishou/graphql/" + self.load_graphql_queries() + + def load_graphql_queries(self): + graphql_files = ["search_query.graphql", "video_detail.graphql", "comment_list.graphql"] + + for file in graphql_files: + with open(self.graphql_dir + file, mode="r") as f: + query_name = file.split(".")[0] + self.graphql_queries[query_name] = f.read() + + def get(self, query_name: str) -> str: + return self.graphql_queries.get(query_name, "Query not found") diff --git a/media_platform/kuaishou/graphql/comment_list.graphql b/media_platform/kuaishou/graphql/comment_list.graphql new file mode 100644 index 0000000..e69de29 diff --git a/media_platform/kuaishou/graphql/search_query.graphql b/media_platform/kuaishou/graphql/search_query.graphql new file mode 100644 index 0000000..cc3bd8f --- /dev/null +++ b/media_platform/kuaishou/graphql/search_query.graphql @@ -0,0 +1,111 @@ +fragment photoContent on PhotoEntity { + __typename + id + duration + caption + originCaption + likeCount + 
viewCount + commentCount + realLikeCount + coverUrl + photoUrl + photoH265Url + manifest + manifestH265 + videoResource + coverUrls { + url + __typename + } + timestamp + expTag + animatedCoverUrl + distance + videoRatio + liked + stereoType + profileUserTopPhoto + musicBlocked +} + +fragment recoPhotoFragment on recoPhotoEntity { + __typename + id + duration + caption + originCaption + likeCount + viewCount + commentCount + realLikeCount + coverUrl + photoUrl + photoH265Url + manifest + manifestH265 + videoResource + coverUrls { + url + __typename + } + timestamp + expTag + animatedCoverUrl + distance + videoRatio + liked + stereoType + profileUserTopPhoto + musicBlocked +} + +fragment feedContent on Feed { + type + author { + id + name + headerUrl + following + headerUrls { + url + __typename + } + __typename + } + photo { + ...photoContent + ...recoPhotoFragment + __typename + } + canAddComment + llsid + status + currentPcursor + tags { + type + name + __typename + } + __typename +} + +query visionSearchPhoto($keyword: String, $pcursor: String, $searchSessionId: String, $page: String, $webPageArea: String) { + visionSearchPhoto(keyword: $keyword, pcursor: $pcursor, searchSessionId: $searchSessionId, page: $page, webPageArea: $webPageArea) { + result + llsid + webPageArea + feeds { + ...feedContent + __typename + } + searchSessionId + pcursor + aladdinBanner { + imgUrl + link + __typename + } + __typename + } +} diff --git a/media_platform/kuaishou/graphql/video_detail.graphql b/media_platform/kuaishou/graphql/video_detail.graphql new file mode 100644 index 0000000..e69de29