diff --git a/README.md b/README.md index 9ad0002..330f4c5 100644 --- a/README.md +++ b/README.md @@ -22,12 +22,11 @@ |-----|-------|----------|-----|--------|-------|-------|-------| | 小红书 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | 抖音 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| 快手 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | +| 快手 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | B 站 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | 微博 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | - ## 使用方法 ### 创建并激活 python 虚拟环境 diff --git a/config/base_config.py b/config/base_config.py index 9b52e52..4236572 100644 --- a/config/base_config.py +++ b/config/base_config.py @@ -103,6 +103,13 @@ BILI_CREATOR_ID_LIST = [ # ........................ ] +# 指定快手创作者ID列表 +KS_CREATOR_ID_LIST = [ + "3x4sm73aye7jq7i", + # ........................ +] + + #词云相关 #是否开启生成评论词云图 ENABLE_GET_WORDCLOUD = False @@ -118,5 +125,3 @@ STOP_WORDS_FILE = "./docs/hit_stopwords.txt" #中文字体文件路径 FONT_PATH= "./docs/STZHONGS.TTF" - - diff --git a/media_platform/kuaishou/client.py b/media_platform/kuaishou/client.py index 8ad2155..591a180 100644 --- a/media_platform/kuaishou/client.py +++ b/media_platform/kuaishou/client.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- import asyncio import json -from typing import Any, Callable, Dict, Optional +from typing import Any, Callable, Dict, List, Optional from urllib.parse import urlencode import httpx @@ -67,7 +67,7 @@ class KuaiShouClient(AbstractApiClient): "variables": { "ftype": 1, }, - "query": self.graphql.get("vision_profile") + "query": self.graphql.get("vision_profile_user_list") } res = await self.post("", post_data) if res.get("visionProfileUserList", {}).get("result") == 1: @@ -129,17 +129,60 @@ class KuaiShouClient(AbstractApiClient): "pcursor": pcursor }, "query": self.graphql.get("comment_list") - } return await self.post("", post_data) - async def get_video_all_comments(self, photo_id: str, crawl_interval: float = 1.0, is_fetch_sub_comments=False, - callback: Optional[Callable] = None): + async def get_video_sub_comments( + self, photo_id: str, rootCommentId: str, pcursor: str = "" + ) -> Dict: + """get video sub comments + :param photo_id: photo id you want to fetch + :param pcursor: last you get pcursor, defaults to "" + :return: + """ + post_data = { + "operationName": "visionSubCommentList", + "variables": { + "photoId": photo_id, + "pcursor": pcursor, + "rootCommentId": rootCommentId, + }, + "query": self.graphql.get("vision_sub_comment_list"), + } + return await self.post("", post_data) + + async def get_creator_profile(self, userId: str) -> Dict: + post_data = { + "operationName": "visionProfile", + "variables": { + "userId": userId + }, + "query": self.graphql.get("vision_profile"), + } + return await self.post("", post_data) + + async def get_video_by_creater(self, userId: str, pcursor: str = "") -> Dict: + post_data = { + "operationName": "visionProfilePhotoList", + "variables": { + "page": "profile", + "pcursor": pcursor, + "userId": userId + }, + "query": self.graphql.get("vision_profile_photo_list"), + } + return await self.post("", post_data) + + async def get_video_all_comments( + self, + photo_id: str, + crawl_interval: float = 1.0, + callback: Optional[Callable] = None, + ): """ get video all comments include sub comments :param photo_id: :param crawl_interval: - :param is_fetch_sub_comments: :param callback: :return: """ @@ -158,7 +201,106 @@ class KuaiShouClient(AbstractApiClient): result.extend(comments) await asyncio.sleep(crawl_interval) - if not is_fetch_sub_comments: - continue - # todo handle get sub comments + sub_comments = await self.get_comments_all_sub_comments( + comments, photo_id, crawl_interval, callback + ) + result.extend(sub_comments) + return result + + async def get_comments_all_sub_comments( + self, + comments: List[Dict], + photo_id, + crawl_interval: float = 1.0, + callback: Optional[Callable] = None, + ) -> List[Dict]: + """ + 获取指定一级评论下的所有二级评论, 该方法会一直查找一级评论下的所有二级评论信息 + Args: + comments: 评论列表 + photo_id: 视频id + crawl_interval: 爬取一次评论的延迟单位(秒) + callback: 一次评论爬取结束后 + Returns: + + """ + if not config.ENABLE_GET_SUB_COMMENTS: + utils.logger.info( + f"[KuaiShouClient.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled" + ) + return [] + + result = [] + for comment in comments: + sub_comments = comment.get("subComments") + if sub_comments and callback: + await callback(photo_id, sub_comments) + + sub_comment_pcursor = comment.get("subCommentsPcursor") + if sub_comment_pcursor == "no_more": + continue + + root_comment_id = comment.get("commentId") + sub_comment_pcursor = "" + + while sub_comment_pcursor != "no_more": + comments_res = await self.get_video_sub_comments( + photo_id, root_comment_id, sub_comment_pcursor + ) + sub_comment_pcursor = comments_res.get("pcursor", "no_more") + + comments = comments_res.get("subComments", []) + if callback: + await callback(photo_id, comments) + await asyncio.sleep(crawl_interval) + result.extend(comments) + return result + + async def get_creator_info(self, user_id: str) -> Dict: + """ + eg: https://www.kuaishou.com/profile/3x4jtnbfter525a + 快手用户主页 + """ + + visionProfile = await self.get_creator_profile(user_id) + return visionProfile.get("userProfile") + + async def get_all_videos_by_creator( + self, + user_id: str, + crawl_interval: float = 1.0, + callback: Optional[Callable] = None, + ) -> List[Dict]: + """ + 获取指定用户下的所有发过的帖子,该方法会一直查找一个用户下的所有帖子信息 + Args: + user_id: 用户ID + crawl_interval: 爬取一次的延迟单位(秒) + callback: 一次分页爬取结束后的更新回调函数 + Returns: + + """ + result = [] + pcursor = "" + + while pcursor != "no_more": + videos_res = await self.get_video_by_creater(user_id, pcursor) + if not videos_res: + utils.logger.error( + f"[KuaiShouClient.get_all_videos_by_creator] The current creator may have been banned by ks, so they cannot access the data." + ) + break + + vision_profile_photo_list = videos_res.get("visionProfilePhotoList", {}) + pcursor = vision_profile_photo_list.get("pcursor", "") + + videos = vision_profile_photo_list.get("feeds", []) + utils.logger.info( + f"[KuaiShouClient.get_all_videos_by_creator] got user_id:{user_id} videos len : {len(videos)}" + ) + + if callback: + await callback(videos) + await asyncio.sleep(crawl_interval) + result.extend(videos) return result diff --git a/media_platform/kuaishou/core.py b/media_platform/kuaishou/core.py index d318a9c..51f86e7 100644 --- a/media_platform/kuaishou/core.py +++ b/media_platform/kuaishou/core.py @@ -65,11 +65,14 @@ class KuaishouCrawler(AbstractCrawler): crawler_type_var.set(config.CRAWLER_TYPE) if config.CRAWLER_TYPE == "search": - # Search for notes and retrieve their comment information. + # Search for videos and retrieve their comment information. await self.search() elif config.CRAWLER_TYPE == "detail": # Get the information and comments of the specified post await self.get_specified_videos() + elif config.CRAWLER_TYPE == "creator": + # Get creator's information and their videos and comments + await self.get_creators_and_videos() else: pass @@ -135,7 +138,7 @@ class KuaishouCrawler(AbstractCrawler): utils.logger.error(f"[KuaishouCrawler.get_video_info_task] Get video detail error: {ex}") return None except KeyError as ex: - utils.logger.error(f"[KuaishouCrawler.get_video_info_task] have not fund note detail video_id:{video_id}, err: {ex}") + utils.logger.error(f"[KuaishouCrawler.get_video_info_task] have not fund video detail video_id:{video_id}, err: {ex}") return None async def batch_get_video_comments(self, video_id_list: List[str]): @@ -145,7 +148,7 @@ class KuaishouCrawler(AbstractCrawler): :return: """ if not config.ENABLE_GET_COMMENTS: - utils.logger.info(f"[KuaishouCrawler.batch_get_note_comments] Crawling comment mode is not enabled") + utils.logger.info(f"[KuaishouCrawler.batch_get_video_comments] Crawling comment mode is not enabled") return utils.logger.info(f"[KuaishouCrawler.batch_get_video_comments] video ids:{video_id_list}") @@ -200,10 +203,10 @@ class KuaishouCrawler(AbstractCrawler): return playwright_proxy, httpx_proxy async def create_ks_client(self, httpx_proxy: Optional[str]) -> KuaiShouClient: - """Create xhs client""" + """Create ks client""" utils.logger.info("[KuaishouCrawler.create_ks_client] Begin create kuaishou API client ...") cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies()) - xhs_client_obj = KuaiShouClient( + ks_client_obj = KuaiShouClient( proxies=httpx_proxy, headers={ "User-Agent": self.user_agent, @@ -215,7 +218,7 @@ class KuaishouCrawler(AbstractCrawler): playwright_page=self.context_page, cookie_dict=cookie_dict, ) - return xhs_client_obj + return ks_client_obj async def launch_browser( self, @@ -246,6 +249,39 @@ class KuaishouCrawler(AbstractCrawler): ) return browser_context + async def get_creators_and_videos(self) -> None: + """Get creator's videos and retrieve their comment information.""" + utils.logger.info("[KuaiShouCrawler.get_creators_and_videos] Begin get kuaishou creators") + for user_id in config.KS_CREATOR_ID_LIST: + # get creator detail info from web html content + createor_info: Dict = await self.ks_client.get_creator_info(user_id=user_id) + if createor_info: + await kuaishou_store.save_creator(user_id, creator=createor_info) + + # Get all video information of the creator + all_video_list = await self.ks_client.get_all_videos_by_creator( + user_id = user_id, + crawl_interval = random.random(), + callback = self.fetch_creator_video_detail + ) + + video_ids = [video_item.get("photo", {}).get("id") for video_item in all_video_list] + await self.batch_get_video_comments(video_ids) + + async def fetch_creator_video_detail(self, video_list: List[Dict]): + """ + Concurrently obtain the specified post list and save the data + """ + semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) + task_list = [ + self.get_video_info_task(post_item.get("photo", {}).get("id"), semaphore) for post_item in video_list + ] + + video_details = await asyncio.gather(*task_list) + for video_detail in video_details: + if video_detail is not None: + await kuaishou_store.update_kuaishou_video(video_detail) + async def close(self): """Close browser context""" await self.browser_context.close() diff --git a/media_platform/kuaishou/graphql.py b/media_platform/kuaishou/graphql.py index 215b57f..2d32689 100644 --- a/media_platform/kuaishou/graphql.py +++ b/media_platform/kuaishou/graphql.py @@ -11,7 +11,7 @@ class KuaiShouGraphQL: self.load_graphql_queries() def load_graphql_queries(self): - graphql_files = ["search_query.graphql", "video_detail.graphql", "comment_list.graphql", "vision_profile.graphql"] + graphql_files = ["search_query.graphql", "video_detail.graphql", "comment_list.graphql", "vision_profile.graphql","vision_profile_photo_list.graphql","vision_profile_user_list.graphql","vision_sub_comment_list.graphql"] for file in graphql_files: with open(self.graphql_dir + file, mode="r") as f: diff --git a/media_platform/kuaishou/graphql/vision_profile.graphql b/media_platform/kuaishou/graphql/vision_profile.graphql index 148165a..5499600 100644 --- a/media_platform/kuaishou/graphql/vision_profile.graphql +++ b/media_platform/kuaishou/graphql/vision_profile.graphql @@ -1,16 +1,27 @@ -query visionProfileUserList($pcursor: String, $ftype: Int) { - visionProfileUserList(pcursor: $pcursor, ftype: $ftype) { +query visionProfile($userId: String) { + visionProfile(userId: $userId) { result - fols { - user_name - headurl - user_text + hostName + userProfile { + ownerCount { + fan + photo + follow + photo_public + __typename + } + profile { + gender + user_name + user_id + headurl + user_text + user_profile_bg_url + __typename + } isFollowing - user_id __typename } - hostName - pcursor __typename } } diff --git a/media_platform/kuaishou/graphql/vision_profile_photo_list.graphql b/media_platform/kuaishou/graphql/vision_profile_photo_list.graphql new file mode 100644 index 0000000..328052e --- /dev/null +++ b/media_platform/kuaishou/graphql/vision_profile_photo_list.graphql @@ -0,0 +1,110 @@ +fragment photoContent on PhotoEntity { + __typename + id + duration + caption + originCaption + likeCount + viewCount + commentCount + realLikeCount + coverUrl + photoUrl + photoH265Url + manifest + manifestH265 + videoResource + coverUrls { + url + __typename + } + timestamp + expTag + animatedCoverUrl + distance + videoRatio + liked + stereoType + profileUserTopPhoto + musicBlocked + riskTagContent + riskTagUrl +} + +fragment recoPhotoFragment on recoPhotoEntity { + __typename + id + duration + caption + originCaption + likeCount + viewCount + commentCount + realLikeCount + coverUrl + photoUrl + photoH265Url + manifest + manifestH265 + videoResource + coverUrls { + url + __typename + } + timestamp + expTag + animatedCoverUrl + distance + videoRatio + liked + stereoType + profileUserTopPhoto + musicBlocked + riskTagContent + riskTagUrl +} + +fragment feedContent on Feed { + type + author { + id + name + headerUrl + following + headerUrls { + url + __typename + } + __typename + } + photo { + ...photoContent + ...recoPhotoFragment + __typename + } + canAddComment + llsid + status + currentPcursor + tags { + type + name + __typename + } + __typename +} + +query visionProfilePhotoList($pcursor: String, $userId: String, $page: String, $webPageArea: String) { + visionProfilePhotoList(pcursor: $pcursor, userId: $userId, page: $page, webPageArea: $webPageArea) { + result + llsid + webPageArea + feeds { + ...feedContent + __typename + } + hostName + pcursor + __typename + } +} diff --git a/media_platform/kuaishou/graphql/vision_profile_user_list.graphql b/media_platform/kuaishou/graphql/vision_profile_user_list.graphql new file mode 100644 index 0000000..148165a --- /dev/null +++ b/media_platform/kuaishou/graphql/vision_profile_user_list.graphql @@ -0,0 +1,16 @@ +query visionProfileUserList($pcursor: String, $ftype: Int) { + visionProfileUserList(pcursor: $pcursor, ftype: $ftype) { + result + fols { + user_name + headurl + user_text + isFollowing + user_id + __typename + } + hostName + pcursor + __typename + } +} diff --git a/media_platform/kuaishou/graphql/vision_sub_comment_list.graphql b/media_platform/kuaishou/graphql/vision_sub_comment_list.graphql new file mode 100644 index 0000000..31730fc --- /dev/null +++ b/media_platform/kuaishou/graphql/vision_sub_comment_list.graphql @@ -0,0 +1,22 @@ +mutation visionSubCommentList($photoId: String, $rootCommentId: String, $pcursor: String) { + visionSubCommentList(photoId: $photoId, rootCommentId: $rootCommentId, pcursor: $pcursor) { + pcursor + subComments { + commentId + authorId + authorName + content + headurl + timestamp + likedCount + realLikedCount + liked + status + authorLiked + replyToUserName + replyTo + __typename + } + __typename + } +} diff --git a/media_platform/kuaishou/login.py b/media_platform/kuaishou/login.py index cbd578b..be2ce8d 100644 --- a/media_platform/kuaishou/login.py +++ b/media_platform/kuaishou/login.py @@ -8,6 +8,7 @@ from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt, wait_fixed) from base.base_crawler import AbstractLogin +import config from tools import utils @@ -57,7 +58,7 @@ class KuaishouLogin(AbstractLogin): # click login button login_button_ele = self.context_page.locator( - "xpath=//p[text()=' 登录 ']" + "xpath=//p[text()='登录']" ) await login_button_ele.click() diff --git a/store/kuaishou/__init__.py b/store/kuaishou/__init__.py index 818c75a..cfdcd29 100644 --- a/store/kuaishou/__init__.py +++ b/store/kuaishou/__init__.py @@ -76,3 +76,22 @@ async def update_ks_video_comment(video_id: str, comment_item: Dict): utils.logger.info( f"[store.kuaishou.update_ks_video_comment] Kuaishou video comment: {comment_id}, content: {save_comment_item.get('content')}") await KuaishouStoreFactory.create_store().store_comment(comment_item=save_comment_item) + +async def save_creator(user_id: str, creator: Dict): + ownerCount = creator.get('ownerCount', {}) + profile = creator.get('profile', {}) + + local_db_item = { + 'user_id': user_id, + 'nickname': profile.get('user_name'), + 'gender': '女' if profile.get('gender') == "F" else '男', + 'avatar': profile.get('headurl'), + 'desc': profile.get('user_text'), + 'ip_location': "", + 'follows': ownerCount.get("follow"), + 'fans': ownerCount.get("fan"), + 'interaction': ownerCount.get("photo_public"), + "last_modify_ts": utils.get_current_timestamp(), + } + utils.logger.info(f"[store.kuaishou.save_creator] creator:{local_db_item}") + await KuaishouStoreFactory.create_store().store_creator(local_db_item) \ No newline at end of file diff --git a/store/kuaishou/kuaishou_store_impl.py b/store/kuaishou/kuaishou_store_impl.py index 4883daa..523f6f0 100644 --- a/store/kuaishou/kuaishou_store_impl.py +++ b/store/kuaishou/kuaishou_store_impl.py @@ -205,3 +205,14 @@ class KuaishouJsonStoreImplement(AbstractStore): """ await self.save_data_to_json(comment_item, "comments") + + async def store_creator(self, creator: Dict): + """ + Kuaishou content JSON storage implementation + Args: + creator: creator dict + + Returns: + + """ + await self.save_data_to_json(creator, "creator") \ No newline at end of file