Merge branch 'kuaishou'

This commit is contained in:
HIRO 2024-06-13 14:54:01 +08:00
commit fd7407cc29
12 changed files with 402 additions and 30 deletions

View File

@ -22,12 +22,11 @@
|-----|-------|----------|-----|--------|-------|-------|-------|
| 小红书 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| 抖音 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| 快手 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ |
| 快手 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| B 站 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| 微博 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ |
## 使用方法
### 创建并激活 python 虚拟环境

View File

@ -103,6 +103,13 @@ BILI_CREATOR_ID_LIST = [
# ........................
]
# List of Kuaishou creator IDs to crawl (profile-page user IDs)
KS_CREATOR_ID_LIST = [
    "3x4sm73aye7jq7i",
    # ........................
]

# Word-cloud settings
# Whether to enable generating a word-cloud image from comments
ENABLE_GET_WORDCLOUD = False
@ -118,5 +125,3 @@ STOP_WORDS_FILE = "./docs/hit_stopwords.txt"
#中文字体文件路径
FONT_PATH= "./docs/STZHONGS.TTF"

View File

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
import asyncio
import json
from typing import Any, Callable, Dict, Optional
from typing import Any, Callable, Dict, List, Optional
from urllib.parse import urlencode
import httpx
@ -67,7 +67,7 @@ class KuaiShouClient(AbstractApiClient):
"variables": {
"ftype": 1,
},
"query": self.graphql.get("vision_profile")
"query": self.graphql.get("vision_profile_user_list")
}
res = await self.post("", post_data)
if res.get("visionProfileUserList", {}).get("result") == 1:
@ -129,17 +129,60 @@ class KuaiShouClient(AbstractApiClient):
"pcursor": pcursor
},
"query": self.graphql.get("comment_list")
}
return await self.post("", post_data)
async def get_video_all_comments(self, photo_id: str, crawl_interval: float = 1.0, is_fetch_sub_comments=False,
callback: Optional[Callable] = None):
async def get_video_sub_comments(
    self, photo_id: str, rootCommentId: str, pcursor: str = ""
) -> Dict:
    """get video sub comments

    :param photo_id: photo id you want to fetch
    :param rootCommentId: id of the first-level comment whose replies are fetched
    :param pcursor: last you get pcursor, defaults to ""
    :return: raw response dict of the visionSubCommentList GraphQL call
    """
    post_data = {
        "operationName": "visionSubCommentList",
        "variables": {
            "photoId": photo_id,
            "pcursor": pcursor,
            "rootCommentId": rootCommentId,
        },
        "query": self.graphql.get("vision_sub_comment_list"),
    }
    return await self.post("", post_data)
async def get_creator_profile(self, userId: str) -> Dict:
post_data = {
"operationName": "visionProfile",
"variables": {
"userId": userId
},
"query": self.graphql.get("vision_profile"),
}
return await self.post("", post_data)
async def get_video_by_creater(self, userId: str, pcursor: str = "") -> Dict:
    """Fetch one page of a creator's videos (visionProfilePhotoList operation).

    :param userId: creator user id
    :param pcursor: paging cursor from the previous page, "" for the first page
    :return: raw response dict
    """
    payload = {
        "operationName": "visionProfilePhotoList",
        "variables": {"page": "profile", "pcursor": pcursor, "userId": userId},
        "query": self.graphql.get("vision_profile_photo_list"),
    }
    return await self.post("", payload)
async def get_video_all_comments(
self,
photo_id: str,
crawl_interval: float = 1.0,
callback: Optional[Callable] = None,
):
"""
get video all comments include sub comments
:param photo_id:
:param crawl_interval:
:param is_fetch_sub_comments:
:param callback:
:return:
"""
@ -158,7 +201,106 @@ class KuaiShouClient(AbstractApiClient):
result.extend(comments)
await asyncio.sleep(crawl_interval)
if not is_fetch_sub_comments:
continue
# todo handle get sub comments
sub_comments = await self.get_comments_all_sub_comments(
comments, photo_id, crawl_interval, callback
)
result.extend(sub_comments)
return result
async def get_comments_all_sub_comments(
    self,
    comments: List[Dict],
    photo_id,
    crawl_interval: float = 1.0,
    callback: Optional[Callable] = None,
) -> List[Dict]:
    """
    Fetch all second-level (reply) comments under the given first-level comments,
    paging each comment's replies until the server reports "no_more".
    Args:
        comments: list of first-level comment dicts
        photo_id: video id the comments belong to
        crawl_interval: delay (seconds) between reply-page fetches
        callback: awaited with (photo_id, sub_comments) after each batch
    Returns:
        flat list of all sub-comment dicts fetched
    """
    if not config.ENABLE_GET_SUB_COMMENTS:
        utils.logger.info(
            f"[KuaiShouClient.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled"
        )
        return []

    result = []
    for comment in comments:
        # Replies already embedded in the first-level comment payload.
        sub_comments = comment.get("subComments")
        if sub_comments and callback:
            await callback(photo_id, sub_comments)

        sub_comment_pcursor = comment.get("subCommentsPcursor")
        if sub_comment_pcursor == "no_more":
            continue

        root_comment_id = comment.get("commentId")
        sub_comment_pcursor = ""

        while sub_comment_pcursor != "no_more":
            comments_res = await self.get_video_sub_comments(
                photo_id, root_comment_id, sub_comment_pcursor
            )
            sub_comment_pcursor = comments_res.get("pcursor", "no_more")
            # Distinct name: the original rebound `comments` here, shadowing the
            # parameter being iterated by the outer for-loop.
            page_sub_comments = comments_res.get("subComments", [])
            if callback:
                await callback(photo_id, page_sub_comments)
            await asyncio.sleep(crawl_interval)
            result.extend(page_sub_comments)
    return result
async def get_creator_info(self, user_id: str) -> Dict:
    """Return the `userProfile` section of a creator's profile.

    Creator home page, e.g. https://www.kuaishou.com/profile/3x4jtnbfter525a
    """
    profile_res = await self.get_creator_profile(user_id)
    return profile_res.get("userProfile")
async def get_all_videos_by_creator(
    self,
    user_id: str,
    crawl_interval: float = 1.0,
    callback: Optional[Callable] = None,
) -> List[Dict]:
    """
    Fetch every video posted by the given creator, following pagination
    until the server reports "no_more".
    Args:
        user_id: creator user ID
        crawl_interval: delay (seconds) between page fetches
        callback: awaited with each page of videos after it is fetched
    Returns:
        list of all video feed items
    """
    result = []
    pcursor = ""
    while pcursor != "no_more":
        videos_res = await self.get_video_by_creater(user_id, pcursor)
        if not videos_res:
            utils.logger.error(
                f"[KuaiShouClient.get_all_videos_by_creator] The current creator may have been banned by ks, so they cannot access the data."
            )
            break

        vision_profile_photo_list = videos_res.get("visionProfilePhotoList", {})
        # Default to "no_more" so a response missing the cursor terminates the
        # loop; the previous default "" re-fetched the first page indefinitely.
        pcursor = vision_profile_photo_list.get("pcursor", "no_more")
        videos = vision_profile_photo_list.get("feeds", [])
        utils.logger.info(
            f"[KuaiShouClient.get_all_videos_by_creator] got user_id:{user_id} videos len : {len(videos)}"
        )

        if callback:
            await callback(videos)
        await asyncio.sleep(crawl_interval)
        result.extend(videos)
    return result

View File

@ -65,11 +65,14 @@ class KuaishouCrawler(AbstractCrawler):
crawler_type_var.set(config.CRAWLER_TYPE)
if config.CRAWLER_TYPE == "search":
# Search for notes and retrieve their comment information.
# Search for videos and retrieve their comment information.
await self.search()
elif config.CRAWLER_TYPE == "detail":
# Get the information and comments of the specified post
await self.get_specified_videos()
elif config.CRAWLER_TYPE == "creator":
# Get creator's information and their videos and comments
await self.get_creators_and_videos()
else:
pass
@ -135,7 +138,7 @@ class KuaishouCrawler(AbstractCrawler):
utils.logger.error(f"[KuaishouCrawler.get_video_info_task] Get video detail error: {ex}")
return None
except KeyError as ex:
utils.logger.error(f"[KuaishouCrawler.get_video_info_task] have not fund note detail video_id:{video_id}, err: {ex}")
utils.logger.error(f"[KuaishouCrawler.get_video_info_task] have not fund video detail video_id:{video_id}, err: {ex}")
return None
async def batch_get_video_comments(self, video_id_list: List[str]):
@ -145,7 +148,7 @@ class KuaishouCrawler(AbstractCrawler):
:return:
"""
if not config.ENABLE_GET_COMMENTS:
utils.logger.info(f"[KuaishouCrawler.batch_get_note_comments] Crawling comment mode is not enabled")
utils.logger.info(f"[KuaishouCrawler.batch_get_video_comments] Crawling comment mode is not enabled")
return
utils.logger.info(f"[KuaishouCrawler.batch_get_video_comments] video ids:{video_id_list}")
@ -200,10 +203,10 @@ class KuaishouCrawler(AbstractCrawler):
return playwright_proxy, httpx_proxy
async def create_ks_client(self, httpx_proxy: Optional[str]) -> KuaiShouClient:
"""Create xhs client"""
"""Create ks client"""
utils.logger.info("[KuaishouCrawler.create_ks_client] Begin create kuaishou API client ...")
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
xhs_client_obj = KuaiShouClient(
ks_client_obj = KuaiShouClient(
proxies=httpx_proxy,
headers={
"User-Agent": self.user_agent,
@ -215,7 +218,7 @@ class KuaishouCrawler(AbstractCrawler):
playwright_page=self.context_page,
cookie_dict=cookie_dict,
)
return xhs_client_obj
return ks_client_obj
async def launch_browser(
self,
@ -246,6 +249,39 @@ class KuaishouCrawler(AbstractCrawler):
)
return browser_context
async def get_creators_and_videos(self) -> None:
    """Get creator's videos and retrieve their comment information."""
    utils.logger.info("[KuaiShouCrawler.get_creators_and_videos] Begin get kuaishou creators")
    for user_id in config.KS_CREATOR_ID_LIST:
        # Store the creator profile first, so it persists even if the
        # subsequent video crawl fails.
        creator_info: Dict = await self.ks_client.get_creator_info(user_id=user_id)
        if creator_info:
            await kuaishou_store.save_creator(user_id, creator=creator_info)

        # Then fetch every video the creator has published, saving each
        # page's details through the callback.
        all_video_list = await self.ks_client.get_all_videos_by_creator(
            user_id=user_id,
            crawl_interval=random.random(),
            callback=self.fetch_creator_video_detail,
        )

        video_ids = [item.get("photo", {}).get("id") for item in all_video_list]
        await self.batch_get_video_comments(video_ids)
async def fetch_creator_video_detail(self, video_list: List[Dict]):
    """
    Concurrently obtain the specified post list and save the data
    """
    semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
    tasks = []
    for post_item in video_list:
        photo_id = post_item.get("photo", {}).get("id")
        tasks.append(self.get_video_info_task(photo_id, semaphore))

    for video_detail in await asyncio.gather(*tasks):
        if video_detail is not None:
            await kuaishou_store.update_kuaishou_video(video_detail)
async def close(self):
    """Close browser context (releases the crawler's browser resources)."""
    await self.browser_context.close()

View File

@ -11,7 +11,7 @@ class KuaiShouGraphQL:
self.load_graphql_queries()
def load_graphql_queries(self):
graphql_files = ["search_query.graphql", "video_detail.graphql", "comment_list.graphql", "vision_profile.graphql"]
graphql_files = ["search_query.graphql", "video_detail.graphql", "comment_list.graphql", "vision_profile.graphql","vision_profile_photo_list.graphql","vision_profile_user_list.graphql","vision_sub_comment_list.graphql"]
for file in graphql_files:
with open(self.graphql_dir + file, mode="r") as f:

View File

@ -1,16 +1,27 @@
query visionProfileUserList($pcursor: String, $ftype: Int) {
visionProfileUserList(pcursor: $pcursor, ftype: $ftype) {
query visionProfile($userId: String) {
visionProfile(userId: $userId) {
result
fols {
hostName
userProfile {
ownerCount {
fan
photo
follow
photo_public
__typename
}
profile {
gender
user_name
user_id
headurl
user_text
user_profile_bg_url
__typename
}
isFollowing
user_id
__typename
}
hostName
pcursor
__typename
}
}

View File

@ -0,0 +1,110 @@
# Query one page of a creator's published videos (paged via pcursor).

# Full field set for a regular photo (video) entity.
fragment photoContent on PhotoEntity {
  __typename
  id
  duration
  caption
  originCaption
  likeCount
  viewCount
  commentCount
  realLikeCount
  coverUrl
  photoUrl
  photoH265Url
  manifest
  manifestH265
  videoResource
  coverUrls {
    url
    __typename
  }
  timestamp
  expTag
  animatedCoverUrl
  distance
  videoRatio
  liked
  stereoType
  profileUserTopPhoto
  musicBlocked
  riskTagContent
  riskTagUrl
}

# Same field set for recommended photo entities (a distinct GraphQL type,
# so it needs its own fragment even though the fields match photoContent).
fragment recoPhotoFragment on recoPhotoEntity {
  __typename
  id
  duration
  caption
  originCaption
  likeCount
  viewCount
  commentCount
  realLikeCount
  coverUrl
  photoUrl
  photoH265Url
  manifest
  manifestH265
  videoResource
  coverUrls {
    url
    __typename
  }
  timestamp
  expTag
  animatedCoverUrl
  distance
  videoRatio
  liked
  stereoType
  profileUserTopPhoto
  musicBlocked
  riskTagContent
  riskTagUrl
}

# One feed item: author info plus the photo payload (either entity type).
fragment feedContent on Feed {
  type
  author {
    id
    name
    headerUrl
    following
    headerUrls {
      url
      __typename
    }
    __typename
  }
  photo {
    ...photoContent
    ...recoPhotoFragment
    __typename
  }
  canAddComment
  llsid
  status
  currentPcursor
  tags {
    type
    name
    __typename
  }
  __typename
}

query visionProfilePhotoList($pcursor: String, $userId: String, $page: String, $webPageArea: String) {
  visionProfilePhotoList(pcursor: $pcursor, userId: $userId, page: $page, webPageArea: $webPageArea) {
    result
    llsid
    webPageArea
    feeds {
      ...feedContent
      __typename
    }
    hostName
    pcursor
    __typename
  }
}

View File

@ -0,0 +1,16 @@
# Query one page of the profile user list (e.g. follow relations),
# paged via pcursor; ftype selects the relation type.
query visionProfileUserList($pcursor: String, $ftype: Int) {
  visionProfileUserList(pcursor: $pcursor, ftype: $ftype) {
    result
    fols {
      user_name
      headurl
      user_text
      isFollowing
      user_id
      __typename
    }
    hostName
    pcursor
    __typename
  }
}

View File

@ -0,0 +1,22 @@
# Fetch one page of second-level (reply) comments under rootCommentId,
# paged via pcursor.
# NOTE(review): declared as `mutation` although it only reads data —
# presumably mirroring Kuaishou's own web client; confirm the endpoint
# rejects `query` before changing the operation type.
mutation visionSubCommentList($photoId: String, $rootCommentId: String, $pcursor: String) {
  visionSubCommentList(photoId: $photoId, rootCommentId: $rootCommentId, pcursor: $pcursor) {
    pcursor
    subComments {
      commentId
      authorId
      authorName
      content
      headurl
      timestamp
      likedCount
      realLikedCount
      liked
      status
      authorLiked
      replyToUserName
      replyTo
      __typename
    }
    __typename
  }
}

View File

@ -8,6 +8,7 @@ from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
wait_fixed)
from base.base_crawler import AbstractLogin
import config
from tools import utils

View File

@ -76,3 +76,22 @@ async def update_ks_video_comment(video_id: str, comment_item: Dict):
utils.logger.info(
f"[store.kuaishou.update_ks_video_comment] Kuaishou video comment: {comment_id}, content: {save_comment_item.get('content')}")
await KuaishouStoreFactory.create_store().store_comment(comment_item=save_comment_item)
async def save_creator(user_id: str, creator: Dict):
    """
    Persist a Kuaishou creator's profile through the configured store.
    Args:
        user_id: creator user id
        creator: `userProfile` dict returned by the visionProfile query
    Returns:

    """
    ownerCount = creator.get('ownerCount', {})
    profile = creator.get('profile', {})
    local_db_item = {
        'user_id': user_id,
        'nickname': profile.get('user_name'),
        # Bug fix: the original ternary had identical (empty) branches, so the
        # condition was dead. "F" marks female in the visionProfile payload;
        # restore distinct localized labels — TODO confirm label text.
        'gender': '女' if profile.get('gender') == "F" else '男',
        'avatar': profile.get('headurl'),
        'desc': profile.get('user_text'),
        'ip_location': "",
        'follows': ownerCount.get("follow"),
        'fans': ownerCount.get("fan"),
        'interaction': ownerCount.get("photo_public"),
        "last_modify_ts": utils.get_current_timestamp(),
    }
    utils.logger.info(f"[store.kuaishou.save_creator] creator:{local_db_item}")
    await KuaishouStoreFactory.create_store().store_creator(local_db_item)

View File

@ -205,3 +205,14 @@ class KuaishouJsonStoreImplement(AbstractStore):
"""
await self.save_data_to_json(comment_item, "comments")
async def store_creator(self, creator: Dict):
    """
    Kuaishou creator JSON storage implementation
    Args:
        creator: creator dict

    Returns:

    """
    # Appends the record to the "creator" JSON data file.
    await self.save_data_to_json(creator, "creator")