Merge branch 'kuaishou'
This commit is contained in:
commit
fd7407cc29
|
@ -22,12 +22,11 @@
|
|||
|-----|-------|----------|-----|--------|-------|-------|-------|
|
||||
| 小红书 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| 抖音 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| 快手 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ |
|
||||
| 快手 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| B 站 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| 微博 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ |
|
||||
|
||||
|
||||
|
||||
## 使用方法
|
||||
|
||||
### 创建并激活 python 虚拟环境
|
||||
|
|
|
@ -103,6 +103,13 @@ BILI_CREATOR_ID_LIST = [
|
|||
# ........................
|
||||
]
|
||||
|
||||
# 指定快手创作者ID列表
|
||||
KS_CREATOR_ID_LIST = [
|
||||
"3x4sm73aye7jq7i",
|
||||
# ........................
|
||||
]
|
||||
|
||||
|
||||
#词云相关
|
||||
#是否开启生成评论词云图
|
||||
ENABLE_GET_WORDCLOUD = False
|
||||
|
@ -118,5 +125,3 @@ STOP_WORDS_FILE = "./docs/hit_stopwords.txt"
|
|||
|
||||
#中文字体文件路径
|
||||
FONT_PATH= "./docs/STZHONGS.TTF"
|
||||
|
||||
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
import asyncio
|
||||
import json
|
||||
from typing import Any, Callable, Dict, Optional
|
||||
from typing import Any, Callable, Dict, List, Optional
|
||||
from urllib.parse import urlencode
|
||||
|
||||
import httpx
|
||||
|
@ -67,7 +67,7 @@ class KuaiShouClient(AbstractApiClient):
|
|||
"variables": {
|
||||
"ftype": 1,
|
||||
},
|
||||
"query": self.graphql.get("vision_profile")
|
||||
"query": self.graphql.get("vision_profile_user_list")
|
||||
}
|
||||
res = await self.post("", post_data)
|
||||
if res.get("visionProfileUserList", {}).get("result") == 1:
|
||||
|
@ -129,17 +129,60 @@ class KuaiShouClient(AbstractApiClient):
|
|||
"pcursor": pcursor
|
||||
},
|
||||
"query": self.graphql.get("comment_list")
|
||||
|
||||
}
|
||||
return await self.post("", post_data)
|
||||
|
||||
async def get_video_all_comments(self, photo_id: str, crawl_interval: float = 1.0, is_fetch_sub_comments=False,
|
||||
callback: Optional[Callable] = None):
|
||||
async def get_video_sub_comments(
    self, photo_id: str, rootCommentId: str, pcursor: str = ""
) -> Dict:
    """Request one page of replies (sub comments) under a root comment.

    :param photo_id: id of the video (photo) the root comment belongs to
    :param rootCommentId: id of the root comment whose replies are requested
    :param pcursor: cursor taken from the previous page, defaults to ""
    :return: the value produced by ``self.post`` for the
        ``visionSubCommentList`` operation
    """
    variables = {
        "photoId": photo_id,
        "pcursor": pcursor,
        "rootCommentId": rootCommentId,
    }
    payload = {
        "operationName": "visionSubCommentList",
        "variables": variables,
        "query": self.graphql.get("vision_sub_comment_list"),
    }
    return await self.post("", payload)
|
||||
|
||||
async def get_creator_profile(self, userId: str) -> Dict:
    """Send the ``visionProfile`` operation for a single user.

    :param userId: id of the creator whose profile is requested
    :return: the value produced by ``self.post`` for that operation
    """
    payload = {
        "operationName": "visionProfile",
        "variables": {"userId": userId},
        "query": self.graphql.get("vision_profile"),
    }
    return await self.post("", payload)
|
||||
|
||||
async def get_video_by_creater(self, userId: str, pcursor: str = "") -> Dict:
    """Send the ``visionProfilePhotoList`` operation for one page of a creator's videos.

    :param userId: id of the creator whose videos are listed
    :param pcursor: cursor taken from the previous page, defaults to ""
    :return: the value produced by ``self.post`` for that operation
    """
    variables = {
        "page": "profile",
        "pcursor": pcursor,
        "userId": userId,
    }
    payload = {
        "operationName": "visionProfilePhotoList",
        "variables": variables,
        "query": self.graphql.get("vision_profile_photo_list"),
    }
    return await self.post("", payload)
|
||||
|
||||
async def get_video_all_comments(
|
||||
self,
|
||||
photo_id: str,
|
||||
crawl_interval: float = 1.0,
|
||||
callback: Optional[Callable] = None,
|
||||
):
|
||||
"""
|
||||
get video all comments include sub comments
|
||||
:param photo_id:
|
||||
:param crawl_interval:
|
||||
:param is_fetch_sub_comments:
|
||||
:param callback:
|
||||
:return:
|
||||
"""
|
||||
|
@ -158,7 +201,106 @@ class KuaiShouClient(AbstractApiClient):
|
|||
|
||||
result.extend(comments)
|
||||
await asyncio.sleep(crawl_interval)
|
||||
if not is_fetch_sub_comments:
|
||||
continue
|
||||
# todo handle get sub comments
|
||||
sub_comments = await self.get_comments_all_sub_comments(
|
||||
comments, photo_id, crawl_interval, callback
|
||||
)
|
||||
result.extend(sub_comments)
|
||||
return result
|
||||
|
||||
async def get_comments_all_sub_comments(
    self,
    comments: List[Dict],
    photo_id,
    crawl_interval: float = 1.0,
    callback: Optional[Callable] = None,
) -> List[Dict]:
    """Fetch every second-level comment under the given first-level comments.

    For each root comment the replies already embedded in it are handed to
    ``callback``; unless the comment reports ``"no_more"`` as its sub-comment
    cursor, the remaining replies are then paged through
    ``get_video_sub_comments`` until the cursor becomes ``"no_more"``.

    Args:
        comments: first-level comment dicts
        photo_id: id of the video the comments belong to
        crawl_interval: delay in seconds after each page request
        callback: optional coroutine called as ``callback(photo_id, sub_comments)``
            after each batch of sub comments

    Returns:
        The sub comments obtained from the paged requests (replies embedded in
        the root comments are passed to ``callback`` only). An empty list is
        returned when ``config.ENABLE_GET_SUB_COMMENTS`` is falsy.
    """
    if not config.ENABLE_GET_SUB_COMMENTS:
        utils.logger.info(
            "[KuaiShouClient.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled"
        )
        return []

    result: List[Dict] = []
    for comment in comments:
        embedded_sub_comments = comment.get("subComments")
        if embedded_sub_comments and callback:
            await callback(photo_id, embedded_sub_comments)

        if comment.get("subCommentsPcursor") == "no_more":
            continue

        root_comment_id = comment.get("commentId")
        # Paging deliberately restarts from the first page, as before.
        # NOTE(review): this may re-deliver the embedded replies to the callback — confirm.
        sub_comment_pcursor = ""

        while sub_comment_pcursor != "no_more":
            comments_res = await self.get_video_sub_comments(
                photo_id, root_comment_id, sub_comment_pcursor
            )
            if not comments_res:
                # Nothing usable came back; stop paging this root comment
                # instead of failing on ``None.get``.
                break

            # Other callers in this client unwrap the response by operation
            # field (e.g. "visionProfilePhotoList"); accept both the wrapped
            # and the already-unwrapped shape.
            page = comments_res.get("visionSubCommentList") or comments_res
            # A missing/null/empty cursor ends paging; otherwise the same
            # request would be repeated forever.
            sub_comment_pcursor = page.get("pcursor") or "no_more"

            page_sub_comments = page.get("subComments") or []
            if callback:
                await callback(photo_id, page_sub_comments)
            await asyncio.sleep(crawl_interval)
            result.extend(page_sub_comments)
    return result
|
||||
|
||||
async def get_creator_info(self, user_id: str) -> Optional[Dict]:
    """Return the ``userProfile`` section of a Kuaishou creator's profile.

    Profile page example: https://www.kuaishou.com/profile/3x4jtnbfter525a

    Args:
        user_id: id of the creator

    Returns:
        The ``userProfile`` dict, or ``None`` when the response is empty or
        carries no profile.
    """
    profile_res = await self.get_creator_profile(user_id)
    if not profile_res:
        return None

    # The visionProfile query nests ``userProfile`` under the ``visionProfile``
    # field; fall back to the top level in case the response was already unwrapped.
    vision_profile = profile_res.get("visionProfile") or profile_res
    return vision_profile.get("userProfile")
|
||||
|
||||
async def get_all_videos_by_creator(
    self,
    user_id: str,
    crawl_interval: float = 1.0,
    callback: Optional[Callable] = None,
) -> List[Dict]:
    """Page through all videos published by one creator.

    Args:
        user_id: id of the creator
        crawl_interval: delay in seconds after each page request
        callback: optional coroutine called as ``callback(videos)`` with the
            feeds of each fetched page

    Returns:
        All feed items collected across the pages that were fetched.
    """
    result: List[Dict] = []
    pcursor = ""

    while pcursor != "no_more":
        videos_res = await self.get_video_by_creater(user_id, pcursor)
        if not videos_res:
            utils.logger.error(
                "[KuaiShouClient.get_all_videos_by_creator] The current creator may have been banned by ks, so they cannot access the data."
            )
            break

        # ``or`` also covers keys that are present but null.
        vision_profile_photo_list = videos_res.get("visionProfilePhotoList") or {}
        # A missing/null/empty cursor must end paging: falling back to ""
        # would request the first page again and never terminate.
        pcursor = vision_profile_photo_list.get("pcursor") or "no_more"

        videos = vision_profile_photo_list.get("feeds") or []
        utils.logger.info(
            f"[KuaiShouClient.get_all_videos_by_creator] got user_id:{user_id} videos len : {len(videos)}"
        )

        if callback:
            await callback(videos)
        await asyncio.sleep(crawl_interval)
        result.extend(videos)
    return result
|
||||
|
|
|
@ -65,11 +65,14 @@ class KuaishouCrawler(AbstractCrawler):
|
|||
|
||||
crawler_type_var.set(config.CRAWLER_TYPE)
|
||||
if config.CRAWLER_TYPE == "search":
|
||||
# Search for notes and retrieve their comment information.
|
||||
# Search for videos and retrieve their comment information.
|
||||
await self.search()
|
||||
elif config.CRAWLER_TYPE == "detail":
|
||||
# Get the information and comments of the specified post
|
||||
await self.get_specified_videos()
|
||||
elif config.CRAWLER_TYPE == "creator":
|
||||
# Get creator's information and their videos and comments
|
||||
await self.get_creators_and_videos()
|
||||
else:
|
||||
pass
|
||||
|
||||
|
@ -135,7 +138,7 @@ class KuaishouCrawler(AbstractCrawler):
|
|||
utils.logger.error(f"[KuaishouCrawler.get_video_info_task] Get video detail error: {ex}")
|
||||
return None
|
||||
except KeyError as ex:
|
||||
utils.logger.error(f"[KuaishouCrawler.get_video_info_task] have not fund note detail video_id:{video_id}, err: {ex}")
|
||||
utils.logger.error(f"[KuaishouCrawler.get_video_info_task] have not fund video detail video_id:{video_id}, err: {ex}")
|
||||
return None
|
||||
|
||||
async def batch_get_video_comments(self, video_id_list: List[str]):
|
||||
|
@ -145,7 +148,7 @@ class KuaishouCrawler(AbstractCrawler):
|
|||
:return:
|
||||
"""
|
||||
if not config.ENABLE_GET_COMMENTS:
|
||||
utils.logger.info(f"[KuaishouCrawler.batch_get_note_comments] Crawling comment mode is not enabled")
|
||||
utils.logger.info(f"[KuaishouCrawler.batch_get_video_comments] Crawling comment mode is not enabled")
|
||||
return
|
||||
|
||||
utils.logger.info(f"[KuaishouCrawler.batch_get_video_comments] video ids:{video_id_list}")
|
||||
|
@ -200,10 +203,10 @@ class KuaishouCrawler(AbstractCrawler):
|
|||
return playwright_proxy, httpx_proxy
|
||||
|
||||
async def create_ks_client(self, httpx_proxy: Optional[str]) -> KuaiShouClient:
|
||||
"""Create xhs client"""
|
||||
"""Create ks client"""
|
||||
utils.logger.info("[KuaishouCrawler.create_ks_client] Begin create kuaishou API client ...")
|
||||
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
|
||||
xhs_client_obj = KuaiShouClient(
|
||||
ks_client_obj = KuaiShouClient(
|
||||
proxies=httpx_proxy,
|
||||
headers={
|
||||
"User-Agent": self.user_agent,
|
||||
|
@ -215,7 +218,7 @@ class KuaishouCrawler(AbstractCrawler):
|
|||
playwright_page=self.context_page,
|
||||
cookie_dict=cookie_dict,
|
||||
)
|
||||
return xhs_client_obj
|
||||
return ks_client_obj
|
||||
|
||||
async def launch_browser(
|
||||
self,
|
||||
|
@ -246,6 +249,39 @@ class KuaishouCrawler(AbstractCrawler):
|
|||
)
|
||||
return browser_context
|
||||
|
||||
async def get_creators_and_videos(self) -> None:
    """Crawl every creator in ``config.KS_CREATOR_ID_LIST``.

    For each creator: save the creator info when one is returned, fetch all
    of the creator's videos (each page is passed to
    ``fetch_creator_video_detail``), then crawl the comments of those videos.
    """
    utils.logger.info("[KuaiShouCrawler.get_creators_and_videos] Begin get kuaishou creators")
    for user_id in config.KS_CREATOR_ID_LIST:
        # Creator info comes from the client's profile request.
        creator_info: Dict = await self.ks_client.get_creator_info(user_id=user_id)
        if creator_info:
            await kuaishou_store.save_creator(user_id, creator=creator_info)

        # Fetch all videos of the creator; details are stored page by page
        # through the callback.
        all_video_list = await self.ks_client.get_all_videos_by_creator(
            user_id=user_id,
            crawl_interval=random.random(),
            callback=self.fetch_creator_video_detail,
        )

        # Skip feed items without a photo id so that ``None`` is never
        # forwarded to the comment crawler.
        candidate_ids = (
            (video_item.get("photo") or {}).get("id") for video_item in all_video_list
        )
        video_ids = [video_id for video_id in candidate_ids if video_id]
        await self.batch_get_video_comments(video_ids)
|
||||
|
||||
async def fetch_creator_video_detail(self, video_list: List[Dict]):
    """Fetch the details of the listed videos concurrently and store them.

    One ``get_video_info_task`` is started per feed item, all sharing a
    semaphore sized by ``config.MAX_CONCURRENCY_NUM``; every result that is
    not ``None`` is passed to ``kuaishou_store.update_kuaishou_video``.
    """
    limiter = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)

    pending = []
    for entry in video_list:
        photo_id = entry.get("photo", {}).get("id")
        pending.append(self.get_video_info_task(photo_id, limiter))

    fetched = await asyncio.gather(*pending)
    for detail in fetched:
        if detail is None:
            continue
        await kuaishou_store.update_kuaishou_video(detail)
|
||||
|
||||
async def close(self):
    """Close the browser context held by this crawler."""
    context = self.browser_context
    await context.close()
|
||||
|
|
|
@ -11,7 +11,7 @@ class KuaiShouGraphQL:
|
|||
self.load_graphql_queries()
|
||||
|
||||
def load_graphql_queries(self):
|
||||
graphql_files = ["search_query.graphql", "video_detail.graphql", "comment_list.graphql", "vision_profile.graphql"]
|
||||
graphql_files = ["search_query.graphql", "video_detail.graphql", "comment_list.graphql", "vision_profile.graphql","vision_profile_photo_list.graphql","vision_profile_user_list.graphql","vision_sub_comment_list.graphql"]
|
||||
|
||||
for file in graphql_files:
|
||||
with open(self.graphql_dir + file, mode="r") as f:
|
||||
|
|
|
@ -1,16 +1,27 @@
|
|||
query visionProfileUserList($pcursor: String, $ftype: Int) {
|
||||
visionProfileUserList(pcursor: $pcursor, ftype: $ftype) {
|
||||
query visionProfile($userId: String) {
|
||||
visionProfile(userId: $userId) {
|
||||
result
|
||||
fols {
|
||||
user_name
|
||||
headurl
|
||||
user_text
|
||||
hostName
|
||||
userProfile {
|
||||
ownerCount {
|
||||
fan
|
||||
photo
|
||||
follow
|
||||
photo_public
|
||||
__typename
|
||||
}
|
||||
profile {
|
||||
gender
|
||||
user_name
|
||||
user_id
|
||||
headurl
|
||||
user_text
|
||||
user_profile_bg_url
|
||||
__typename
|
||||
}
|
||||
isFollowing
|
||||
user_id
|
||||
__typename
|
||||
}
|
||||
hostName
|
||||
pcursor
|
||||
__typename
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,110 @@
|
|||
fragment photoContent on PhotoEntity {
|
||||
__typename
|
||||
id
|
||||
duration
|
||||
caption
|
||||
originCaption
|
||||
likeCount
|
||||
viewCount
|
||||
commentCount
|
||||
realLikeCount
|
||||
coverUrl
|
||||
photoUrl
|
||||
photoH265Url
|
||||
manifest
|
||||
manifestH265
|
||||
videoResource
|
||||
coverUrls {
|
||||
url
|
||||
__typename
|
||||
}
|
||||
timestamp
|
||||
expTag
|
||||
animatedCoverUrl
|
||||
distance
|
||||
videoRatio
|
||||
liked
|
||||
stereoType
|
||||
profileUserTopPhoto
|
||||
musicBlocked
|
||||
riskTagContent
|
||||
riskTagUrl
|
||||
}
|
||||
|
||||
fragment recoPhotoFragment on recoPhotoEntity {
|
||||
__typename
|
||||
id
|
||||
duration
|
||||
caption
|
||||
originCaption
|
||||
likeCount
|
||||
viewCount
|
||||
commentCount
|
||||
realLikeCount
|
||||
coverUrl
|
||||
photoUrl
|
||||
photoH265Url
|
||||
manifest
|
||||
manifestH265
|
||||
videoResource
|
||||
coverUrls {
|
||||
url
|
||||
__typename
|
||||
}
|
||||
timestamp
|
||||
expTag
|
||||
animatedCoverUrl
|
||||
distance
|
||||
videoRatio
|
||||
liked
|
||||
stereoType
|
||||
profileUserTopPhoto
|
||||
musicBlocked
|
||||
riskTagContent
|
||||
riskTagUrl
|
||||
}
|
||||
|
||||
fragment feedContent on Feed {
|
||||
type
|
||||
author {
|
||||
id
|
||||
name
|
||||
headerUrl
|
||||
following
|
||||
headerUrls {
|
||||
url
|
||||
__typename
|
||||
}
|
||||
__typename
|
||||
}
|
||||
photo {
|
||||
...photoContent
|
||||
...recoPhotoFragment
|
||||
__typename
|
||||
}
|
||||
canAddComment
|
||||
llsid
|
||||
status
|
||||
currentPcursor
|
||||
tags {
|
||||
type
|
||||
name
|
||||
__typename
|
||||
}
|
||||
__typename
|
||||
}
|
||||
|
||||
query visionProfilePhotoList($pcursor: String, $userId: String, $page: String, $webPageArea: String) {
|
||||
visionProfilePhotoList(pcursor: $pcursor, userId: $userId, page: $page, webPageArea: $webPageArea) {
|
||||
result
|
||||
llsid
|
||||
webPageArea
|
||||
feeds {
|
||||
...feedContent
|
||||
__typename
|
||||
}
|
||||
hostName
|
||||
pcursor
|
||||
__typename
|
||||
}
|
||||
}
|
|
@ -0,0 +1,16 @@
|
|||
query visionProfileUserList($pcursor: String, $ftype: Int) {
|
||||
visionProfileUserList(pcursor: $pcursor, ftype: $ftype) {
|
||||
result
|
||||
fols {
|
||||
user_name
|
||||
headurl
|
||||
user_text
|
||||
isFollowing
|
||||
user_id
|
||||
__typename
|
||||
}
|
||||
hostName
|
||||
pcursor
|
||||
__typename
|
||||
}
|
||||
}
|
|
@ -0,0 +1,22 @@
|
|||
mutation visionSubCommentList($photoId: String, $rootCommentId: String, $pcursor: String) {
|
||||
visionSubCommentList(photoId: $photoId, rootCommentId: $rootCommentId, pcursor: $pcursor) {
|
||||
pcursor
|
||||
subComments {
|
||||
commentId
|
||||
authorId
|
||||
authorName
|
||||
content
|
||||
headurl
|
||||
timestamp
|
||||
likedCount
|
||||
realLikedCount
|
||||
liked
|
||||
status
|
||||
authorLiked
|
||||
replyToUserName
|
||||
replyTo
|
||||
__typename
|
||||
}
|
||||
__typename
|
||||
}
|
||||
}
|
|
@ -8,6 +8,7 @@ from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
|
|||
wait_fixed)
|
||||
|
||||
from base.base_crawler import AbstractLogin
|
||||
import config
|
||||
from tools import utils
|
||||
|
||||
|
||||
|
@ -57,7 +58,7 @@ class KuaishouLogin(AbstractLogin):
|
|||
|
||||
# click login button
|
||||
login_button_ele = self.context_page.locator(
|
||||
"xpath=//p[text()=' 登录 ']"
|
||||
"xpath=//p[text()='登录']"
|
||||
)
|
||||
await login_button_ele.click()
|
||||
|
||||
|
|
|
@ -76,3 +76,22 @@ async def update_ks_video_comment(video_id: str, comment_item: Dict):
|
|||
utils.logger.info(
|
||||
f"[store.kuaishou.update_ks_video_comment] Kuaishou video comment: {comment_id}, content: {save_comment_item.get('content')}")
|
||||
await KuaishouStoreFactory.create_store().store_comment(comment_item=save_comment_item)
|
||||
|
||||
async def save_creator(user_id: str, creator: Dict):
    """Flatten a creator's profile into a storage record and persist it.

    Args:
        user_id: id of the creator
        creator: profile dict; its ``ownerCount`` and ``profile`` entries are
            read, and either may be missing or null

    The record is logged and handed to the store created by
    ``KuaishouStoreFactory.create_store()``.
    """
    # ``or {}`` also covers keys that are present with a null value, which
    # ``dict.get(key, {})`` does not.
    owner_count = creator.get('ownerCount') or {}
    profile = creator.get('profile') or {}

    local_db_item = {
        'user_id': user_id,
        'nickname': profile.get('user_name'),
        # NOTE(review): every gender value other than "F" (including a missing
        # one) is stored as male — confirm this is intended.
        'gender': '女' if profile.get('gender') == "F" else '男',
        'avatar': profile.get('headurl'),
        'desc': profile.get('user_text'),
        'ip_location': "",
        'follows': owner_count.get("follow"),
        'fans': owner_count.get("fan"),
        # The public photo count is what gets stored as "interaction".
        'interaction': owner_count.get("photo_public"),
        "last_modify_ts": utils.get_current_timestamp(),
    }
    utils.logger.info(f"[store.kuaishou.save_creator] creator:{local_db_item}")
    await KuaishouStoreFactory.create_store().store_creator(local_db_item)
|
|
@ -205,3 +205,14 @@ class KuaishouJsonStoreImplement(AbstractStore):
|
|||
|
||||
"""
|
||||
await self.save_data_to_json(comment_item, "comments")
|
||||
|
||||
async def store_creator(self, creator: Dict):
    """Kuaishou creator JSON storage implementation.

    Args:
        creator: creator dict, saved through ``save_data_to_json`` under the
            "creator" store type

    Returns:

    """
    store_type = "creator"
    await self.save_data_to_json(creator, store_type)
|
Loading…
Reference in New Issue