MediaCrawler/media_platform/douyin/client.py

# Disclaimer: This code is for learning and research purposes only. Users must observe the following principles:
# 1. It must not be used for any commercial purpose.
# 2. Usage must comply with the target platform's terms of service and robots.txt rules.
# 3. Large-scale crawling or anything that disrupts the platform's operation is not allowed.
# 4. Request frequency should be kept reasonable to avoid placing unnecessary load on the target platform.
# 5. It must not be used for any illegal or improper purpose.
#
# See the LICENSE file in the project root for the full license terms.
# By using this code you agree to the principles above and to all terms in the LICENSE.
import asyncio
import copy
import json
import urllib.parse
from typing import Any, Callable, Dict, Optional

import requests
from playwright.async_api import BrowserContext, Page

from base.base_crawler import AbstractApiClient
from tools import utils
from var import request_keyword_var

from .exception import *
from .field import *
from .help import *


class DOUYINClient(AbstractApiClient):
    def __init__(
            self,
            timeout=30,
            proxies=None,
            *,
            headers: Dict,
            playwright_page: Optional[Page],
            cookie_dict: Dict
    ):
        self.proxies = proxies
        self.timeout = timeout
        self.headers = headers
        self._host = "https://www.douyin.com"
        self.playwright_page = playwright_page
        self.cookie_dict = cookie_dict

    async def __process_req_params(
            self, uri: str, params: Optional[Dict] = None, headers: Optional[Dict] = None,
            request_method="GET"
    ):
        if not params:
            return
        headers = headers or self.headers
        local_storage: Dict = await self.playwright_page.evaluate("() => window.localStorage")  # type: ignore
        common_params = {
            "device_platform": "webapp",
            "aid": "6383",
            "channel": "channel_pc_web",
            "version_code": "190600",
            "version_name": "19.6.0",
            "update_version_code": "170400",
            "pc_client_type": "1",
            "cookie_enabled": "true",
            "browser_language": "zh-CN",
            "browser_platform": "MacIntel",
            "browser_name": "Chrome",
            "browser_version": "125.0.0.0",
            "browser_online": "true",
            "engine_name": "Blink",
            "os_name": "Mac OS",
            "os_version": "10.15.7",
            "cpu_core_num": "8",
            "device_memory": "8",
            "engine_version": "109.0",
            "platform": "PC",
            "screen_width": "2560",
            "screen_height": "1440",
            "effective_type": "4g",
            "round_trip_time": "50",
            "webid": get_web_id(),
            "msToken": local_storage.get("xmst"),
        }
        params.update(common_params)
        query_string = urllib.parse.urlencode(params)
        # 2024-09-27: updated JS version for the a_bogus signature
        post_data = {}
        if request_method == "POST":
            post_data = params
        a_bogus = await get_a_bogus(uri, query_string, post_data, headers["User-Agent"], self.playwright_page)
        params["a_bogus"] = a_bogus

    async def request(self, method, url, **kwargs):
        response = None
        if method == "GET":
            response = requests.request(method, url, **kwargs)
        elif method == "POST":
            response = requests.request(method, url, **kwargs)
        try:
            # An empty body or the literal string "blocked" is treated as a rejected request.
            if response.text == "" or response.text == "blocked":
                utils.logger.error(f"request params incorrect, response.text: {response.text}")
                raise Exception("account blocked")
            return response.json()
        except Exception as e:
            raise DataFetchError(f"{e}, {response.text}")

    async def get(self, uri: str, params: Optional[Dict] = None, headers: Optional[Dict] = None):
        """
        Send a GET request
        """
        await self.__process_req_params(uri, params, headers)
        headers = headers or self.headers
        return await self.request(method="GET", url=f"{self._host}{uri}", params=params, headers=headers)

    async def post(self, uri: str, data: dict, headers: Optional[Dict] = None):
        await self.__process_req_params(uri, data, headers)
        headers = headers or self.headers
        return await self.request(method="POST", url=f"{self._host}{uri}", data=data, headers=headers)

    async def pong(self, browser_context: BrowserContext) -> bool:
        local_storage = await self.playwright_page.evaluate("() => window.localStorage")
        if local_storage.get("HasUserLogin", "") == "1":
            return True
        _, cookie_dict = utils.convert_cookies(await browser_context.cookies())
        return cookie_dict.get("LOGIN_STATUS") == "1"

    async def update_cookies(self, browser_context: BrowserContext):
        cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies())
        self.headers["Cookie"] = cookie_str
        self.cookie_dict = cookie_dict

    async def search_info_by_keyword(
            self,
            keyword: str,
            offset: int = 0,
            search_channel: SearchChannelType = SearchChannelType.GENERAL,
            sort_type: SearchSortType = SearchSortType.GENERAL,
            publish_time: PublishTimeType = PublishTimeType.UNLIMITED,
            search_id: str = ""
    ):
        """
        DouYin Web Search API
        :param keyword: search keyword
        :param offset: pagination offset
        :param search_channel: search channel (SearchChannelType)
        :param sort_type: result sort order (SearchSortType)
        :param publish_time: publish-time filter (PublishTimeType)
        :param search_id: search session id used for paging
        :return: parsed JSON response
        """
        query_params = {
            'search_channel': search_channel.value,
            'enable_history': '1',
            'keyword': keyword,
            'search_source': 'tab_search',
            'query_correct_type': '1',
            'is_filter_search': '0',
            'from_group_id': '7378810571505847586',
            'offset': offset,
            'count': '15',
            'need_filter_settings': '1',
            'list_type': 'multi',
            'search_id': search_id,
        }
        if sort_type.value != SearchSortType.GENERAL.value or publish_time.value != PublishTimeType.UNLIMITED.value:
            query_params["filter_selected"] = json.dumps({
                "sort_type": str(sort_type.value),
                "publish_time": str(publish_time.value)
            })
            query_params["is_filter_search"] = 1
            query_params["search_source"] = "tab_search"
        referer_url = f"https://www.douyin.com/search/{keyword}?aid=f594bbd9-a0e2-4651-9319-ebe3cb6298c1&type=general"
        headers = copy.copy(self.headers)
        headers["Referer"] = urllib.parse.quote(referer_url, safe=':/')
        return await self.get("/aweme/v1/web/general/search/single/", query_params, headers=headers)

    async def get_video_by_id(self, aweme_id: str) -> Any:
        """
        DouYin Video Detail API
        :param aweme_id: video ID
        :return: the "aweme_detail" payload of the response
        """
        params = {
            "aweme_id": aweme_id
        }
        headers = copy.copy(self.headers)
        # drop the Origin header for the detail API
        del headers["Origin"]
        res = await self.get("/aweme/v1/web/aweme/detail/", params, headers)
        return res.get("aweme_detail", {})

    async def get_aweme_comments(self, aweme_id: str, cursor: int = 0):
        """
        Get comments of a post
        :param aweme_id: post ID
        :param cursor: pagination cursor
        """
        uri = "/aweme/v1/web/comment/list/"
        params = {
            "aweme_id": aweme_id,
            "cursor": cursor,
            "count": 20,
            "item_type": 0
        }
        keywords = request_keyword_var.get()
        referer_url = "https://www.douyin.com/search/" + keywords + '?aid=3a3cec5a-9e27-4040-b6aa-ef548c2c1138&publish_time=0&sort_type=0&source=search_history&type=general'
        headers = copy.copy(self.headers)
        headers["Referer"] = urllib.parse.quote(referer_url, safe=':/')
        return await self.get(uri, params, headers=headers)

    async def get_sub_comments(self, comment_id: str, cursor: int = 0):
        """
        Get sub-comments (replies) of a comment
        :param comment_id: parent comment ID
        :param cursor: pagination cursor
        """
        uri = "/aweme/v1/web/comment/list/reply/"
        params = {
            'comment_id': comment_id,
            "cursor": cursor,
            "count": 20,
            "item_type": 0,
        }
        keywords = request_keyword_var.get()
        referer_url = "https://www.douyin.com/search/" + keywords + '?aid=3a3cec5a-9e27-4040-b6aa-ef548c2c1138&publish_time=0&sort_type=0&source=search_history&type=general'
        headers = copy.copy(self.headers)
        headers["Referer"] = urllib.parse.quote(referer_url, safe=':/')
        return await self.get(uri, params, headers=headers)

    async def get_aweme_all_comments(
            self,
            aweme_id: str,
            crawl_interval: float = 1.0,
            is_fetch_sub_comments=False,
            callback: Optional[Callable] = None,
            max_count: int = 10,
    ):
        """
        Fetch all comments of a post, including sub-comments
        :param aweme_id: post ID
        :param crawl_interval: delay between requests, in seconds
        :param is_fetch_sub_comments: whether to also fetch sub-comments
        :param callback: callback invoked with each batch of fetched comments
        :param max_count: maximum number of comments to fetch for one post
        :return: list of comments
        """
        result = []
        comments_has_more = 1
        comments_cursor = 0
        while comments_has_more and len(result) < max_count:
            comments_res = await self.get_aweme_comments(aweme_id, comments_cursor)
            comments_has_more = comments_res.get("has_more", 0)
            comments_cursor = comments_res.get("cursor", 0)
            comments = comments_res.get("comments", [])
            if not comments:
                continue
            if len(result) + len(comments) > max_count:
                comments = comments[:max_count - len(result)]
            result.extend(comments)
            if callback:  # invoke the callback, if provided
                await callback(aweme_id, comments)
            await asyncio.sleep(crawl_interval)
            if not is_fetch_sub_comments:
                continue
            # fetch second-level (reply) comments
            for comment in comments:
                reply_comment_total = comment.get("reply_comment_total")
                if reply_comment_total > 0:
                    comment_id = comment.get("cid")
                    sub_comments_has_more = 1
                    sub_comments_cursor = 0
                    while sub_comments_has_more:
                        sub_comments_res = await self.get_sub_comments(comment_id, sub_comments_cursor)
                        sub_comments_has_more = sub_comments_res.get("has_more", 0)
                        sub_comments_cursor = sub_comments_res.get("cursor", 0)
                        sub_comments = sub_comments_res.get("comments", [])
                        if not sub_comments:
                            continue
                        result.extend(sub_comments)
                        if callback:  # invoke the callback, if provided
                            await callback(aweme_id, sub_comments)
                        await asyncio.sleep(crawl_interval)
        return result

    async def get_user_info(self, sec_user_id: str):
        uri = "/aweme/v1/web/user/profile/other/"
        params = {
            "sec_user_id": sec_user_id,
            "publish_video_strategy_type": 2,
            "personal_center_strategy": 1,
        }
        return await self.get(uri, params)

    async def get_user_aweme_posts(self, sec_user_id: str, max_cursor: str = "") -> Dict:
        uri = "/aweme/v1/web/aweme/post/"
        params = {
            "sec_user_id": sec_user_id,
            "count": 18,
            "max_cursor": max_cursor,
            "locate_query": "false",
            "publish_video_strategy_type": 2,
            'verifyFp': 'verify_lx901cuk_K7kaK4dK_bn2E_4dgk_BxAA_E0XS1VtUi130',
            'fp': 'verify_lx901cuk_K7kaK4dK_bn2E_4dgk_BxAA_E0XS1VtUi130'
        }
        return await self.get(uri, params)

    async def get_all_user_aweme_posts(self, sec_user_id: str, callback: Optional[Callable] = None):
        posts_has_more = 1
        max_cursor = ""
        result = []
        while posts_has_more == 1:
            aweme_post_res = await self.get_user_aweme_posts(sec_user_id, max_cursor)
            posts_has_more = aweme_post_res.get("has_more", 0)
            max_cursor = aweme_post_res.get("max_cursor")
            aweme_list = aweme_post_res.get("aweme_list") if aweme_post_res.get("aweme_list") else []
            utils.logger.info(
                f"[DOUYINClient.get_all_user_aweme_posts] got sec_user_id:{sec_user_id} video len : {len(aweme_list)}")
            if callback:
                await callback(aweme_list)
            result.extend(aweme_list)
        return result
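

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module). It shows, under
# stated assumptions, how DOUYINClient might be wired up: it assumes a Playwright
# page and browser context that have already been created and logged in elsewhere
# (e.g. by the crawler's login flow), and `_example_search` is a hypothetical
# helper name. The header keys mirror the ones this client reads or removes
# above (User-Agent, Cookie, Origin, Referer).
# ---------------------------------------------------------------------------
async def _example_search(playwright_page: Page, browser_context: BrowserContext):
    """Minimal sketch: build a client from an existing page/context and run one search."""
    cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies())
    client = DOUYINClient(
        headers={
            "User-Agent": await playwright_page.evaluate("() => navigator.userAgent"),
            "Cookie": cookie_str,
            "Origin": "https://www.douyin.com/",
            "Referer": "https://www.douyin.com/",
        },
        playwright_page=playwright_page,
        cookie_dict=cookie_dict,
    )
    if not await client.pong(browser_context):
        return None
    # First page of general search results for a keyword; later pages would
    # advance `offset` and reuse the search_id returned by the API.
    return await client.search_info_by_keyword("python", offset=0)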