feat: 抖音abogus参数更新
This commit is contained in:
parent
0807862b63
commit
f8096e3d58
17
README.md
17
README.md
|
@ -81,6 +81,13 @@
|
|||
|
||||
|
||||
## 开发者服务
|
||||
- MediaCrawler视频课程:
|
||||
> 课程介绍飞书文档链接:https://relakkes.feishu.cn/wiki/JUgBwdhIeiSbAwkFCLkciHdAnhh
|
||||
> 如果你想很快入门这个项目,或者想了解具体实现原理,我推荐你看看这个视频课程,从设计出发一步步带你如何使用,门槛大大降低
|
||||
>
|
||||
> 同时也是对我开源的支持,如果你能支持我的课程,我将会非常开心~<br>
|
||||
|
||||
|
||||
- 知识星球:沉淀高质量常见问题、最佳实践文档、多年编程+爬虫经验分享,提供付费知识星球服务,主动提问,作者会定期回答问题 (每天 1 块钱订阅我的知识服务)
|
||||
<p>
|
||||
<img alt="xingqiu" src="https://nm.zizhi1.com/static/img/8e1312d1f52f2e0ff436ea7196b4e27b.15555424244122T1.webp" style="width: auto;height: 400px" >
|
||||
|
@ -94,20 +101,14 @@
|
|||
|
||||
|
||||
|
||||
- MediaCrawler视频课程:
|
||||
> 如果你想很快入门这个项目,或者想了解具体实现原理,我推荐你看看这个视频课程,从设计出发一步步带你如何使用,门槛大大降低,同时也是对我开源的支持,如果你能支持我的课程,我将会非常开心~<br>
|
||||
> 课程售价非常非常的便宜,几杯咖啡的事儿.<br>
|
||||
> 课程介绍飞书文档链接:https://relakkes.feishu.cn/wiki/JUgBwdhIeiSbAwkFCLkciHdAnhh
|
||||
|
||||
|
||||
|
||||
## 感谢下列Sponsors对本仓库赞助
|
||||
- <a href="https://sider.ai/ad-land-redirect?source=github&p1=mi&p2=kk">通过注册这款免费的GPT助手,帮我获取GPT4额度作为支持。也是我每天在用的一款chrome AI助手插件</a>
|
||||
<br>
|
||||
- 感谢 [JetBrains](https://www.jetbrains.com/?from=gaowei-space/markdown-blog) 对本项目的支持!
|
||||
<a href="https://www.jetbrains.com/?from=NanmiCoder/MediaCrawler" target="_blank">
|
||||
<img src="https://resources.jetbrains.com/storage/products/company/brand/logos/jb_beam.png" width="100" height="100">
|
||||
</a>
|
||||
<br>
|
||||
- <a href="https://sider.ai/ad-land-redirect?source=github&p1=mi&p2=kk">通过注册这款免费的GPT助手,帮我获取GPT4额度作为支持。也是我每天在用的一款chrome AI助手插件</a>
|
||||
|
||||
成为赞助者,展示你的产品在这里,联系作者:relakkes@gmail.com
|
||||
|
||||
|
|
578
libs/douyin.js
578
libs/douyin.js
File diff suppressed because one or more lines are too long
|
@ -2,11 +2,10 @@ import asyncio
|
|||
import copy
|
||||
import json
|
||||
import urllib.parse
|
||||
from typing import Any, Callable, Dict, List, Optional
|
||||
from typing import Any, Callable, Dict, Optional
|
||||
|
||||
import execjs
|
||||
import httpx
|
||||
from playwright.async_api import BrowserContext, Page
|
||||
import requests
|
||||
from playwright.async_api import BrowserContext
|
||||
|
||||
from base.base_crawler import AbstractApiClient
|
||||
from tools import utils
|
||||
|
@ -14,6 +13,7 @@ from var import request_keyword_var
|
|||
|
||||
from .exception import *
|
||||
from .field import *
|
||||
from .help import *
|
||||
|
||||
|
||||
class DOUYINClient(AbstractApiClient):
|
||||
|
@ -33,51 +33,71 @@ class DOUYINClient(AbstractApiClient):
|
|||
self.playwright_page = playwright_page
|
||||
self.cookie_dict = cookie_dict
|
||||
|
||||
async def __process_req_params(self, params: Optional[Dict] = None, headers: Optional[Dict] = None):
|
||||
async def __process_req_params(
|
||||
self, params: Optional[Dict] = None, headers: Optional[Dict] = None,
|
||||
request_method="GET"
|
||||
):
|
||||
|
||||
if not params:
|
||||
return
|
||||
headers = headers or self.headers
|
||||
local_storage: Dict = await self.playwright_page.evaluate("() => window.localStorage") # type: ignore
|
||||
douyin_js_obj = execjs.compile(open('libs/douyin.js').read())
|
||||
common_params = {
|
||||
"device_platform": "webapp",
|
||||
"aid": "6383",
|
||||
"channel": "channel_pc_web",
|
||||
"version_code": "190600",
|
||||
"version_name": "19.6.0",
|
||||
"update_version_code": "170400",
|
||||
"pc_client_type": "1",
|
||||
"cookie_enabled": "true",
|
||||
"browser_language": "zh-CN",
|
||||
"browser_platform": "Win32",
|
||||
"browser_name": "Firefox",
|
||||
"browser_version": "110.0",
|
||||
"browser_platform": "MacIntel",
|
||||
"browser_name": "Chrome",
|
||||
"browser_version": "125.0.0.0",
|
||||
"browser_online": "true",
|
||||
"engine_name": "Gecko",
|
||||
"os_name": "Windows",
|
||||
"os_version": "10",
|
||||
"engine_name": "Blink",
|
||||
"os_name": "Mac OS",
|
||||
"os_version": "10.15.7",
|
||||
"cpu_core_num": "8",
|
||||
"device_memory": "8",
|
||||
"engine_version": "109.0",
|
||||
"platform": "PC",
|
||||
"screen_width": "1920",
|
||||
"screen_height": "1200",
|
||||
# " webid": douyin_js_obj.call("get_web_id"),
|
||||
# "msToken": local_storage.get("xmst"),
|
||||
# "msToken": "abL8SeUTPa9-EToD8qfC7toScSADxpg6yLh2dbNcpWHzE0bT04txM_4UwquIcRvkRb9IU8sifwgM1Kwf1Lsld81o9Irt2_yNyUbbQPSUO8EfVlZJ_78FckDFnwVBVUVK",
|
||||
"screen_width": "2560",
|
||||
"screen_height": "1440",
|
||||
'effective_type': '4g',
|
||||
"round_trip_time": "50",
|
||||
"webid": get_web_id(),
|
||||
"msToken": local_storage.get("xmst"),
|
||||
}
|
||||
params.update(common_params)
|
||||
query = '&'.join([f'{k}={v}' for k, v in params.items()])
|
||||
x_bogus = douyin_js_obj.call('sign', query, headers["User-Agent"])
|
||||
params["X-Bogus"] = x_bogus
|
||||
# print(x_bogus, query)
|
||||
query_string = urllib.parse.urlencode(params)
|
||||
|
||||
# 20240610 a-bogus更新(Playwright版本)
|
||||
post_data = {}
|
||||
if request_method == "POST":
|
||||
post_data = params
|
||||
a_bogus = await get_a_bogus(query_string, post_data, headers["User-Agent"], self.playwright_page)
|
||||
params["a_bogus"] = a_bogus
|
||||
|
||||
async def request(self, method, url, **kwargs):
|
||||
async with httpx.AsyncClient(proxies=self.proxies) as client:
|
||||
response = await client.request(
|
||||
method, url, timeout=self.timeout,
|
||||
**kwargs
|
||||
)
|
||||
response = None
|
||||
if method == "GET":
|
||||
response = requests.request(method, url, **kwargs)
|
||||
elif method == "POST":
|
||||
response = requests.request(method, url, **kwargs)
|
||||
try:
|
||||
if response.text == "" or response.text == "blocked":
|
||||
utils.logger.error(f"request params incrr, response.text: {response.text}")
|
||||
raise Exception("account blocked")
|
||||
return response.json()
|
||||
except Exception as e:
|
||||
raise DataFetchError(f"{e}, {response.text}")
|
||||
|
||||
async def get(self, uri: str, params: Optional[Dict] = None, headers: Optional[Dict] = None):
|
||||
"""
|
||||
GET请求
|
||||
"""
|
||||
await self.__process_req_params(params, headers)
|
||||
headers = headers or self.headers
|
||||
return await self.request(method="GET", url=f"{self._host}{uri}", params=params, headers=headers)
|
||||
|
@ -117,27 +137,30 @@ class DOUYINClient(AbstractApiClient):
|
|||
:param publish_time: ·
|
||||
:return:
|
||||
"""
|
||||
params = {
|
||||
"keyword": urllib.parse.quote(keyword),
|
||||
"search_channel": search_channel.value,
|
||||
"search_source": "normal_search",
|
||||
"query_correct_type": 1,
|
||||
"is_filter_search": 0,
|
||||
"offset": offset,
|
||||
"count": 10 # must be set to 10
|
||||
query_params = {
|
||||
'search_channel': search_channel.value,
|
||||
'enable_history': '1',
|
||||
'keyword': urllib.parse.quote(keyword),
|
||||
'search_source': 'tab_search',
|
||||
'query_correct_type': '1',
|
||||
'is_filter_search': '0',
|
||||
'from_group_id': '7378810571505847586',
|
||||
'offset': offset,
|
||||
'count': '15',
|
||||
'need_filter_settings': '1',
|
||||
'list_type': 'multi',
|
||||
}
|
||||
if sort_type != SearchSortType.GENERAL or publish_time != PublishTimeType.UNLIMITED:
|
||||
params["filter_selected"] = urllib.parse.quote(json.dumps({
|
||||
if sort_type.value != SearchSortType.GENERAL.value or publish_time.value != PublishTimeType.UNLIMITED.value:
|
||||
query_params["filter_selected"] = urllib.parse.quote(json.dumps({
|
||||
"sort_type": str(sort_type.value),
|
||||
"publish_time": str(publish_time.value)
|
||||
}))
|
||||
params["is_filter_search"] = 1
|
||||
params["search_source"] = "tab_search"
|
||||
referer_url = "https://www.douyin.com/search/" + keyword
|
||||
referer_url += f"?publish_time={publish_time.value}&sort_type={sort_type.value}&type=general"
|
||||
query_params["is_filter_search"] = 1
|
||||
query_params["search_source"] = "tab_search"
|
||||
referer_url = f"https://www.douyin.com/search/{keyword}?aid=f594bbd9-a0e2-4651-9319-ebe3cb6298c1&type=general"
|
||||
headers = copy.copy(self.headers)
|
||||
headers["Referer"] = urllib.parse.quote(referer_url, safe=':/')
|
||||
return await self.get("/aweme/v1/web/general/search/single/", params, headers=headers)
|
||||
return await self.get("/aweme/v1/web/general/search/single/", query_params, headers=headers)
|
||||
|
||||
async def get_video_by_id(self, aweme_id: str) -> Any:
|
||||
"""
|
||||
|
@ -149,7 +172,6 @@ class DOUYINClient(AbstractApiClient):
|
|||
"aweme_id": aweme_id
|
||||
}
|
||||
headers = copy.copy(self.headers)
|
||||
# headers["Cookie"] = "s_v_web_id=verify_lol4a8dv_wpQ1QMyP_xemd_4wON_8Yzr_FJa8DN1vdY2m;"
|
||||
del headers["Origin"]
|
||||
res = await self.get("/aweme/v1/web/aweme/detail/", params, headers)
|
||||
return res.get("aweme_detail", {})
|
||||
|
@ -259,7 +281,9 @@ class DOUYINClient(AbstractApiClient):
|
|||
"count": 18,
|
||||
"max_cursor": max_cursor,
|
||||
"locate_query": "false",
|
||||
"publish_video_strategy_type": 2
|
||||
"publish_video_strategy_type": 2,
|
||||
'verifyFp': 'verify_lx901cuk_K7kaK4dK_bn2E_4dgk_BxAA_E0XS1VtUi130',
|
||||
'fp': 'verify_lx901cuk_K7kaK4dK_bn2E_4dgk_BxAA_E0XS1VtUi130'
|
||||
}
|
||||
return await self.get(uri, params)
|
||||
|
||||
|
|
|
@ -26,7 +26,6 @@ class DouYinCrawler(AbstractCrawler):
|
|||
browser_context: BrowserContext
|
||||
|
||||
def __init__(self) -> None:
|
||||
self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36" # fixed
|
||||
self.index_url = "https://www.douyin.com"
|
||||
|
||||
async def start(self) -> None:
|
||||
|
@ -42,7 +41,7 @@ class DouYinCrawler(AbstractCrawler):
|
|||
self.browser_context = await self.launch_browser(
|
||||
chromium,
|
||||
None,
|
||||
self.user_agent,
|
||||
user_agent=None,
|
||||
headless=config.HEADLESS
|
||||
)
|
||||
# stealth.min.js is a js script to prevent the website from detecting the crawler.
|
||||
|
@ -225,7 +224,7 @@ class DouYinCrawler(AbstractCrawler):
|
|||
douyin_client = DOUYINClient(
|
||||
proxies=httpx_proxy,
|
||||
headers={
|
||||
"User-Agent": self.user_agent,
|
||||
"User-Agent": await self.context_page.evaluate("() => navigator.userAgent"),
|
||||
"Cookie": cookie_str,
|
||||
"Host": "www.douyin.com",
|
||||
"Origin": "https://www.douyin.com/",
|
||||
|
|
|
@ -0,0 +1,53 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# @Author : relakkes@gmail.com
|
||||
# @Name : 程序员阿江-Relakkes
|
||||
# @Time : 2024/6/10 02:24
|
||||
# @Desc : 获取 a_bogus 参数, 学习交流使用,请勿用作商业用途,侵权联系作者删除
|
||||
|
||||
import random
|
||||
|
||||
from playwright.async_api import Page
|
||||
|
||||
|
||||
def get_web_id() -> str:
    """
    Generate a random 19-digit ``webid`` string.

    The UUID-like template ``10000000-1000-4000-8000-100000000000`` is
    scanned character by character. Every ``0``, ``1`` or ``8`` is replaced
    by the decimal text of ``d ^ (r >> (d // 4))``, where ``d`` is the digit
    itself and ``r`` is a fresh random integer in ``[0, 15]``; every other
    character is kept as is. The dashes are then removed and the first 19
    characters are returned.

    Returns:
        A string of exactly 19 decimal digits.
    """
    template = "10000000-1000-4000-8000-100000000000"

    def scramble(digit: int) -> str:
        # One random draw per replaced digit. The result may be two
        # characters long (values 10-15), so the final length is enforced
        # by truncation below rather than assumed from the template.
        return str(digit ^ (int(16 * random.random()) >> (digit // 4)))

    web_id = "".join(
        scramble(int(ch)) if ch in "018" else ch for ch in template
    )
    return web_id.replace("-", "")[:19]
|
||||
|
||||
|
||||
async def get_a_bogus(params: str, post_data: dict, user_agent: str, page: "Page" = None):
    """
    Compute the ``a_bogus`` request parameter.

    This is a thin entry point that delegates to the Playwright-based
    implementation.

    Args:
        params: Query string to be signed.
        post_data: Data of a POST request; may be empty for other requests.
        user_agent: User-Agent string passed to the signing function.
        page: Playwright page in which the signing JavaScript is evaluated.
            It defaults to ``None`` but is required, because the only
            implementation available here runs inside the page.

    Returns:
        The value produced by the in-page signing function.

    Raises:
        ValueError: If ``page`` is ``None``.
    """
    if page is None:
        # Fail early with a clear message instead of an AttributeError on
        # ``None.evaluate`` further down the call chain.
        raise ValueError("get_a_bogus requires a Playwright page to evaluate the signing function")
    return await get_a_bogus_from_playright(params, post_data, user_agent, page)
|
||||
|
||||
|
||||
async def get_a_bogus_from_playright(params: str, post_data: dict, user_agent: str, page: "Page"):
    """
    Obtain the ``a_bogus`` parameter by evaluating JavaScript in a Playwright page.

    The page is asked to call ``window.bdms.init._v[2].p[42]`` with the
    arguments ``[0, 1, 8, params, post_data, ua]``.
    NOTE(review): the meaning of the leading ``0, 1, 8`` constants and of the
    ``_v[2].p[42]`` lookup is not visible here; they presumably depend on the
    script currently served by the site, so confirm before changing them.

    Args:
        params: Query string to be signed.
        post_data: Data of a POST request. Any falsy value (``None`` or an
            empty dict) is sent to the page as an empty string.
        user_agent: User-Agent string passed to the signing function.
        page: Playwright page that exposes ``window.bdms``.

    Returns:
        Whatever the in-page function returns.
    """
    # Normalise a missing or empty body to "" without rebinding the caller's argument.
    body = post_data if post_data else ""
    return await page.evaluate(
        "([params, post_data, ua]) => window.bdms.init._v[2].p[42].apply(null, [0, 1, 8, params, post_data, ua])",
        [params, body, user_agent],
    )
|
||||
|
|
@ -2,7 +2,6 @@ httpx==0.24.0
|
|||
Pillow==9.5.0
|
||||
playwright==1.42.0
|
||||
tenacity==8.2.2
|
||||
PyExecJS==1.5.1
|
||||
opencv-python
|
||||
aiomysql==0.2.0
|
||||
redis~=4.6.0
|
||||
|
@ -14,3 +13,4 @@ python-dotenv==1.0.1
|
|||
jieba==0.42.1
|
||||
wordcloud==1.9.3
|
||||
matplotlib==3.9.0
|
||||
requests==2.32.3
|
|
@ -7,8 +7,8 @@ from typing import List
|
|||
import config
|
||||
|
||||
from . import xhs_store_impl
|
||||
from .xhs_store_impl import *
|
||||
from .xhs_store_image import *
|
||||
from .xhs_store_impl import *
|
||||
|
||||
|
||||
class XhsStoreFactory:
|
||||
|
|
Loading…
Reference in New Issue