feat: update Douyin a_bogus parameter

This commit is contained in:
Relakkes 2024-07-14 03:20:05 +08:00
parent 0807862b63
commit f8096e3d58
7 changed files with 141 additions and 642 deletions

View File

@@ -81,6 +81,13 @@
## Developer Services
- MediaCrawler video course
> Course introduction (Feishu doc): https://relakkes.feishu.cn/wiki/JUgBwdhIeiSbAwkFCLkciHdAnhh
> If you want to get started with this project quickly, or want to understand how it is implemented, I recommend this video course. It starts from the design and walks you through using the project step by step, greatly lowering the barrier to entry.
>
> It is also a way of supporting my open-source work. If you can support the course, I will be very happy~<br>
- Knowledge Planet (知识星球): curated high-quality FAQs, best-practice docs, and years of programming + crawling experience, offered as a paid Knowledge Planet service. Ask questions and the author answers them regularly (subscribe to my knowledge service for about 1 yuan a day).
<p>
<img alt="xingqiu" src="https://nm.zizhi1.com/static/img/8e1312d1f52f2e0ff436ea7196b4e27b.15555424244122T1.webp" style="width: auto;height: 400px" >
@@ -94,20 +101,14 @@
- MediaCrawler video course
> If you want to get started with this project quickly, or want to understand how it is implemented, I recommend this video course. It starts from the design and walks you through using the project step by step, greatly lowering the barrier to entry. It is also a way of supporting my open-source work; if you can support the course, I will be very happy~<br>
> The course is very, very cheap, just the price of a few cups of coffee.<br>
> Course introduction (Feishu doc): https://relakkes.feishu.cn/wiki/JUgBwdhIeiSbAwkFCLkciHdAnhh
## Thanks to the following sponsors for supporting this repository
- <a href="https://sider.ai/ad-land-redirect?source=github&p1=mi&p2=kk">Support me by signing up for this free GPT assistant, which earns me GPT-4 credits. It is also the Chrome AI assistant extension I use every day.</a>
<br>
- Thanks to [JetBrains](https://www.jetbrains.com/?from=gaowei-space/markdown-blog) for supporting this project!
<a href="https://www.jetbrains.com/?from=NanmiCoder/MediaCrawler" target="_blank">
<img src="https://resources.jetbrains.com/storage/products/company/brand/logos/jb_beam.png" width="100" height="100">
</a>
<br>
- <a href="https://sider.ai/ad-land-redirect?source=github&p1=mi&p2=kk">通过注册这个款免费的GPT助手帮我获取GPT4额度作为支持。也是我每天在用的一款chrome AI助手插件</a>
成为赞助者展示你的产品在这里联系作者relakkes@gmail.com

File diff suppressed because one or more lines are too long

View File

@@ -2,11 +2,10 @@ import asyncio
import copy
import json
import urllib.parse
from typing import Any, Callable, Dict, List, Optional
from typing import Any, Callable, Dict, Optional
import execjs
import httpx
from playwright.async_api import BrowserContext, Page
import requests
from playwright.async_api import BrowserContext
from base.base_crawler import AbstractApiClient
from tools import utils
@@ -14,6 +13,7 @@ from var import request_keyword_var
from .exception import *
from .field import *
from .help import *
class DOUYINClient(AbstractApiClient):
@@ -33,51 +33,71 @@ class DOUYINClient(AbstractApiClient):
self.playwright_page = playwright_page
self.cookie_dict = cookie_dict
async def __process_req_params(self, params: Optional[Dict] = None, headers: Optional[Dict] = None):
async def __process_req_params(
self, params: Optional[Dict] = None, headers: Optional[Dict] = None,
request_method="GET"
):
if not params:
return
headers = headers or self.headers
local_storage: Dict = await self.playwright_page.evaluate("() => window.localStorage") # type: ignore
douyin_js_obj = execjs.compile(open('libs/douyin.js').read())
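# Browser fingerprint parameters attached to every web API request; they describe a Chrome 125
# on macOS profile and should stay consistent with the User-Agent used when signing a_bogus below.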
common_params = {
"device_platform": "webapp",
"aid": "6383",
"channel": "channel_pc_web",
"version_code": "190600",
"version_name": "19.6.0",
"update_version_code": "170400",
"pc_client_type": "1",
"cookie_enabled": "true",
"browser_language": "zh-CN",
"browser_platform": "Win32",
"browser_name": "Firefox",
"browser_version": "110.0",
"browser_platform": "MacIntel",
"browser_name": "Chrome",
"browser_version": "125.0.0.0",
"browser_online": "true",
"engine_name": "Gecko",
"os_name": "Windows",
"os_version": "10",
"engine_name": "Blink",
"os_name": "Mac OS",
"os_version": "10.15.7",
"cpu_core_num": "8",
"device_memory": "8",
"engine_version": "109.0",
"platform": "PC",
"screen_width": "1920",
"screen_height": "1200",
# " webid": douyin_js_obj.call("get_web_id"),
# "msToken": local_storage.get("xmst"),
# "msToken": "abL8SeUTPa9-EToD8qfC7toScSADxpg6yLh2dbNcpWHzE0bT04txM_4UwquIcRvkRb9IU8sifwgM1Kwf1Lsld81o9Irt2_yNyUbbQPSUO8EfVlZJ_78FckDFnwVBVUVK",
"screen_width": "2560",
"screen_height": "1440",
'effective_type': '4g',
"round_trip_time": "50",
"webid": get_web_id(),
"msToken": local_storage.get("xmst"),
}
params.update(common_params)
query = '&'.join([f'{k}={v}' for k, v in params.items()])
x_bogus = douyin_js_obj.call('sign', query, headers["User-Agent"])
params["X-Bogus"] = x_bogus
# print(x_bogus, query)
query_string = urllib.parse.urlencode(params)
# 2024-06-10: a_bogus update, Playwright-based version
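# The URL-encoded query string, the POST body (empty for GET requests) and the User-Agent
# are signed inside the real Douyin page via Playwright (see get_a_bogus in help.py);
# the resulting a_bogus value replaces the old X-Bogus parameter.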
post_data = {}
if request_method == "POST":
post_data = params
a_bogus = await get_a_bogus(query_string, post_data, headers["User-Agent"], self.playwright_page)
params["a_bogus"] = a_bogus
async def request(self, method, url, **kwargs):
async with httpx.AsyncClient(proxies=self.proxies) as client:
response = await client.request(
method, url, timeout=self.timeout,
**kwargs
)
response = None
if method == "GET":
response = requests.request(method, url, **kwargs)
elif method == "POST":
response = requests.request(method, url, **kwargs)
try:
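# Douyin returns an empty body or the literal string "blocked" when the request or account is rejected.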
if response.text == "" or response.text == "blocked":
utils.logger.error(f"request params incrr, response.text: {response.text}")
raise Exception("account blocked")
return response.json()
except Exception as e:
raise DataFetchError(f"{e}, {response.text}")
async def get(self, uri: str, params: Optional[Dict] = None, headers: Optional[Dict] = None):
"""
GET request
"""
await self.__process_req_params(params, headers)
headers = headers or self.headers
return await self.request(method="GET", url=f"{self._host}{uri}", params=params, headers=headers)
@@ -117,27 +137,30 @@ class DOUYINClient(AbstractApiClient):
:param publish_time: ·
:return:
"""
params = {
"keyword": urllib.parse.quote(keyword),
"search_channel": search_channel.value,
"search_source": "normal_search",
"query_correct_type": 1,
"is_filter_search": 0,
"offset": offset,
"count": 10 # must be set to 10
query_params = {
'search_channel': search_channel.value,
'enable_history': '1',
'keyword': urllib.parse.quote(keyword),
'search_source': 'tab_search',
'query_correct_type': '1',
'is_filter_search': '0',
'from_group_id': '7378810571505847586',
'offset': offset,
'count': '15',
'need_filter_settings': '1',
'list_type': 'multi',
}
if sort_type != SearchSortType.GENERAL or publish_time != PublishTimeType.UNLIMITED:
params["filter_selected"] = urllib.parse.quote(json.dumps({
if sort_type.value != SearchSortType.GENERAL.value or publish_time.value != PublishTimeType.UNLIMITED.value:
query_params["filter_selected"] = urllib.parse.quote(json.dumps({
"sort_type": str(sort_type.value),
"publish_time": str(publish_time.value)
}))
params["is_filter_search"] = 1
params["search_source"] = "tab_search"
referer_url = "https://www.douyin.com/search/" + keyword
referer_url += f"?publish_time={publish_time.value}&sort_type={sort_type.value}&type=general"
query_params["is_filter_search"] = 1
query_params["search_source"] = "tab_search"
referer_url = f"https://www.douyin.com/search/{keyword}?aid=f594bbd9-a0e2-4651-9319-ebe3cb6298c1&type=general"
headers = copy.copy(self.headers)
headers["Referer"] = urllib.parse.quote(referer_url, safe=':/')
return await self.get("/aweme/v1/web/general/search/single/", params, headers=headers)
return await self.get("/aweme/v1/web/general/search/single/", query_params, headers=headers)
async def get_video_by_id(self, aweme_id: str) -> Any:
"""
@@ -149,7 +172,6 @@ class DOUYINClient(AbstractApiClient):
"aweme_id": aweme_id
}
headers = copy.copy(self.headers)
# headers["Cookie"] = "s_v_web_id=verify_lol4a8dv_wpQ1QMyP_xemd_4wON_8Yzr_FJa8DN1vdY2m;"
del headers["Origin"]
res = await self.get("/aweme/v1/web/aweme/detail/", params, headers)
return res.get("aweme_detail", {})
@@ -259,7 +281,9 @@ class DOUYINClient(AbstractApiClient):
"count": 18,
"max_cursor": max_cursor,
"locate_query": "false",
"publish_video_strategy_type": 2
"publish_video_strategy_type": 2,
'verifyFp': 'verify_lx901cuk_K7kaK4dK_bn2E_4dgk_BxAA_E0XS1VtUi130',
'fp': 'verify_lx901cuk_K7kaK4dK_bn2E_4dgk_BxAA_E0XS1VtUi130'
}
return await self.get(uri, params)

View File

@@ -26,7 +26,6 @@ class DouYinCrawler(AbstractCrawler):
browser_context: BrowserContext
def __init__(self) -> None:
self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36" # fixed
self.index_url = "https://www.douyin.com"
async def start(self) -> None:
@@ -42,7 +41,7 @@ class DouYinCrawler(AbstractCrawler):
self.browser_context = await self.launch_browser(
chromium,
None,
self.user_agent,
user_agent=None,
headless=config.HEADLESS
)
# stealth.min.js is a js script to prevent the website from detecting the crawler.
@@ -225,7 +224,7 @@ class DouYinCrawler(AbstractCrawler):
douyin_client = DOUYINClient(
proxies=httpx_proxy,
headers={
"User-Agent": self.user_agent,
"User-Agent": await self.context_page.evaluate("() => navigator.userAgent"),
"Cookie": cookie_str,
"Host": "www.douyin.com",
"Origin": "https://www.douyin.com/",

View File

@@ -0,0 +1,53 @@
# -*- coding: utf-8 -*-
# @Author : relakkes@gmail.com
# @Name : 程序员阿江-Relakkes
# @Time : 2024/6/10 02:24
# @Desc : Get the a_bogus parameter. For learning and research only; do not use it for commercial purposes. Contact the author for removal in case of infringement.
import random
from playwright.async_api import Page
def get_web_id():
"""
Generate a random webid
Returns:
"""
def e(t):
if t is not None:
return str(t ^ (int(16 * random.random()) >> (t // 4)))
else:
return ''.join(
[str(int(1e7)), '-', str(int(1e3)), '-', str(int(4e3)), '-', str(int(8e3)), '-', str(int(1e11))]
)
web_id = ''.join(
e(int(x)) if x in '018' else x for x in e(None)
)
return web_id.replace('-', '')[:19]
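# get_web_id() fills a UUID-like template with random digits (helper e() above), strips the
# dashes and keeps the first 19 characters, yielding a 19-digit numeric string like Douyin's webid.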
async def get_a_bogus(params: str, post_data: dict, user_agent: str, page: Page = None):
"""
Get the a_bogus parameter
"""
return await get_a_bogus_from_playright(params, post_data, user_agent, page)
async def get_a_bogus_from_playright(params: str, post_data: dict, user_agent: str, page: Page):
"""
Get the a_bogus parameter via Playwright
Returns:
"""
if not post_data:
post_data = ""
a_bogus = await page.evaluate(
"([params, post_data, ua]) => window.bdms.init._v[2].p[42].apply(null, [0, 1, 8, params, post_data, ua])",
[params, post_data, user_agent])
return a_bogus
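# The evaluate call runs Douyin's own obfuscated signer (window.bdms.init._v[2].p[42]) inside the
# already-loaded page, so the Playwright page must have douyin.com open; the fixed leading
# arguments (0, 1, 8) appear to be mode flags expected by that internal function.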

View File

@@ -2,7 +2,6 @@ httpx==0.24.0
Pillow==9.5.0
playwright==1.42.0
tenacity==8.2.2
PyExecJS==1.5.1
opencv-python
aiomysql==0.2.0
redis~=4.6.0
@@ -14,3 +13,4 @@ python-dotenv==1.0.1
jieba==0.42.1
wordcloud==1.9.3
matplotlib==3.9.0
requests==2.32.3

View File

@@ -7,8 +7,8 @@ from typing import List
import config
from . import xhs_store_impl
from .xhs_store_impl import *
from .xhs_store_image import *
from .xhs_store_impl import *
class XhsStoreFactory: