feat: 抖音abogus参数更新

This commit is contained in:
Relakkes 2024-07-14 03:20:05 +08:00
parent 0807862b63
commit f8096e3d58
7 changed files with 141 additions and 642 deletions

View File

@ -81,6 +81,13 @@
## 开发者服务 ## 开发者服务
- MediaCrawler视频课程
> 课程介绍飞书文档链接https://relakkes.feishu.cn/wiki/JUgBwdhIeiSbAwkFCLkciHdAnhh
> 如果你想很快入门这个项目,或者想了解具体实现原理,我推荐你看看这个视频课程,从设计出发一步步带你如何使用,门槛大大降低
>
> 同时也是对我开源的支持,如果你能支持我的课程,我将会非常开心~<br>
- 知识星球:沉淀高质量常见问题、最佳实践文档、多年编程+爬虫经验分享,提供付费知识星球服务,主动提问,作者会定期回答问题 (每天 1 块钱订阅我的知识服务) - 知识星球:沉淀高质量常见问题、最佳实践文档、多年编程+爬虫经验分享,提供付费知识星球服务,主动提问,作者会定期回答问题 (每天 1 块钱订阅我的知识服务)
<p> <p>
<img alt="xingqiu" src="https://nm.zizhi1.com/static/img/8e1312d1f52f2e0ff436ea7196b4e27b.15555424244122T1.webp" style="width: auto;height: 400px" > <img alt="xingqiu" src="https://nm.zizhi1.com/static/img/8e1312d1f52f2e0ff436ea7196b4e27b.15555424244122T1.webp" style="width: auto;height: 400px" >
@ -93,21 +100,15 @@
- [ 手把手带你撸一个自己的IP代理池](https://articles.zsxq.com/id_38fza371ladm.html) - [ 手把手带你撸一个自己的IP代理池](https://articles.zsxq.com/id_38fza371ladm.html)
- MediaCrawler视频课程
> 如果你想很快入门这个项目,或者想了解具体实现原理,我推荐你看看这个视频课程,从设计出发一步步带你如何使用,门槛大大降低,同时也是对我开源的支持,如果你能支持我的课程,我将会非常开心~<br>
> 课程售价非常非常的便宜,几杯咖啡的事儿.<br>
> 课程介绍飞书文档链接https://relakkes.feishu.cn/wiki/JUgBwdhIeiSbAwkFCLkciHdAnhh
## 感谢下列Sponsors对本仓库赞助 ## 感谢下列Sponsors对本仓库赞助
- <a href="https://sider.ai/ad-land-redirect?source=github&p1=mi&p2=kk">通过注册这个款免费的GPT助手帮我获取GPT4额度作为支持。也是我每天在用的一款chrome AI助手插件</a>
<br>
- 感谢 [JetBrains](https://www.jetbrains.com/?from=gaowei-space/markdown-blog) 对本项目的支持! - 感谢 [JetBrains](https://www.jetbrains.com/?from=gaowei-space/markdown-blog) 对本项目的支持!
<a href="https://www.jetbrains.com/?from=NanmiCoder/MediaCrawler" target="_blank"> <a href="https://www.jetbrains.com/?from=NanmiCoder/MediaCrawler" target="_blank">
<img src="https://resources.jetbrains.com/storage/products/company/brand/logos/jb_beam.png" width="100" height="100"> <img src="https://resources.jetbrains.com/storage/products/company/brand/logos/jb_beam.png" width="100" height="100">
</a> </a>
<br> <br>
- <a href="https://sider.ai/ad-land-redirect?source=github&p1=mi&p2=kk">通过注册这个款免费的GPT助手帮我获取GPT4额度作为支持。也是我每天在用的一款chrome AI助手插件</a>
成为赞助者展示你的产品在这里联系作者relakkes@gmail.com 成为赞助者展示你的产品在这里联系作者relakkes@gmail.com

File diff suppressed because one or more lines are too long

View File

@ -2,11 +2,10 @@ import asyncio
import copy import copy
import json import json
import urllib.parse import urllib.parse
from typing import Any, Callable, Dict, List, Optional from typing import Any, Callable, Dict, Optional
import execjs import requests
import httpx from playwright.async_api import BrowserContext
from playwright.async_api import BrowserContext, Page
from base.base_crawler import AbstractApiClient from base.base_crawler import AbstractApiClient
from tools import utils from tools import utils
@ -14,6 +13,7 @@ from var import request_keyword_var
from .exception import * from .exception import *
from .field import * from .field import *
from .help import *
class DOUYINClient(AbstractApiClient): class DOUYINClient(AbstractApiClient):
@ -33,51 +33,71 @@ class DOUYINClient(AbstractApiClient):
self.playwright_page = playwright_page self.playwright_page = playwright_page
self.cookie_dict = cookie_dict self.cookie_dict = cookie_dict
async def __process_req_params(self, params: Optional[Dict] = None, headers: Optional[Dict] = None): async def __process_req_params(
self, params: Optional[Dict] = None, headers: Optional[Dict] = None,
request_method="GET"
):
if not params: if not params:
return return
headers = headers or self.headers headers = headers or self.headers
local_storage: Dict = await self.playwright_page.evaluate("() => window.localStorage") # type: ignore local_storage: Dict = await self.playwright_page.evaluate("() => window.localStorage") # type: ignore
douyin_js_obj = execjs.compile(open('libs/douyin.js').read())
common_params = { common_params = {
"device_platform": "webapp", "device_platform": "webapp",
"aid": "6383", "aid": "6383",
"channel": "channel_pc_web", "channel": "channel_pc_web",
"version_code": "190600",
"version_name": "19.6.0",
"update_version_code": "170400",
"pc_client_type": "1",
"cookie_enabled": "true", "cookie_enabled": "true",
"browser_language": "zh-CN", "browser_language": "zh-CN",
"browser_platform": "Win32", "browser_platform": "MacIntel",
"browser_name": "Firefox", "browser_name": "Chrome",
"browser_version": "110.0", "browser_version": "125.0.0.0",
"browser_online": "true", "browser_online": "true",
"engine_name": "Gecko", "engine_name": "Blink",
"os_name": "Windows", "os_name": "Mac OS",
"os_version": "10", "os_version": "10.15.7",
"cpu_core_num": "8",
"device_memory": "8",
"engine_version": "109.0", "engine_version": "109.0",
"platform": "PC", "platform": "PC",
"screen_width": "1920", "screen_width": "2560",
"screen_height": "1200", "screen_height": "1440",
# " webid": douyin_js_obj.call("get_web_id"), 'effective_type': '4g',
# "msToken": local_storage.get("xmst"), "round_trip_time": "50",
# "msToken": "abL8SeUTPa9-EToD8qfC7toScSADxpg6yLh2dbNcpWHzE0bT04txM_4UwquIcRvkRb9IU8sifwgM1Kwf1Lsld81o9Irt2_yNyUbbQPSUO8EfVlZJ_78FckDFnwVBVUVK", "webid": get_web_id(),
"msToken": local_storage.get("xmst"),
} }
params.update(common_params) params.update(common_params)
query = '&'.join([f'{k}={v}' for k, v in params.items()]) query_string = urllib.parse.urlencode(params)
x_bogus = douyin_js_obj.call('sign', query, headers["User-Agent"])
params["X-Bogus"] = x_bogus # 20240610 a-bogus更新Playwright版本
# print(x_bogus, query) post_data = {}
if request_method == "POST":
post_data = params
a_bogus = await get_a_bogus(query_string, post_data, headers["User-Agent"], self.playwright_page)
params["a_bogus"] = a_bogus
async def request(self, method, url, **kwargs): async def request(self, method, url, **kwargs):
async with httpx.AsyncClient(proxies=self.proxies) as client: response = None
response = await client.request( if method == "GET":
method, url, timeout=self.timeout, response = requests.request(method, url, **kwargs)
**kwargs elif method == "POST":
) response = requests.request(method, url, **kwargs)
try: try:
return response.json() if response.text == "" or response.text == "blocked":
except Exception as e: utils.logger.error(f"request params incrr, response.text: {response.text}")
raise DataFetchError(f"{e}, {response.text}") raise Exception("account blocked")
return response.json()
except Exception as e:
raise DataFetchError(f"{e}, {response.text}")
async def get(self, uri: str, params: Optional[Dict] = None, headers: Optional[Dict] = None): async def get(self, uri: str, params: Optional[Dict] = None, headers: Optional[Dict] = None):
"""
GET请求
"""
await self.__process_req_params(params, headers) await self.__process_req_params(params, headers)
headers = headers or self.headers headers = headers or self.headers
return await self.request(method="GET", url=f"{self._host}{uri}", params=params, headers=headers) return await self.request(method="GET", url=f"{self._host}{uri}", params=params, headers=headers)
@ -117,27 +137,30 @@ class DOUYINClient(AbstractApiClient):
:param publish_time: · :param publish_time: ·
:return: :return:
""" """
params = { query_params = {
"keyword": urllib.parse.quote(keyword), 'search_channel': search_channel.value,
"search_channel": search_channel.value, 'enable_history': '1',
"search_source": "normal_search", 'keyword': urllib.parse.quote(keyword),
"query_correct_type": 1, 'search_source': 'tab_search',
"is_filter_search": 0, 'query_correct_type': '1',
"offset": offset, 'is_filter_search': '0',
"count": 10 # must be set to 10 'from_group_id': '7378810571505847586',
'offset': offset,
'count': '15',
'need_filter_settings': '1',
'list_type': 'multi',
} }
if sort_type != SearchSortType.GENERAL or publish_time != PublishTimeType.UNLIMITED: if sort_type.value != SearchSortType.GENERAL.value or publish_time.value != PublishTimeType.UNLIMITED.value:
params["filter_selected"] = urllib.parse.quote(json.dumps({ query_params["filter_selected"] = urllib.parse.quote(json.dumps({
"sort_type": str(sort_type.value), "sort_type": str(sort_type.value),
"publish_time": str(publish_time.value) "publish_time": str(publish_time.value)
})) }))
params["is_filter_search"] = 1 query_params["is_filter_search"] = 1
params["search_source"] = "tab_search" query_params["search_source"] = "tab_search"
referer_url = "https://www.douyin.com/search/" + keyword referer_url = f"https://www.douyin.com/search/{keyword}?aid=f594bbd9-a0e2-4651-9319-ebe3cb6298c1&type=general"
referer_url += f"?publish_time={publish_time.value}&sort_type={sort_type.value}&type=general"
headers = copy.copy(self.headers) headers = copy.copy(self.headers)
headers["Referer"] = urllib.parse.quote(referer_url, safe=':/') headers["Referer"] = urllib.parse.quote(referer_url, safe=':/')
return await self.get("/aweme/v1/web/general/search/single/", params, headers=headers) return await self.get("/aweme/v1/web/general/search/single/", query_params, headers=headers)
async def get_video_by_id(self, aweme_id: str) -> Any: async def get_video_by_id(self, aweme_id: str) -> Any:
""" """
@ -149,7 +172,6 @@ class DOUYINClient(AbstractApiClient):
"aweme_id": aweme_id "aweme_id": aweme_id
} }
headers = copy.copy(self.headers) headers = copy.copy(self.headers)
# headers["Cookie"] = "s_v_web_id=verify_lol4a8dv_wpQ1QMyP_xemd_4wON_8Yzr_FJa8DN1vdY2m;"
del headers["Origin"] del headers["Origin"]
res = await self.get("/aweme/v1/web/aweme/detail/", params, headers) res = await self.get("/aweme/v1/web/aweme/detail/", params, headers)
return res.get("aweme_detail", {}) return res.get("aweme_detail", {})
@ -259,7 +281,9 @@ class DOUYINClient(AbstractApiClient):
"count": 18, "count": 18,
"max_cursor": max_cursor, "max_cursor": max_cursor,
"locate_query": "false", "locate_query": "false",
"publish_video_strategy_type": 2 "publish_video_strategy_type": 2,
'verifyFp': 'verify_lx901cuk_K7kaK4dK_bn2E_4dgk_BxAA_E0XS1VtUi130',
'fp': 'verify_lx901cuk_K7kaK4dK_bn2E_4dgk_BxAA_E0XS1VtUi130'
} }
return await self.get(uri, params) return await self.get(uri, params)

View File

@ -26,7 +26,6 @@ class DouYinCrawler(AbstractCrawler):
browser_context: BrowserContext browser_context: BrowserContext
def __init__(self) -> None: def __init__(self) -> None:
self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36" # fixed
self.index_url = "https://www.douyin.com" self.index_url = "https://www.douyin.com"
async def start(self) -> None: async def start(self) -> None:
@ -42,7 +41,7 @@ class DouYinCrawler(AbstractCrawler):
self.browser_context = await self.launch_browser( self.browser_context = await self.launch_browser(
chromium, chromium,
None, None,
self.user_agent, user_agent=None,
headless=config.HEADLESS headless=config.HEADLESS
) )
# stealth.min.js is a js script to prevent the website from detecting the crawler. # stealth.min.js is a js script to prevent the website from detecting the crawler.
@ -225,7 +224,7 @@ class DouYinCrawler(AbstractCrawler):
douyin_client = DOUYINClient( douyin_client = DOUYINClient(
proxies=httpx_proxy, proxies=httpx_proxy,
headers={ headers={
"User-Agent": self.user_agent, "User-Agent": await self.context_page.evaluate("() => navigator.userAgent"),
"Cookie": cookie_str, "Cookie": cookie_str,
"Host": "www.douyin.com", "Host": "www.douyin.com",
"Origin": "https://www.douyin.com/", "Origin": "https://www.douyin.com/",

View File

@ -0,0 +1,53 @@
# -*- coding: utf-8 -*-
# @Author : relakkes@gmail.com
# @Name : 程序员阿江-Relakkes
# @Time : 2024/6/10 02:24
# @Desc : 获取 a_bogus 参数, 学习交流使用,请勿用作商业用途,侵权联系作者删除
import random
from playwright.async_api import Page
def get_web_id():
    """
    Generate a pseudo-random 19-digit webid string.

    Mirrors the JS trick Douyin uses client-side: start from a
    UUID-shaped numeric template, substitute every '0', '1' and '8'
    digit with a randomized value, then drop the dashes and keep the
    first 19 characters.

    Returns:
        str: a 19-character string of decimal digits.
    """
    def fill(slot):
        # slot is None -> produce the dash-separated numeric template
        if slot is None:
            pieces = [
                str(int(1e7)), '-',
                str(int(1e3)), '-',
                str(int(4e3)), '-',
                str(int(8e3)), '-',
                str(int(1e11)),
            ]
            return ''.join(pieces)
        # otherwise randomize the digit: slot ^ (rand(0..15) >> (slot // 4))
        return str(slot ^ (int(16 * random.random()) >> (slot // 4)))

    template = fill(None)
    randomized = ''.join(
        fill(int(ch)) if ch in '018' else ch for ch in template
    )
    return randomized.replace('-', '')[:19]
async def get_a_bogus(params: str, post_data: dict, user_agent: str, page: Page = None):
    """
    Compute the a_bogus signature parameter.

    Currently delegates to the Playwright-based implementation; kept as
    a separate entry point so alternative signers can be swapped in.

    Args:
        params: the URL-encoded query string to sign.
        post_data: POST body dict (empty/None for GET requests).
        user_agent: the browser User-Agent string used for signing.
        page: a live Playwright page with the Douyin signer loaded.

    Returns:
        str: the computed a_bogus value.
    """
    signature = await get_a_bogus_from_playright(params, post_data, user_agent, page)
    return signature
async def get_a_bogus_from_playright(params: str, post_data: dict, user_agent: str, page: Page):
    """
    Compute a_bogus by invoking Douyin's in-page signer via Playwright.

    Calls the window.bdms signing function directly inside the browser
    context, so the page must have the Douyin site scripts loaded.

    Args:
        params: the URL-encoded query string to sign.
        post_data: POST body dict; an empty/missing body is passed as "".
        user_agent: the browser User-Agent string used for signing.
        page: the Playwright page to evaluate the signer in.

    Returns:
        str: the computed a_bogus value.
    """
    # the signer expects a string placeholder when there is no POST body
    body = post_data if post_data else ""
    js_expr = "([params, post_data, ua]) => window.bdms.init._v[2].p[42].apply(null, [0, 1, 8, params, post_data, ua])"
    return await page.evaluate(js_expr, [params, body, user_agent])

View File

@ -2,7 +2,6 @@ httpx==0.24.0
Pillow==9.5.0 Pillow==9.5.0
playwright==1.42.0 playwright==1.42.0
tenacity==8.2.2 tenacity==8.2.2
PyExecJS==1.5.1
opencv-python opencv-python
aiomysql==0.2.0 aiomysql==0.2.0
redis~=4.6.0 redis~=4.6.0
@ -13,4 +12,5 @@ uvicorn==0.29.0
python-dotenv==1.0.1 python-dotenv==1.0.1
jieba==0.42.1 jieba==0.42.1
wordcloud==1.9.3 wordcloud==1.9.3
matplotlib==3.9.0 matplotlib==3.9.0
requests==2.32.3

View File

@ -7,8 +7,8 @@ from typing import List
import config import config
from . import xhs_store_impl from . import xhs_store_impl
from .xhs_store_impl import *
from .xhs_store_image import * from .xhs_store_image import *
from .xhs_store_impl import *
class XhsStoreFactory: class XhsStoreFactory: