feat: 抖音abogus参数更新

2024-07-14 03:20:05 +08:00 · 2024-07-14 03:20:05 +08:00 · f8096e3d58
parent 0807862b63
commit f8096e3d58
7 changed files with 141 additions and 642 deletions
--- a/README.md
+++ b/README.md
@ -81,6 +81,13 @@


 ## 开发者服务
+- MediaCrawler视频课程：
+  > 课程介绍飞书文档链接：https://relakkes.feishu.cn/wiki/JUgBwdhIeiSbAwkFCLkciHdAnhh
+  > 如果你想很快入门这个项目，或者想了解具体实现原理，我推荐你看看这个视频课程，从设计出发一步步带你了解如何使用，门槛大大降低
+  > 
+  > 同时也是对我开源的支持，如果你能支持我的课程，我将会非常开心～<br>
+  
+
 - 知识星球：沉淀高质量常见问题、最佳实践文档、多年编程+爬虫经验分享，提供付费知识星球服务，主动提问，作者会定期回答问题 (每天 1 快钱订阅我的知识服务)
  <p>
  <img alt="xingqiu" src="https://nm.zizhi1.com/static/img/8e1312d1f52f2e0ff436ea7196b4e27b.15555424244122T1.webp" style="width: auto;height: 400px" >
@ -93,21 +100,15 @@
  - [ 手把手带你撸一个自己的IP代理池](https://articles.zsxq.com/id_38fza371ladm.html) 
  
  
-  
- MediaCrawler视频课程：
-  > 如果你想很快入门这个项目，或者想了具体实现原理，我推荐你看看这个视频课程，从设计出发一步步带你如何使用，门槛大大降低，同时也是对我开源的支持，如果你能支持我的课程，我将会非常开心～<br>
-  > 课程售价非常非常的便宜，几杯咖啡的事儿.<br>
-  > 课程介绍飞书文档链接：https://relakkes.feishu.cn/wiki/JUgBwdhIeiSbAwkFCLkciHdAnhh
-
-

 ## 感谢下列Sponsors对本仓库赞助
+- <a href="https://sider.ai/ad-land-redirect?source=github&p1=mi&p2=kk">通过注册这款免费的GPT助手，帮我获取GPT4额度作为支持。这也是我每天在用的一款Chrome AI助手插件</a>
+<br>
 - 感谢 [JetBrains](https://www.jetbrains.com/?from=gaowei-space/markdown-blog) 对本项目的支持！
 <a href="https://www.jetbrains.com/?from=NanmiCoder/MediaCrawler" target="_blank">
    <img src="https://resources.jetbrains.com/storage/products/company/brand/logos/jb_beam.png" width="100" height="100">
 </a>
 <br>
- <a href="https://sider.ai/ad-land-redirect?source=github&p1=mi&p2=kk">通过注册这个款免费的GPT助手，帮我获取GPT4额度作为支持。也是我每天在用的一款chrome AI助手插件</a>

 成为赞助者，展示你的产品在这里，联系作者：relakkes@gmail.com

--- a/libs/douyin.js
+++ b/libs/douyin.js
--- a/media_platform/douyin/client.py
+++ b/media_platform/douyin/client.py
@ -2,11 +2,10 @@ import asyncio
 import copy
 import json
 import urllib.parse
-from typing import Any, Callable, Dict, List, Optional
+from typing import Any, Callable, Dict, Optional

-import execjs
-import httpx
-from playwright.async_api import BrowserContext, Page
+import requests
+from playwright.async_api import BrowserContext

 from base.base_crawler import AbstractApiClient
 from tools import utils
@ -14,6 +13,7 @@ from var import request_keyword_var

 from .exception import *
 from .field import *
+from .help import *


 class DOUYINClient(AbstractApiClient):
@ -33,51 +33,71 @@ class DOUYINClient(AbstractApiClient):
        self.playwright_page = playwright_page
        self.cookie_dict = cookie_dict

-    async def __process_req_params(self, params: Optional[Dict] = None, headers: Optional[Dict] = None):
+    async def __process_req_params(
+            self, params: Optional[Dict] = None, headers: Optional[Dict] = None,
+            request_method="GET"
+    ):
+
        if not params:
            return
        headers = headers or self.headers
        local_storage: Dict = await self.playwright_page.evaluate("() => window.localStorage")  # type: ignore
-        douyin_js_obj = execjs.compile(open('libs/douyin.js').read())
        common_params = {
            "device_platform": "webapp",
            "aid": "6383",
            "channel": "channel_pc_web",
+            "version_code": "190600",
+            "version_name": "19.6.0",
+            "update_version_code": "170400",
+            "pc_client_type": "1",
            "cookie_enabled": "true",
            "browser_language": "zh-CN",
-            "browser_platform": "Win32",
-            "browser_name": "Firefox",
-            "browser_version": "110.0",
+            "browser_platform": "MacIntel",
+            "browser_name": "Chrome",
+            "browser_version": "125.0.0.0",
            "browser_online": "true",
-            "engine_name": "Gecko",
-            "os_name": "Windows",
-            "os_version": "10",
+            "engine_name": "Blink",
+            "os_name": "Mac OS",
+            "os_version": "10.15.7",
+            "cpu_core_num": "8",
+            "device_memory": "8",
            "engine_version": "109.0",
            "platform": "PC",
-            "screen_width": "1920",
-            "screen_height": "1200",
-            # " webid": douyin_js_obj.call("get_web_id"),
-            # "msToken": local_storage.get("xmst"),
-            # "msToken": "abL8SeUTPa9-EToD8qfC7toScSADxpg6yLh2dbNcpWHzE0bT04txM_4UwquIcRvkRb9IU8sifwgM1Kwf1Lsld81o9Irt2_yNyUbbQPSUO8EfVlZJ_78FckDFnwVBVUVK",
+            "screen_width": "2560",
+            "screen_height": "1440",
+            'effective_type': '4g',
+            "round_trip_time": "50",
+            "webid": get_web_id(),
+            "msToken": local_storage.get("xmst"),
        }
        params.update(common_params)
-        query = '&'.join([f'{k}={v}' for k, v in params.items()])
-        x_bogus = douyin_js_obj.call('sign', query, headers["User-Agent"])
-        params["X-Bogus"] = x_bogus
-        # print(x_bogus, query)
+        query_string = urllib.parse.urlencode(params)
+
+        # 20240610 a-bogus更新（Playwright版本）
+        post_data = {}
+        if request_method == "POST":
+            post_data = params
+        a_bogus = await get_a_bogus(query_string, post_data, headers["User-Agent"], self.playwright_page)
+        params["a_bogus"] = a_bogus

    async def request(self, method, url, **kwargs):
-        async with httpx.AsyncClient(proxies=self.proxies) as client:
-            response = await client.request(
-                method, url, timeout=self.timeout,
-                **kwargs
-            )
-            try:
-                return response.json()
-            except Exception as e:
-                raise DataFetchError(f"{e}, {response.text}")
+        response = None
+        if method == "GET":
+            response = requests.request(method, url, **kwargs)
+        elif method == "POST":
+            response = requests.request(method, url, **kwargs)
+        try:
+            if response.text == "" or response.text == "blocked":
+                utils.logger.error(f"request params incrr, response.text: {response.text}")
+                raise Exception("account blocked")
+            return response.json()
+        except Exception as e:
+            raise DataFetchError(f"{e}, {response.text}")

    async def get(self, uri: str, params: Optional[Dict] = None, headers: Optional[Dict] = None):
+        """
+        GET请求
+        """
        await self.__process_req_params(params, headers)
        headers = headers or self.headers
        return await self.request(method="GET", url=f"{self._host}{uri}", params=params, headers=headers)
@ -117,27 +137,30 @@ class DOUYINClient(AbstractApiClient):
        :param publish_time: ·
        :return:
        """
-        params = {
-            "keyword": urllib.parse.quote(keyword),
-            "search_channel": search_channel.value,
-            "search_source": "normal_search",
-            "query_correct_type": 1,
-            "is_filter_search": 0,
-            "offset": offset,
-            "count": 10  # must be set to 10
+        query_params = {
+            'search_channel': search_channel.value,
+            'enable_history': '1',
+            'keyword': urllib.parse.quote(keyword),
+            'search_source': 'tab_search',
+            'query_correct_type': '1',
+            'is_filter_search': '0',
+            'from_group_id': '7378810571505847586',
+            'offset': offset,
+            'count': '15',
+            'need_filter_settings': '1',
+            'list_type': 'multi',
        }
-        if sort_type != SearchSortType.GENERAL or publish_time != PublishTimeType.UNLIMITED:
-           params["filter_selected"] = urllib.parse.quote(json.dumps({
-               "sort_type": str(sort_type.value),
-               "publish_time": str(publish_time.value)
-           }))
-           params["is_filter_search"] = 1
-           params["search_source"] = "tab_search"
-        referer_url = "https://www.douyin.com/search/" + keyword
-        referer_url += f"?publish_time={publish_time.value}&sort_type={sort_type.value}&type=general"
+        if sort_type.value != SearchSortType.GENERAL.value or publish_time.value != PublishTimeType.UNLIMITED.value:
+            query_params["filter_selected"] = urllib.parse.quote(json.dumps({
+                "sort_type": str(sort_type.value),
+                "publish_time": str(publish_time.value)
+            }))
+            query_params["is_filter_search"] = 1
+            query_params["search_source"] = "tab_search"
+        referer_url = f"https://www.douyin.com/search/{keyword}?aid=f594bbd9-a0e2-4651-9319-ebe3cb6298c1&type=general"
        headers = copy.copy(self.headers)
        headers["Referer"] = urllib.parse.quote(referer_url, safe=':/')
-        return await self.get("/aweme/v1/web/general/search/single/", params, headers=headers)
+        return await self.get("/aweme/v1/web/general/search/single/", query_params, headers=headers)

    async def get_video_by_id(self, aweme_id: str) -> Any:
        """
@ -149,7 +172,6 @@ class DOUYINClient(AbstractApiClient):
            "aweme_id": aweme_id
        }
        headers = copy.copy(self.headers)
-        # headers["Cookie"] = "s_v_web_id=verify_lol4a8dv_wpQ1QMyP_xemd_4wON_8Yzr_FJa8DN1vdY2m;"
        del headers["Origin"]
        res = await self.get("/aweme/v1/web/aweme/detail/", params, headers)
        return res.get("aweme_detail", {})
@ -259,7 +281,9 @@ class DOUYINClient(AbstractApiClient):
            "count": 18,
            "max_cursor": max_cursor,
            "locate_query": "false",
-            "publish_video_strategy_type": 2
+            "publish_video_strategy_type": 2,
+            'verifyFp': 'verify_lx901cuk_K7kaK4dK_bn2E_4dgk_BxAA_E0XS1VtUi130',
+            'fp': 'verify_lx901cuk_K7kaK4dK_bn2E_4dgk_BxAA_E0XS1VtUi130'
        }
        return await self.get(uri, params)

--- a/media_platform/douyin/core.py
+++ b/media_platform/douyin/core.py
@ -26,7 +26,6 @@ class DouYinCrawler(AbstractCrawler):
    browser_context: BrowserContext

    def __init__(self) -> None:
-        self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"  # fixed
        self.index_url = "https://www.douyin.com"

    async def start(self) -> None:
@ -42,7 +41,7 @@ class DouYinCrawler(AbstractCrawler):
            self.browser_context = await self.launch_browser(
                chromium,
                None,
-                self.user_agent,
+                user_agent=None,
                headless=config.HEADLESS
            )
            # stealth.min.js is a js script to prevent the website from detecting the crawler.
@ -225,7 +224,7 @@ class DouYinCrawler(AbstractCrawler):
        douyin_client = DOUYINClient(
            proxies=httpx_proxy,
            headers={
-                "User-Agent": self.user_agent,
+                "User-Agent": await self.context_page.evaluate("() => navigator.userAgent"),
                "Cookie": cookie_str,
                "Host": "www.douyin.com",
                "Origin": "https://www.douyin.com/",
--- a/media_platform/douyin/help.py
+++ b/media_platform/douyin/help.py
@ -0,0 +1,53 @@
+# -*- coding: utf-8 -*-
+# @Author  : relakkes@gmail.com
+# @Name    : 程序员阿江-Relakkes
+# @Time    : 2024/6/10 02:24
+# @Desc    : 获取 a_bogus 参数, 学习交流使用，请勿用作商业用途，侵权联系作者删除
+
+import random
+
+from playwright.async_api import Page
+
+
+def get_web_id():
+    """
+    生成随机的webid
+    Returns:
+
+    """
+
+    def e(t):
+        if t is not None:
+            return str(t ^ (int(16 * random.random()) >> (t // 4)))
+        else:
+            return ''.join(
+                [str(int(1e7)), '-', str(int(1e3)), '-', str(int(4e3)), '-', str(int(8e3)), '-', str(int(1e11))]
+            )
+
+    web_id = ''.join(
+        e(int(x)) if x in '018' else x for x in e(None)
+    )
+    return web_id.replace('-', '')[:19]
+
+
+async def get_a_bogus(params: str, post_data: dict, user_agent: str, page: Page = None):
+    """
+    获取 a_bogus 参数
+    """
+    return await get_a_bogus_from_playright(params, post_data, user_agent, page)
+
+
+async def get_a_bogus_from_playright(params: str, post_data: dict, user_agent: str, page: Page):
+    """
+    通过playright获取 a_bogus 参数
+    Returns:
+
+    """
+    if not post_data:
+        post_data = ""
+    a_bogus = await page.evaluate(
+        "([params, post_data, ua]) => window.bdms.init._v[2].p[42].apply(null, [0, 1, 8, params, post_data, ua])",
+        [params, post_data, user_agent])
+
+    return a_bogus
+
--- a/requirements.txt
+++ b/requirements.txt
@ -2,7 +2,6 @@ httpx==0.24.0
 Pillow==9.5.0
 playwright==1.42.0
 tenacity==8.2.2
-PyExecJS==1.5.1
 opencv-python
 aiomysql==0.2.0
 redis~=4.6.0
@ -13,4 +12,5 @@ uvicorn==0.29.0
 python-dotenv==1.0.1
 jieba==0.42.1
 wordcloud==1.9.3
-matplotlib==3.9.0
+matplotlib==3.9.0
+requests==2.32.3
--- a/store/xhs/init.py
+++ b/store/xhs/init.py
@ -7,8 +7,8 @@ from typing import List
 import config

 from . import xhs_store_impl
-from .xhs_store_impl import *
 from .xhs_store_image import *
+from .xhs_store_impl import *


 class XhsStoreFactory: