feat: Douyin supports crawling a specified video list

Relakkes 2023-11-18 22:07:30 +08:00
parent 098923d74d
commit 81bc8b51e2
7 changed files with 52 additions and 25 deletions

View File

@@ -22,6 +22,7 @@
- [x] Douyin login via QR code, phone number, and cookies
- [x] Douyin slider-captcha simulation (accuracy is not great yet)
- [x] Douyin crawling by specified keywords
- [x] Douyin crawling of specified posts
- [x] Preserve the browser context after a successful login
- [x] Proxy pool implementation (phone number + IP)
- [x] Concurrent crawler requests
@@ -29,7 +30,7 @@
- [x] Save data to a database (optional)
## To be implemented
- [ ] Douyin crawling of specified posts
- [ ] Kuaishou crawler implementation
## Usage
@@ -70,7 +71,7 @@
## Common runtime errors
```shell
# Q: Crawling Douyin fails with: `execjs._exceptions.ProgramError: SyntaxError: 缺少 ';'`
# A: That error means the Node.js runtime is missing; install Node.js (version `v16.8.0`) to fix it.
# Q: Can crawling be restricted to specified keywords?

View File

@@ -33,3 +33,10 @@ XHS_SPECIFIED_ID_LIST = [
"64ca1b73000000000b028dd2",
"630d5b85000000001203ab41",
]
# douyin specified note id list
DY_SPECIFIED_ID_LIST = [
"7280854932641664319",
"7202432992642387233"
]
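For orientation, each entry in `DY_SPECIFIED_ID_LIST` is an `aweme_id`. A minimal sketch of pulling one out of a video link (the `https://www.douyin.com/video/<id>` URL shape is an assumption based on common Douyin web links, not something this commit defines):

```python
# Hypothetical helper: extract the aweme_id from a Douyin video URL.
# Assumes the common https://www.douyin.com/video/<id> link shape.
import re

def extract_aweme_id(url: str) -> str:
    match = re.search(r"/video/(\d+)", url)
    if match is None:
        raise ValueError(f"no aweme_id found in {url!r}")
    return match.group(1)

print(extract_aweme_id("https://www.douyin.com/video/7280854932641664319"))
# -> 7280854932641664319
```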

View File

@@ -23,13 +23,13 @@ class CrawlerFactory:
async def main():
# define command line params ...
parser = argparse.ArgumentParser(description='Media crawler program.')
parser.add_argument('--platform', type=str, help='Media platform select (xhs | dy)', choices=["xhs", "dy"],
default=config.PLATFORM)
parser.add_argument('--platform', type=str, help='Media platform select (xhs | dy)',
choices=["xhs", "dy"], default=config.PLATFORM)
parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)',
choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE)
parser.add_argument('--type', type=str, help='crawler type (search | detail)',
choices=["search","detail"],default=config.CRAWLER_TYPE)
choices=["search", "detail"], default=config.CRAWLER_TYPE)
# init account pool
account_pool = proxy_account_pool.create_account_pool()
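The `detail` choice added here is what routes execution to the specified-post flow. A quick in-process check of the parser above (a self-contained sketch: defaults are inlined as literals since `config` is not imported here):

```python
import argparse

# Rebuild the parser shown above with literal defaults for a standalone test.
parser = argparse.ArgumentParser(description='Media crawler program.')
parser.add_argument('--platform', type=str, choices=["xhs", "dy"], default="xhs")
parser.add_argument('--lt', type=str, choices=["qrcode", "phone", "cookie"], default="qrcode")
parser.add_argument('--type', type=str, choices=["search", "detail"], default="search")

args = parser.parse_args(["--platform", "dy", "--type", "detail"])
assert (args.platform, args.type) == ("dy", "detail")
```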

View File

@@ -1,7 +1,7 @@
import asyncio
import copy
import urllib.parse
from typing import Callable, Dict, Optional
from typing import Any, Callable, Dict, Optional
import execjs
import httpx
@@ -129,7 +129,7 @@ class DOUYINClient:
headers["Referer"] = urllib.parse.quote(referer_url, safe=':/')
return await self.get("/aweme/v1/web/general/search/single/", params, headers=headers)
async def get_video_by_id(self, aweme_id: str):
async def get_video_by_id(self, aweme_id: str) -> Any:
"""
DouYin Video Detail API
:param aweme_id:
@@ -139,9 +139,10 @@
"aweme_id": aweme_id
}
headers = copy.copy(self.headers)
headers["Cookie"] = "s_v_web_id=verify_leytkxgn_kvO5kOmO_SdMs_4t1o_B5ml_BUqtWM1mP6BF;"
# headers["Cookie"] = "s_v_web_id=verify_lol4a8dv_wpQ1QMyP_xemd_4wON_8Yzr_FJa8DN1vdY2m;"
del headers["Origin"]
return await self.get("/aweme/v1/web/aweme/detail/", params, headers)
res = await self.get("/aweme/v1/web/aweme/detail/", params, headers)
return res.get("aweme_detail", {})
async def get_aweme_comments(self, aweme_id: str, cursor: int = 0):
"""get note comments

View File

@@ -1,7 +1,7 @@
import asyncio
import os
from asyncio import Task
from typing import Dict, List, Optional, Tuple
from typing import Any, Dict, List, Optional, Tuple
from playwright.async_api import (BrowserContext, BrowserType, Page,
async_playwright)
@@ -11,7 +11,7 @@ from base.base_crawler import AbstractCrawler
from base.proxy_account_pool import AccountPool
from models import douyin
from tools import utils
from var import request_keyword_var
from var import crawler_type_var
from .client import DOUYINClient
from .exception import DataFetchError
@@ -64,20 +64,19 @@ class DouYinCrawler(AbstractCrawler):
)
await login_obj.begin()
await self.dy_client.update_cookies(browser_context=self.browser_context)
crawler_type_var.set(self.crawler_type)
if self.crawler_type == "search":
# Search for notes and retrieve their comment information.
await self.search()
elif self.crawler_type == "detail":
# Get the information and comments of the specified post
await self.get_specified_notes()
await self.get_specified_awemes()
utils.logger.info("Douyin Crawler finished ...")
async def search(self) -> None:
utils.logger.info("Begin search douyin keywords")
for keyword in config.KEYWORDS.split(","):
request_keyword_var.set(keyword)
utils.logger.info(f"Current keyword: {keyword}")
aweme_list: List[str] = []
dy_limit_count = 10
@@ -101,10 +100,29 @@ class DouYinCrawler(AbstractCrawler):
utils.logger.info(f"keyword:{keyword}, aweme_list:{aweme_list}")
await self.batch_get_note_comments(aweme_list)
async def get_specified_notes(self):
async def get_specified_awemes(self):
"""Get the information and comments of the specified post"""
# todo douyin support
pass
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list = [
self.get_aweme_detail(aweme_id=aweme_id, semaphore=semaphore) for aweme_id in config.DY_SPECIFIED_ID_LIST
]
aweme_details = await asyncio.gather(*task_list)
for aweme_detail in aweme_details:
if aweme_detail is not None:
await douyin.update_douyin_aweme(aweme_detail)
await self.batch_get_note_comments(config.DY_SPECIFIED_ID_LIST)
async def get_aweme_detail(self, aweme_id: str, semaphore: asyncio.Semaphore) -> Any:
"""Get note detail"""
async with semaphore:
try:
return await self.dy_client.get_video_by_id(aweme_id)
except DataFetchError as ex:
utils.logger.error(f"Get aweme detail error: {ex}")
return None
except KeyError as ex:
utils.logger.error(f"have not fund note detail aweme_id:{aweme_id}, err: {ex}")
return None
async def batch_get_note_comments(self, aweme_list: List[str]) -> None:
task_list: List[Task] = []
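The pattern in `get_specified_awemes` is generic enough to reuse: cap concurrency with a semaphore, `gather` the tasks, and drop failed fetches. A standalone sketch with a stubbed network call (names here are illustrative, not from the repo):

```python
import asyncio

async def fetch(item_id: str, sem: asyncio.Semaphore):
    # Stand-in for dy_client.get_video_by_id; a failure would return None.
    async with sem:
        await asyncio.sleep(0.1)
        return {"id": item_id}

async def main():
    sem = asyncio.Semaphore(2)  # plays the role of config.MAX_CONCURRENCY_NUM
    results = await asyncio.gather(*(fetch(i, sem) for i in ["a", "b", "c"]))
    print([r for r in results if r is not None])  # skip failed fetches

asyncio.run(main())
```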

View File

@@ -74,13 +74,12 @@ class XiaoHongShuCrawler(AbstractCrawler):
await login_obj.begin()
await self.xhs_client.update_cookies(browser_context=self.browser_context)
crawler_type_var.set(self.crawler_type)
if self.crawler_type == "search":
# Search for notes and retrieve their comment information.
crawler_type_var.set("search")
await self.search()
elif self.crawler_type == "detail":
# Get the information and comments of the specified post
crawler_type_var.set("detail")
await self.get_specified_notes()
else:
pass
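Hoisting the single `crawler_type_var.set(...)` above the branch works because a `ContextVar` set in a coroutine is visible to everything awaited after it in the same context. A standalone sketch (the variable name mirrors the repo's `var.py`; the file-naming function is a simplified stand-in):

```python
from contextvars import ContextVar

# Simplified stand-in for crawler_type_var from the repo's var.py.
crawler_type_var: ContextVar[str] = ContextVar("crawler_type", default="")

def save_file_name() -> str:
    # Any code running after the set() below sees the same value.
    return f"data/dy/{crawler_type_var.get()}_awemes.csv"

crawler_type_var.set("detail")
print(save_file_name())  # data/dy/detail_awemes.csv
```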

View File

@@ -8,7 +8,7 @@ from tortoise.models import Model
import config
from tools import utils
from var import request_keyword_var
from var import crawler_type_var
class DouyinBaseModel(Model):
@@ -104,9 +104,9 @@ async def update_douyin_aweme(aweme_item: Dict):
await DouyinAweme.filter(aweme_id=aweme_id).update(**douyin_data.dict())
else:
# Below is a simple way to save it in CSV format.
source_keywords = request_keyword_var.get()
pathlib.Path(f"data/dy").mkdir(parents=True, exist_ok=True)
with open(f"data/dy/aweme_{source_keywords}.csv", mode='a+', encoding="utf-8-sig", newline="") as f:
save_file_name = f"data/dy/{crawler_type_var.get()}_awemes_{utils.get_current_date()}.csv"
with open(save_file_name, mode='a+', encoding="utf-8-sig", newline="") as f:
writer = csv.writer(f)
if f.tell() == 0:
writer.writerow(local_db_item.keys())
@@ -161,9 +161,10 @@ async def update_dy_aweme_comment(aweme_id: str, comment_item: Dict):
comment_pydantic.validate(comment_data)
await DouyinAwemeComment.filter(comment_id=comment_id).update(**comment_data.dict())
else:
source_keywords = request_keyword_var.get()
pathlib.Path(f"data/dy").mkdir(parents=True, exist_ok=True)
with open(f"data/dy/comment_{source_keywords}.csv", mode='a+', encoding="utf-8-sig", newline="") as f:
save_file_name = f"data/dy/{crawler_type_var.get()}_comments_{utils.get_current_date()}.csv"
with open(save_file_name, mode='a+', encoding="utf-8-sig", newline="") as f:
writer = csv.writer(f)
if f.tell() == 0:
writer.writerow(local_db_item.keys())
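The CSV branches above rely on a small idiom: CPython positions an append-mode stream at end-of-file on open, so `f.tell() == 0` right after `open` means the file is brand new and still needs its header row. A self-contained sketch (the file name and row are illustrative; the real code derives the name from `crawler_type_var` and `utils.get_current_date()`):

```python
import csv
import pathlib

row = {"aweme_id": "7280854932641664319", "desc": "example"}
pathlib.Path("data/dy").mkdir(parents=True, exist_ok=True)
with open("data/dy/detail_awemes_2023-11-18.csv", mode="a+",
          encoding="utf-8-sig", newline="") as f:
    writer = csv.writer(f)
    if f.tell() == 0:  # append mode starts at EOF, so 0 means an empty file
        writer.writerow(row.keys())
    writer.writerow(row.values())
```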