MediaCrawler/media_platform/douyin/core.py

import asyncio
import os
import random
from asyncio import Task
from typing import Any, Dict, List, Optional, Tuple

from playwright.async_api import (BrowserContext, BrowserType, Page,
                                  async_playwright)

import config
from base.base_crawler import AbstractCrawler
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
from store import douyin as douyin_store
from tools import utils
from var import crawler_type_var

from .client import DOUYINClient
from .exception import DataFetchError
from .field import PublishTimeType
from .login import DouYinLogin


class DouYinCrawler(AbstractCrawler):
    platform: str
    login_type: str
    crawler_type: str
    context_page: Page
    dy_client: DOUYINClient
    browser_context: BrowserContext
    start_page: int
    keyword: str

    def __init__(self) -> None:
        self.start_page = None
        self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"  # fixed
        self.index_url = "https://www.douyin.com"

    def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str) -> None:
        self.platform = platform
        self.login_type = login_type
        self.crawler_type = crawler_type
        self.start_page = start_page
        self.keyword = keyword

    async def start(self) -> None:
        """
        Run one complete crawl: browser start-up, login if needed, then the selected crawl mode.

        Steps:
            1. When ``config.ENABLE_IP_PROXY`` is set, take one proxy from the IP pool and
               format it for Playwright and for httpx.
            2. Launch the browser context, inject ``libs/stealth.min.js`` and open the index page.
            3. Build the API client; if its ``pong`` check is falsy, run the ``DouYinLogin``
               flow and refresh the client's cookies from the browser.
            4. Dispatch on ``self.crawler_type``; any value other than "search", "detail"
               or "creator" runs nothing.

        NOTE(review): the browser is launched with ``None`` as its proxy, so only the API
        client receives the pooled proxy — confirm this is intentional.
        """
        httpx_proxy = None
        if config.ENABLE_IP_PROXY:
            pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
            picked_proxy: IpInfoModel = await pool.get_proxy()
            # The Playwright-formatted half is produced but not used below.
            _playwright_proxy, httpx_proxy = self.format_proxy_info(picked_proxy)

        async with async_playwright() as playwright:
            # Launch a browser context (no proxy is handed to the browser).
            self.browser_context = await self.launch_browser(
                playwright.chromium,
                None,
                self.user_agent,
                headless=config.HEADLESS
            )
            # stealth.min.js is a js script to prevent the website from detecting the crawler.
            await self.browser_context.add_init_script(path="libs/stealth.min.js")
            self.context_page = await self.browser_context.new_page()
            await self.context_page.goto(self.index_url)

            self.dy_client = await self.create_douyin_client(httpx_proxy)
            pong_ok = await self.dy_client.pong(browser_context=self.browser_context)
            if not pong_ok:
                login_flow = DouYinLogin(
                    login_type=self.login_type,
                    login_phone="",  # your phone number
                    browser_context=self.browser_context,
                    context_page=self.context_page,
                    cookie_str=config.COOKIES
                )
                await login_flow.begin()
                await self.dy_client.update_cookies(browser_context=self.browser_context)

            crawler_type_var.set(self.crawler_type)
            # One coroutine function per supported crawl mode.
            mode_handlers = {
                "search": self.search,  # keyword search plus comments
                "detail": self.get_specified_awemes,  # configured post ids plus comments
                "creator": self.get_creators_and_videos,  # configured creators, their videos and comments
            }
            run_mode = mode_handlers.get(self.crawler_type)
            if run_mode is not None:
                await run_mode()

            utils.logger.info("[DouYinCrawler.start] Douyin Crawler finished ...")

    async def search(self) -> None:
        """
        Search Douyin for every comma-separated keyword and store what is found.

        For each keyword, pages of ``dy_limit_count`` results are requested through
        ``self.dy_client.search_info_by_keyword``. Every aweme found is saved with
        ``douyin_store.update_douyin_aweme`` and its id collected; afterwards
        ``batch_get_note_comments`` is run on the collected ids.

        Paging for a keyword stops when the page budget derived from
        ``config.CRAWLER_MAX_NOTES_COUNT`` is used up, when the client raises
        ``DataFetchError``, or when the response has no ``"data"`` key.

        Side effect: ``config.CRAWLER_MAX_NOTES_COUNT`` is raised to ``dy_limit_count``
        when it is configured lower than a single page.
        """
        utils.logger.info("[DouYinCrawler.search] Begin search douyin keywords")
        dy_limit_count = 10  # douyin limit page fixed value
        if config.CRAWLER_MAX_NOTES_COUNT < dy_limit_count:
            config.CRAWLER_MAX_NOTES_COUNT = dy_limit_count
        start_page = self.start_page  # start page number
        for keyword in self.keyword.split(","):
            utils.logger.info(f"[DouYinCrawler.search] Current keyword: {keyword}")
            aweme_list: List[str] = []
            page = 0
            while (page - start_page + 1) * dy_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
                if page < start_page:
                    utils.logger.info(f"[DouYinCrawler.search] Skip {page}")
                    page += 1
                    continue
                try:
                    # NOTE(review): offset is page * dy_limit_count, so whenever start_page >= 1
                    # the results at offset 0 are never requested — confirm this is intended.
                    posts_res = await self.dy_client.search_info_by_keyword(keyword=keyword,
                                                                            offset=page * dy_limit_count,
                                                                            publish_time=PublishTimeType.UNLIMITED
                                                                            )
                except DataFetchError:
                    utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed")
                    break

                page += 1
                if "data" not in posts_res:
                    utils.logger.error(
                        f"[DouYinCrawler.search] search douyin keyword: {keyword} failed，账号也许被风控了。")
                    break

                # "data" may be present but null; treat that as an empty page instead of crashing.
                for post_item in posts_res.get("data") or []:
                    # Prefer the plain aweme payload; otherwise fall back to the first
                    # entry of aweme_mix_info.mix_items.
                    aweme_info: Optional[Dict] = post_item.get("aweme_info")
                    if not aweme_info:
                        # Tolerate a missing/null aweme_mix_info and a missing/null/empty
                        # mix_items list (previously IndexError/AttributeError escaped).
                        mix_items = (post_item.get("aweme_mix_info") or {}).get("mix_items") or []
                        aweme_info = mix_items[0] if mix_items else None
                    if not aweme_info:
                        # Neither shape carries a usable aweme: nothing to store.
                        continue
                    aweme_list.append(aweme_info.get("aweme_id", ""))
                    await douyin_store.update_douyin_aweme(aweme_item=aweme_info)
            utils.logger.info(f"[DouYinCrawler.search] keyword:{keyword}, aweme_list:{aweme_list}")
            await self.batch_get_note_comments(aweme_list)

    async def get_specified_awemes(self):
        """
        Fetch, store and collect comments for the posts listed in ``config.DY_SPECIFIED_ID_LIST``.

        Details are fetched concurrently, limited by a semaphore of
        ``config.MAX_CONCURRENCY_NUM`` permits. Posts whose detail came back as ``None``
        are not stored, but comments are still requested for every configured id.
        """
        limiter = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
        detail_requests = []
        for aweme_id in config.DY_SPECIFIED_ID_LIST:
            detail_requests.append(self.get_aweme_detail(aweme_id=aweme_id, semaphore=limiter))

        fetched_details = await asyncio.gather(*detail_requests)
        for detail in fetched_details:
            if detail is None:
                # get_aweme_detail already logged why this one failed.
                continue
            await douyin_store.update_douyin_aweme(detail)

        await self.batch_get_note_comments(config.DY_SPECIFIED_ID_LIST)

    async def get_aweme_detail(self, aweme_id: str, semaphore: asyncio.Semaphore) -> Any:
        """Get note detail"""
        async with semaphore:
            try:
                return await self.dy_client.get_video_by_id(aweme_id)
            except DataFetchError as ex:
                utils.logger.error(f"[DouYinCrawler.get_aweme_detail] Get aweme detail error: {ex}")
                return None
            except KeyError as ex:
                utils.logger.error(
                    f"[DouYinCrawler.get_aweme_detail] have not fund note detail aweme_id:{aweme_id}, err: {ex}")
                return None

    async def batch_get_note_comments(self, aweme_list: List[str]) -> None:
        """
        Fetch the comments of every aweme in ``aweme_list`` concurrently.

        Only logs and returns when ``config.ENABLE_GET_COMMENTS`` is falsy. Otherwise
        one ``get_comments`` task is created per id, all sharing a semaphore of
        ``config.MAX_CONCURRENCY_NUM`` permits.

        A task that ends with an exception ``get_comments`` did not handle is logged
        here with its aweme id. It is never re-raised, so one failing aweme cannot
        abort the rest of the batch.

        Args:
            aweme_list: Ids of the awemes whose comments should be crawled.
        """
        if not config.ENABLE_GET_COMMENTS:
            utils.logger.info("[DouYinCrawler.batch_get_note_comments] Crawling comment mode is not enabled")
            return

        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
        task_list: List[Task] = [
            # The task name carries the aweme id so failures can be attributed below.
            asyncio.create_task(self.get_comments(aweme_id, semaphore), name=aweme_id)
            for aweme_id in aweme_list
        ]
        if not task_list:
            # asyncio.wait() raises ValueError when given an empty collection.
            return

        done, _pending = await asyncio.wait(task_list)
        for finished in done:
            if finished.cancelled():
                continue
            # asyncio.wait() does not raise task errors, so read them explicitly.
            error = finished.exception()
            if error is not None:
                utils.logger.error(
                    f"[DouYinCrawler.batch_get_note_comments] aweme_id: {finished.get_name()} "
                    f"comment task failed, error: {error!r}")
    async def get_comments(self, aweme_id: str, semaphore: asyncio.Semaphore) -> None:
        """
        Crawl all comments of one aweme while holding ``semaphore``.

        The client is given a random crawl interval in [0, 1), the sub-comment switch
        from ``config.ENABLE_GET_SUB_COMMENTS`` and
        ``douyin_store.batch_update_dy_aweme_comments`` as its callback.
        ``DataFetchError`` is logged and swallowed; other exceptions propagate.
        """
        async with semaphore:
            try:
                request_kwargs = dict(
                    aweme_id=aweme_id,
                    crawl_interval=random.random(),
                    is_fetch_sub_comments=config.ENABLE_GET_SUB_COMMENTS,
                    callback=douyin_store.batch_update_dy_aweme_comments,
                )
                await self.dy_client.get_aweme_all_comments(**request_kwargs)
            except DataFetchError as fetch_err:
                utils.logger.error(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} get comments failed, error: {fetch_err}")
            else:
                utils.logger.info(
                    f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} comments have all been obtained and filtered ...")
    async def get_creators_and_videos(self) -> None:
        """
        Crawl each creator in ``config.DY_CREATOR_ID_LIST``: profile, videos and comments.

        For every creator id: the profile is saved when the client returns a truthy one,
        all posts are fetched with ``fetch_creator_video_detail`` as the callback, and
        comments are then requested for the ``aweme_id`` of each returned post.
        """
        utils.logger.info("[DouYinCrawler.get_creators_and_videos] Begin get douyin creators")
        for creator_id in config.DY_CREATOR_ID_LIST:
            profile: Dict = await self.dy_client.get_user_info(creator_id)
            if profile:
                await douyin_store.save_creator(creator_id, creator=profile)

            # Get all video information of the creator.
            creator_posts = await self.dy_client.get_all_user_aweme_posts(
                sec_user_id=creator_id,
                callback=self.fetch_creator_video_detail
            )

            await self.batch_get_note_comments(
                [post.get("aweme_id") for post in creator_posts]
            )

    async def fetch_creator_video_detail(self, video_list: List[Dict]):
        """
        Fetch the detail of every post in ``video_list`` concurrently and store the results.

        Concurrency is limited by a semaphore of ``config.MAX_CONCURRENCY_NUM`` permits.
        Posts whose detail came back as ``None`` are skipped.

        Args:
            video_list: Post dicts; each one's ``aweme_id`` is looked up with ``.get``.
        """
        limiter = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
        detail_requests = []
        for post in video_list:
            detail_requests.append(self.get_aweme_detail(post.get("aweme_id"), limiter))

        for detail in await asyncio.gather(*detail_requests):
            if detail is None:
                continue
            await douyin_store.update_douyin_aweme(detail)

    @staticmethod
    def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]:
        """format proxy info for playwright and httpx"""
        playwright_proxy = {
            "server": f"{ip_proxy_info.protocol}{ip_proxy_info.ip}:{ip_proxy_info.port}",
            "username": ip_proxy_info.user,
            "password": ip_proxy_info.password,
        }
        httpx_proxy = {
            f"{ip_proxy_info.protocol}": f"http://{ip_proxy_info.user}:{ip_proxy_info.password}@{ip_proxy_info.ip}:{ip_proxy_info.port}"
        }
        return playwright_proxy, httpx_proxy

    async def create_douyin_client(self, httpx_proxy: Optional[str]) -> DOUYINClient:
        """
        Build the API client from the current browser cookies.

        Args:
            httpx_proxy: Forwarded unchanged as the client's ``proxies`` argument.
                NOTE(review): annotated ``Optional[str]``, but ``start`` passes the dict
                produced by ``format_proxy_info`` (or ``None``).

        Returns:
            A ``DOUYINClient`` carrying the crawler's user agent, the browser cookies
            (as a header string and as a dict) and the current page.
        """
        browser_cookies = await self.browser_context.cookies()
        cookie_str, cookie_dict = utils.convert_cookies(browser_cookies)  # type: ignore
        request_headers = {
            "User-Agent": self.user_agent,
            "Cookie": cookie_str,
            "Host": "www.douyin.com",
            "Origin": "https://www.douyin.com/",
            "Referer": "https://www.douyin.com/",
            "Content-Type": "application/json;charset=UTF-8"
        }
        return DOUYINClient(
            proxies=httpx_proxy,
            headers=request_headers,
            playwright_page=self.context_page,
            cookie_dict=cookie_dict,
        )

    async def launch_browser(
            self,
            chromium: BrowserType,
            playwright_proxy: Optional[Dict],
            user_agent: Optional[str],
            headless: bool = True
    ) -> BrowserContext:
        """
        Launch Chromium and return a browser context with a 1920x1080 viewport.

        When ``config.SAVE_LOGIN_STATE`` is truthy, a persistent context is opened on
        ``<cwd>/browser_data/<config.USER_DATA_DIR % platform>``. Otherwise a fresh
        browser is launched and a new context is created in it.

        Args:
            chromium: Playwright browser type used for launching.
            playwright_proxy: Proxy settings forwarded to the launch call, or ``None``.
            user_agent: User agent string applied to the context.
            headless: Whether to run the browser without a visible window.
        """
        viewport_size = {"width": 1920, "height": 1080}

        if not config.SAVE_LOGIN_STATE:
            browser = await chromium.launch(headless=headless, proxy=playwright_proxy)  # type: ignore
            return await browser.new_context(
                viewport=viewport_size,
                user_agent=user_agent
            )

        # Persistent profile directory, one per platform.
        working_dir = os.getcwd()
        profile_dir = os.path.join(working_dir, "browser_data",
                                   config.USER_DATA_DIR % self.platform)  # type: ignore
        return await chromium.launch_persistent_context(
            user_data_dir=profile_dir,
            accept_downloads=True,
            headless=headless,
            proxy=playwright_proxy,  # type: ignore
            viewport=viewport_size,
            user_agent=user_agent
        )  # type: ignore

    async def close(self) -> None:
        """Close the browser context created by ``start`` and log that it was closed."""
        context = self.browser_context
        await context.close()
        utils.logger.info("[DouYinCrawler.close] Browser context closed ...")
-												feat: 小红书笔记搜索，评论获取done

docs: update docs

Create .gitattributes

Update README.md

											
										
										
											2023-06-09 12:41:53 +00:00
+								import asyncio
-												refactor: 优化代码

											
										
										
											2023-07-29 07:35:40 +00:00
+								import os
-												refactor: 移除评论中指定数量和过滤特定关键词的逻辑

											
										
										
											2024-01-17 15:02:05 +00:00
+								import random
-												feat: 抖音评论done

											
										
										
											2023-06-25 13:05:30 +00:00
+								from asyncio import Task
-												feat: 抖音支持指定视频列表爬去

											
										
										
											2023-11-18 14:07:30 +00:00
+								from typing import Any, Dict, List, Optional, Tuple
-												feat: 小红书笔记搜索，评论获取done

docs: update docs

Create .gitattributes

Update README.md

											
										
										
											2023-06-09 12:41:53 +00:00
-												refactor: 优化代码

											
										
										
											2023-07-29 07:35:40 +00:00
+								from playwright.async_api import (BrowserContext, BrowserType, Page,
 								                                  async_playwright)
-												feat: 小红书笔记搜索，评论获取done

docs: update docs

Create .gitattributes

Update README.md

											
										
										
											2023-06-09 12:41:53 +00:00
-												refactor:优化部分代码
feat: 增加IP代理账号池

											
										
										
											2023-06-27 15:38:30 +00:00
+								import config
-												refactor: 优化代码

											
										
										
											2023-07-29 07:35:40 +00:00
+								from base.base_crawler import AbstractCrawler
-												feat: Bilibili comment done

											
										
										
											2023-12-09 13:10:01 +00:00
+								from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
-												refactor: 数据存储重构，分离不同类型的存储实现

											
										
										
											2024-01-14 14:06:31 +00:00
+								from store import douyin as douyin_store
-												refactor:优化部分代码
feat: 增加IP代理账号池

											
										
										
											2023-06-27 15:38:30 +00:00
+								from tools import utils
-												feat: 抖音支持指定视频列表爬去

											
										
										
											2023-11-18 14:07:30 +00:00
+								from var import crawler_type_var
-												refactor: 优化代码

											
										
										
											2023-07-29 07:35:40 +00:00
-												feat: 小红书笔记搜索，评论获取done

docs: update docs

Create .gitattributes

Update README.md

											
										
										
											2023-06-09 12:41:53 +00:00
+								from .client import DOUYINClient
-												feat: 抖音评论done

											
										
										
											2023-06-25 13:05:30 +00:00
+								from .exception import DataFetchError
-												fix: issue #140

											
										
										
											2024-02-26 15:47:02 +00:00
+								from .field import PublishTimeType
-												feat: add abstract api client to all platform

											
										
										
											2024-03-30 13:27:25 +00:00
+								from .login import DouYinLogin
-												feat: 小红书笔记搜索，评论获取done

docs: update docs

Create .gitattributes

Update README.md

											
										
										
											2023-06-09 12:41:53 +00:00
-												refactor:优化部分代码
feat: 增加IP代理账号池

											
										
										
											2023-06-27 15:38:30 +00:00
+								class DouYinCrawler(AbstractCrawler):
-												refactor: 优化代码

											
										
										
											2023-07-29 07:35:40 +00:00
+								    platform: str
 								    login_type: str
-												feat: 小红书增加指定帖子爬取功能
fix: 修复程序一些异常 bug
refactor: 优化部分代码逻辑

											
										
										
											2023-11-18 05:38:11 +00:00
+								    crawler_type: str
-												refactor: 优化代码

											
										
										
											2023-07-29 07:35:40 +00:00
+								    context_page: Page
-												feat: 完善类型注释，增加 mypy 类型检测

											
										
										
											2023-07-16 09:57:18 +00:00
+								    dy_client: DOUYINClient
-												refactor: 优化代码

											
										
										
											2023-07-29 07:35:40 +00:00
+								    browser_context: BrowserContext
-												feat: 抖音指定创作者done

											
										
										
											2024-05-27 17:07:19 +00:00
+								    start_page: int
 								    keyword: str
-												feat: 完善类型注释，增加 mypy 类型检测

											
										
										
											2023-07-16 09:57:18 +00:00
 								    def __init__(self) -> None:
-												feat: 抖音指定创作者done

											
										
										
											2024-05-27 17:07:19 +00:00
+								        self.start_page = None
-												feat: 抖音评论done

											
										
										
											2023-06-25 13:05:30 +00:00
+								        self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"  # fixed
-												refactor: 优化抖音Crawler部分代码
fix: 日志初始化错误修复

											
										
										
											2023-07-15 13:30:12 +00:00
+								        self.index_url = "https://www.douyin.com"
-												feat: 小红书笔记搜索，评论获取done

docs: update docs

Create .gitattributes

Update README.md

											
										
										
											2023-06-09 12:41:53 +00:00
-												feat(core): 新增控制爬虫  参数起始页面的页数start_page;perf(argparse): 向命令行解析器添加程序参数起始页面页数和关键字

											
										
										
											2024-04-11 16:52:47 +00:00
+								    def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str) -> None:
-												refactor: 优化代码

											
										
										
											2023-07-29 07:35:40 +00:00
+								        self.platform = platform
 								        self.login_type = login_type
-												feat: 小红书增加指定帖子爬取功能
fix: 修复程序一些异常 bug
refactor: 优化部分代码逻辑

											
										
										
											2023-11-18 05:38:11 +00:00
+								        self.crawler_type = crawler_type
-												feat(core): 新增控制爬虫  参数起始页面的页数start_page;perf(argparse): 向命令行解析器添加程序参数起始页面页数和关键字

											
										
										
											2024-04-11 16:52:47 +00:00
+								        self.start_page = start_page
 								        self.keyword = keyword
-												feat: 小红书笔记搜索，评论获取done

docs: update docs

Create .gitattributes

Update README.md

											
										
										
											2023-06-09 12:41:53 +00:00
-												feat: 完善类型注释，增加 mypy 类型检测

											
										
										
											2023-07-16 09:57:18 +00:00
+								    async def start(self) -> None:
-												feat: 代理IP功能 Done

											
										
										
											2023-12-07 16:10:04 +00:00
+								        playwright_proxy_format, httpx_proxy_format = None, None
 								        if config.ENABLE_IP_PROXY:
 								            ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
 								            ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
 								            playwright_proxy_format, httpx_proxy_format = self.format_proxy_info(ip_proxy_info)
-												feat: 小红书笔记搜索，评论获取done

docs: update docs

Create .gitattributes

Update README.md

											
										
										
											2023-06-09 12:41:53 +00:00
+								        async with async_playwright() as playwright:
-												refactor: 优化抖音Crawler部分代码
fix: 日志初始化错误修复

											
										
										
											2023-07-15 13:30:12 +00:00
+								            # Launch a browser context.
-												feat: 小红书笔记搜索，评论获取done

docs: update docs

Create .gitattributes

Update README.md

											
										
										
											2023-06-09 12:41:53 +00:00
+								            chromium = playwright.chromium
-												refactor: 优化抖音Crawler部分代码
fix: 日志初始化错误修复

											
										
										
											2023-07-15 13:30:12 +00:00
+								            self.browser_context = await self.launch_browser(
 								                chromium,
-												feat: 代理IP功能 Done

											
										
										
											2023-12-07 16:10:04 +00:00
+								                None,
-												refactor: 优化抖音Crawler部分代码
fix: 日志初始化错误修复

											
										
										
											2023-07-15 13:30:12 +00:00
+								                self.user_agent,
 								                headless=config.HEADLESS
-												feat: 小红书笔记搜索，评论获取done

docs: update docs

Create .gitattributes

Update README.md

											
										
										
											2023-06-09 12:41:53 +00:00
+								            )
-												refactor: 优化抖音Crawler部分代码
fix: 日志初始化错误修复

											
										
										
											2023-07-15 13:30:12 +00:00
+								            # stealth.min.js is a js script to prevent the website from detecting the crawler.
-												feat: 小红书笔记搜索，评论获取done

docs: update docs

Create .gitattributes

Update README.md

											
										
										
											2023-06-09 12:41:53 +00:00
+								            await self.browser_context.add_init_script(path="libs/stealth.min.js")
 								            self.context_page = await self.browser_context.new_page()
-												refactor: 优化抖音Crawler部分代码
fix: 日志初始化错误修复

											
										
										
											2023-07-15 13:30:12 +00:00
+								            await self.context_page.goto(self.index_url)
-												feat: 代理IP功能 Done

											
										
										
											2023-12-07 16:10:04 +00:00
+								            self.dy_client = await self.create_douyin_client(httpx_proxy_format)
-												fix: 修复B站搜索Field命名 bug
refactor: ping接口统一更换为pong

											
										
										
											2023-12-05 14:47:36 +00:00
+								            if not await self.dy_client.pong(browser_context=self.browser_context):
-												refactor: 优化抖音Crawler部分代码
fix: 日志初始化错误修复

											
										
										
											2023-07-15 13:30:12 +00:00
+								                login_obj = DouYinLogin(
-												refactor: 优化代码

											
										
										
											2023-07-29 07:35:40 +00:00
+								                    login_type=self.login_type,
-												feat: 抖音指定创作者done

											
										
										
											2024-05-27 17:07:19 +00:00
+								                    login_phone="",  # you phone number
-												refactor: 优化抖音Crawler部分代码
fix: 日志初始化错误修复

											
										
										
											2023-07-15 13:30:12 +00:00
+								                    browser_context=self.browser_context,
 								                    context_page=self.context_page,
 								                    cookie_str=config.COOKIES
 								                )
 								                await login_obj.begin()
 								                await self.dy_client.update_cookies(browser_context=self.browser_context)
-												feat: 抖音支持指定视频列表爬去

											
										
										
											2023-11-18 14:07:30 +00:00
+								            crawler_type_var.set(self.crawler_type)
-												feat: 小红书增加指定帖子爬取功能
fix: 修复程序一些异常 bug
refactor: 优化部分代码逻辑

											
										
										
											2023-11-18 05:38:11 +00:00
+								            if self.crawler_type == "search":
 								                # Search for notes and retrieve their comment information.
 								                await self.search()
 								            elif self.crawler_type == "detail":
 								                # Get the information and comments of the specified post
-												feat: 抖音支持指定视频列表爬去

											
										
										
											2023-11-18 14:07:30 +00:00
+								                await self.get_specified_awemes()
-												feat: 抖音指定创作者done

											
										
										
											2024-05-27 17:07:19 +00:00
+								            elif self.crawler_type == "creator":
 								                # Get the information and comments of the specified creator
 								                await self.get_creators_and_videos()
-												feat: 抖音评论done

											
										
										
											2023-06-25 13:05:30 +00:00
-												refactor: 规范日志打印
feat: B站指定视频ID爬取（bvid）

											
										
										
											2023-12-22 17:04:08 +00:00
+								            utils.logger.info("[DouYinCrawler.start] Douyin Crawler finished ...")
-												feat: 小红书笔记搜索，评论获取done

docs: update docs

Create .gitattributes

Update README.md

											
										
										
											2023-06-09 12:41:53 +00:00
-												refactor: 优化代码

											
										
										
											2023-07-29 07:35:40 +00:00
+								    async def search(self) -> None:
-												refactor: 规范日志打印
feat: B站指定视频ID爬取（bvid）

											
										
										
											2023-12-22 17:04:08 +00:00
+								        utils.logger.info("[DouYinCrawler.search] Begin search douyin keywords")
-												chore: 简化判断方式

											
										
										
											2024-04-03 16:11:22 +00:00
+								        dy_limit_count = 10  # douyin limit page fixed value
 								        if config.CRAWLER_MAX_NOTES_COUNT < dy_limit_count:
 								            config.CRAWLER_MAX_NOTES_COUNT = dy_limit_count
-												feat(core): 新增控制爬虫  参数起始页面的页数start_page;perf(argparse): 向命令行解析器添加程序参数起始页面页数和关键字

											
										
										
											2024-04-11 16:52:47 +00:00
+								        start_page = self.start_page  # start page number
 								        for keyword in self.keyword.split(","):
-												refactor: 规范日志打印
feat: B站指定视频ID爬取（bvid）

											
										
										
											2023-12-22 17:04:08 +00:00
+								            utils.logger.info(f"[DouYinCrawler.search] Current keyword: {keyword}")
-												feat: 抖音评论done

											
										
										
											2023-06-25 13:05:30 +00:00
+								            aweme_list: List[str] = []
 								            page = 0
-												feat(core): 新增控制爬虫  参数起始页面的页数start_page;perf(argparse): 向命令行解析器添加程序参数起始页面页数和关键字

											
										
										
											2024-04-11 16:52:47 +00:00
+								            while (page - start_page + 1) * dy_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
 								                if page < start_page:
 								                    utils.logger.info(f"[DouYinCrawler.search] Skip {page}")
 								                    page += 1
 								                    continue
-												feat: 抖音评论done

											
										
										
											2023-06-25 13:05:30 +00:00
+								                try:
-												refactor: 优化代码

											
										
										
											2023-07-29 07:35:40 +00:00
+								                    posts_res = await self.dy_client.search_info_by_keyword(keyword=keyword,
-												fix: issue #140

											
										
										
											2024-02-26 15:47:02 +00:00
+								                                                                            offset=page * dy_limit_count,
 								                                                                            publish_time=PublishTimeType.UNLIMITED
 								                                                                            )
-												feat: 抖音评论done

											
										
										
											2023-06-25 13:05:30 +00:00
+								                except DataFetchError:
-												refactor: 规范日志打印
feat: B站指定视频ID爬取（bvid）

											
										
										
											2023-12-22 17:04:08 +00:00
+								                    utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed")
-												feat: 抖音评论done

											
										
										
											2023-06-25 13:05:30 +00:00
+								                    break
-												fix: #237

											
										
										
											2024-04-17 15:32:17 +00:00
-												feat: 抖音评论done

											
										
										
											2023-06-25 13:05:30 +00:00
+								                page += 1
-												fix: #237

											
										
										
											2024-04-17 15:32:17 +00:00
+								                if "data" not in posts_res:
-												feat: 抖音指定创作者done

											
										
										
											2024-05-27 17:07:19 +00:00
+								                    utils.logger.error(
 								                        f"[DouYinCrawler.search] search douyin keyword: {keyword} failed，账号也许被风控了。")
-												fix: #237

											
										
										
											2024-04-17 15:32:17 +00:00
+								                    break
-												feat: 抖音评论done

											
										
										
											2023-06-25 13:05:30 +00:00
+								                for post_item in posts_res.get("data"):
 								                    try:
 								                        aweme_info: Dict = post_item.get("aweme_info") or \
 								                                           post_item.get("aweme_mix_info", {}).get("mix_items")[0]
 								                    except TypeError:
 								                        continue
-												feat: 增加配置项支持自由选择数据是否保存到关系型数据库中

											
										
										
											2023-07-24 12:59:43 +00:00
+								                    aweme_list.append(aweme_info.get("aweme_id", ""))
-												refactor: 数据存储重构，分离不同类型的存储实现

											
										
										
											2024-01-14 14:06:31 +00:00
+								                    await douyin_store.update_douyin_aweme(aweme_item=aweme_info)
-												refactor: 规范日志打印
feat: B站指定视频ID爬取（bvid）

											
										
										
											2023-12-22 17:04:08 +00:00
+								            utils.logger.info(f"[DouYinCrawler.search] keyword:{keyword}, aweme_list:{aweme_list}")
-												feat: 支持数据保存到CSV中

											
										
										
											2023-08-16 11:49:41 +00:00
+								            await self.batch_get_note_comments(aweme_list)
-												feat: 抖音评论done

											
										
										
											2023-06-25 13:05:30 +00:00
-												feat: 抖音支持指定视频列表爬去

											
										
										
											2023-11-18 14:07:30 +00:00
+								    async def get_specified_awemes(self):
-												feat: 小红书增加指定帖子爬取功能
fix: 修复程序一些异常 bug
refactor: 优化部分代码逻辑

											
										
										
											2023-11-18 05:38:11 +00:00
+								        """Get the information and comments of the specified post"""
-												feat: 抖音支持指定视频列表爬去

											
										
										
											2023-11-18 14:07:30 +00:00
+								        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
 								        task_list = [
 								            self.get_aweme_detail(aweme_id=aweme_id, semaphore=semaphore) for aweme_id in config.DY_SPECIFIED_ID_LIST
 								        ]
 								        aweme_details = await asyncio.gather(*task_list)
 								        for aweme_detail in aweme_details:
 								            if aweme_detail is not None:
-												refactor: 数据存储重构，分离不同类型的存储实现

											
										
										
											2024-01-14 14:06:31 +00:00
+								                await douyin_store.update_douyin_aweme(aweme_detail)
-												feat: 抖音支持指定视频列表爬去

											
										
										
											2023-11-18 14:07:30 +00:00
+								        await self.batch_get_note_comments(config.DY_SPECIFIED_ID_LIST)
 								    async def get_aweme_detail(self, aweme_id: str, semaphore: asyncio.Semaphore) -> Any:
 								        """
 								        Fetch the detail of one video while holding the given semaphore.
 								        Args:
 								            aweme_id: id passed to dy_client.get_video_by_id.
 								            semaphore: acquired for the duration of the request.
 								        Returns:
 								            The value returned by dy_client.get_video_by_id, or None when the call
 								            raised DataFetchError or KeyError (the error is logged, not re-raised).
 								        """
 								        detail = None
 								        async with semaphore:
 								            try:
 								                detail = await self.dy_client.get_video_by_id(aweme_id)
 								            except DataFetchError as ex:
 								                utils.logger.error(f"[DouYinCrawler.get_aweme_detail] Get aweme detail error: {ex}")
 								            except KeyError as ex:
 								                utils.logger.error(
 								                    f"[DouYinCrawler.get_aweme_detail] have not fund note detail aweme_id:{aweme_id}, err: {ex}")
 								        return detail
-												feat: 小红书增加指定帖子爬取功能
fix: 修复程序一些异常 bug
refactor: 优化部分代码逻辑

											
										
										
											2023-11-18 05:38:11 +00:00
-												feat: 支持数据保存到CSV中

											
										
										
											2023-08-16 11:49:41 +00:00
+								    async def batch_get_note_comments(self, aweme_list: List[str]) -> None:
-												feat: 抖音指定创作者done

											
										
										
											2024-05-27 17:07:19 +00:00
+								        """
 								        Batch get note comments
 								        """
-												feat: 支持评论模式是否开启爬取选项

											
										
										
											2024-03-16 03:52:42 +00:00
+								        if not config.ENABLE_GET_COMMENTS:
 								            utils.logger.info(f"[DouYinCrawler.batch_get_note_comments] Crawling comment mode is not enabled")
 								            return
-												feat: 抖音评论done

											
										
										
											2023-06-25 13:05:30 +00:00
+								        task_list: List[Task] = []
-												refactor: 优化代码

											
										
										
											2023-07-29 07:35:40 +00:00
+								        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
-												feat: 抖音评论done

											
										
										
											2023-06-25 13:05:30 +00:00
+								        for aweme_id in aweme_list:
-												添加功能:抖音每个视频抓取评论最大条数限制,抖音评论关键词筛选

											
										
										
											2023-12-05 03:21:47 +00:00
+								            task = asyncio.create_task(
-												refactor: 移除评论中指定数量和过滤特定关键词的逻辑

											
										
										
											2024-01-17 15:02:05 +00:00
+								                self.get_comments(aweme_id, semaphore), name=aweme_id)
-												feat: 抖音评论done

											
										
										
											2023-06-25 13:05:30 +00:00
+								            task_list.append(task)
-												feat: 抖音指定创作者done

											
										
										
											2024-05-27 17:07:19 +00:00
+								        if len(task_list) > 0:
-												fix: 修复抖音关键词搜索为中文的情况下，有bug

											
										
										
											2024-03-03 11:36:36 +00:00
+								            await asyncio.wait(task_list)
-												feat: 抖音评论done

											
										
										
											2023-06-25 13:05:30 +00:00
-												refactor: 移除评论中指定数量和过滤特定关键词的逻辑

											
										
										
											2024-01-17 15:02:05 +00:00
+								    async def get_comments(self, aweme_id: str, semaphore: asyncio.Semaphore) -> None:
-												feat: 增加配置项支持自由选择数据是否保存到关系型数据库中

											
										
										
											2023-07-24 12:59:43 +00:00
+								        async with semaphore:
 								            try:
-												添加功能:抖音每个视频抓取评论最大条数限制,抖音评论关键词筛选

											
										
										
											2023-12-05 03:21:47 +00:00
+								                # 将关键词列表传递给 get_aweme_all_comments 方法
-												feat: 支持评论模式是否开启爬取选项

											
										
										
											2024-03-16 03:52:42 +00:00
+								                await self.dy_client.get_aweme_all_comments(
-												feat: 增加配置项支持自由选择数据是否保存到关系型数据库中

											
										
										
											2023-07-24 12:59:43 +00:00
+								                    aweme_id=aweme_id,
-												refactor: 移除评论中指定数量和过滤特定关键词的逻辑

											
										
										
											2024-01-17 15:02:05 +00:00
+								                    crawl_interval=random.random(),
-												抖音二级评论

											
										
										
											2024-05-28 22:35:37 +00:00
+								                    is_fetch_sub_comments=config.ENABLE_GET_SUB_COMMENTS,
-												refactor: 移除评论中指定数量和过滤特定关键词的逻辑

											
										
										
											2024-01-17 15:02:05 +00:00
+								                    callback=douyin_store.batch_update_dy_aweme_comments
-												feat: 增加配置项支持自由选择数据是否保存到关系型数据库中

											
										
										
											2023-07-24 12:59:43 +00:00
+								                )
-												feat: 抖音指定创作者done

											
										
										
											2024-05-27 17:07:19 +00:00
+								                utils.logger.info(
 								                    f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} comments have all been obtained and filtered ...")
-												feat: 增加配置项支持自由选择数据是否保存到关系型数据库中

											
										
										
											2023-07-24 12:59:43 +00:00
+								            except DataFetchError as e:
-												refactor: 规范日志打印
feat: B站指定视频ID爬取（bvid）

											
										
										
											2023-12-22 17:04:08 +00:00
+								                utils.logger.error(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} get comments failed, error: {e}")
-												refactor: 优化抖音Crawler部分代码
fix: 日志初始化错误修复

											
										
										
											2023-07-15 13:30:12 +00:00
-												feat: 抖音指定创作者done

											
										
										
											2024-05-27 17:07:19 +00:00
+								    async def get_creators_and_videos(self) -> None:
 								        """
 								        Get the information and videos of the specified creator
 								        """
 								        utils.logger.info("[DouYinCrawler.get_creators_and_videos] Begin get douyin creators")
 								        for user_id in config.DY_CREATOR_ID_LIST:
 								            creator_info: Dict = await self.dy_client.get_user_info(user_id)
 								            if creator_info:
 								                await douyin_store.save_creator(user_id, creator=creator_info)
 								            # Get all video information of the creator
 								            all_video_list = await self.dy_client.get_all_user_aweme_posts(
 								                sec_user_id=user_id,
 								                callback=self.fetch_creator_video_detail
 								            )
 								            video_ids = [video_item.get("aweme_id") for video_item in all_video_list]
 								            await self.batch_get_note_comments(video_ids)
 								    async def fetch_creator_video_detail(self, video_list: List[Dict]):
 								        """
 								        Fetch the detail of every post in video_list concurrently and store the results.
 								        Each item's "aweme_id" is looked up with dict.get; requests share a semaphore
 								        sized by config.MAX_CONCURRENCY_NUM. Details that resolved to None are skipped.
 								        """
 								        limiter = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
 								        detail_jobs = (self.get_aweme_detail(video.get("aweme_id"), limiter) for video in video_list)
 								        for detail in await asyncio.gather(*detail_jobs):
 								            if detail is None:
 								                continue
 								            await douyin_store.update_douyin_aweme(detail)
-												feat: 代理IP功能 Done

											
										
										
											2023-12-07 16:10:04 +00:00
+								    @staticmethod
 								    def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]:
 								        """format proxy info for playwright and httpx"""
-												refactor: 优化抖音Crawler部分代码
fix: 日志初始化错误修复

											
										
										
											2023-07-15 13:30:12 +00:00
+								        playwright_proxy = {
-												feat: 代理IP功能 Done

											
										
										
											2023-12-07 16:10:04 +00:00
+								            "server": f"{ip_proxy_info.protocol}{ip_proxy_info.ip}:{ip_proxy_info.port}",
 								            "username": ip_proxy_info.user,
 								            "password": ip_proxy_info.password,
 								        }
 								        httpx_proxy = {
-												fix: 修复代理Bug

											
										
										
											2024-01-13 07:50:02 +00:00
+								            f"{ip_proxy_info.protocol}": f"http://{ip_proxy_info.user}:{ip_proxy_info.password}@{ip_proxy_info.ip}:{ip_proxy_info.port}"
-												refactor: 优化抖音Crawler部分代码
fix: 日志初始化错误修复

											
										
										
											2023-07-15 13:30:12 +00:00
+								        }
-												feat: 代理IP功能 Done

											
										
										
											2023-12-07 16:10:04 +00:00
+								        return playwright_proxy, httpx_proxy
-												refactor: 优化抖音Crawler部分代码
fix: 日志初始化错误修复

											
										
										
											2023-07-15 13:30:12 +00:00
-												feat: 完善类型注释，增加 mypy 类型检测

											
										
										
											2023-07-16 09:57:18 +00:00
+								    async def create_douyin_client(self, httpx_proxy: Optional[str]) -> DOUYINClient:
-												refactor: 优化抖音Crawler部分代码
fix: 日志初始化错误修复

											
										
										
											2023-07-15 13:30:12 +00:00
+								        """Create douyin client"""
-												feat: 增加配置项支持自由选择数据是否保存到关系型数据库中

											
										
										
											2023-07-24 12:59:43 +00:00
+								        cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())  # type: ignore
-												refactor: 优化抖音Crawler部分代码
fix: 日志初始化错误修复

											
										
										
											2023-07-15 13:30:12 +00:00
+								        douyin_client = DOUYINClient(
 								            proxies=httpx_proxy,
 								            headers={
 								                "User-Agent": self.user_agent,
 								                "Cookie": cookie_str,
 								                "Host": "www.douyin.com",
 								                "Origin": "https://www.douyin.com/",
 								                "Referer": "https://www.douyin.com/",
 								                "Content-Type": "application/json;charset=UTF-8"
 								            },
 								            playwright_page=self.context_page,
 								            cookie_dict=cookie_dict,
 								        )
 								        return douyin_client
 								    async def launch_browser(
 								            self,
 								            chromium: BrowserType,
 								            playwright_proxy: Optional[Dict],
 								            user_agent: Optional[str],
 								            headless: bool = True
 								    ) -> BrowserContext:
 								        """
 								        Launch Chromium and return a browser context with a 1920x1080 viewport.
 								        When config.SAVE_LOGIN_STATE is truthy a persistent context is launched,
 								        with its user data directory at
 								        <cwd>/browser_data/<config.USER_DATA_DIR % self.platform>.
 								        Otherwise a plain browser is launched and a new context is created from it.
 								        Args:
 								            chromium: Playwright browser type used for launching.
 								            playwright_proxy: proxy settings passed to the launch call, or None.
 								            user_agent: user agent applied to the context.
 								            headless: whether to launch without a visible window.
 								        """
 								        viewport = {"width": 1920, "height": 1080}
 								        if not config.SAVE_LOGIN_STATE:
 								            browser = await chromium.launch(headless=headless, proxy=playwright_proxy)  # type: ignore
 								            return await browser.new_context(viewport=viewport, user_agent=user_agent)
 								        profile_dir = os.path.join(
 								            os.getcwd(), "browser_data", config.USER_DATA_DIR % self.platform)  # type: ignore
 								        return await chromium.launch_persistent_context(
 								            user_data_dir=profile_dir,
 								            accept_downloads=True,
 								            headless=headless,
 								            proxy=playwright_proxy,  # type: ignore
 								            viewport=viewport,
 								            user_agent=user_agent,
 								        )  # type: ignore
-												refactor: 优化代码

											
										
										
											2023-07-29 07:35:40 +00:00
+								    async def close(self) -> None:
-												refactor: 优化抖音Crawler部分代码
fix: 日志初始化错误修复

											
										
										
											2023-07-15 13:30:12 +00:00
+								        """Close browser context"""
 								        await self.browser_context.close()
-												refactor: 规范日志打印
feat: B站指定视频ID爬取（bvid）

											
										
										
											2023-12-22 17:04:08 +00:00
+								        utils.logger.info("[DouYinCrawler.close] Browser context closed ...")