diff --git a/README.md b/README.md
index c5e2612..509c0a7 100644
--- a/README.md
+++ b/README.md
@@ -22,6 +22,7 @@
 - [x] Douyin login (QR code, phone number, cookies)
 - [x] Douyin slider captcha (simulated sliding; accuracy is not great)
 - [x] Douyin crawling by specified keywords
+- [x] Douyin crawling of specified posts
 - [x] Keep the logged-in browser context after a successful login
 - [x] Proxy pool implementation (phone number + IP)
 - [x] Concurrent crawler requests
@@ -29,7 +30,7 @@
 - [x] Save data to the database (optional)
 
 ## To do
-- [ ] Douyin crawling of specified posts
+
 - [ ] Kuaishou crawler
 
 ## Usage
@@ -70,7 +71,7 @@
 ## Common runtime errors
 
 ```shell
-# Q: Crawing Douyin reports an error: `execjs._exceptions.ProgramError: SyntaxError: 缺少 ';'`
+# Q: Crawling Douyin reports an error: `execjs._exceptions.ProgramError: SyntaxError: 缺少 ';'`
 # A: This error means the Node.js environment is missing; install Node.js `v16.8.0` to fix it.
 
 # Q: Can I crawl by specified keywords?
diff --git a/config/base_config.py b/config/base_config.py
index 38f662f..de5567e 100644
--- a/config/base_config.py
+++ b/config/base_config.py
@@ -33,3 +33,10 @@ XHS_SPECIFIED_ID_LIST = [
     "64ca1b73000000000b028dd2",
     "630d5b85000000001203ab41",
 ]
+
+
+# douyin specified note id list
+DY_SPECIFIED_ID_LIST = [
+    "7280854932641664319",
+    "7202432992642387233",
+]
diff --git a/main.py b/main.py
index 2b3c711..dd3dec8 100644
--- a/main.py
+++ b/main.py
@@ -23,13 +23,14 @@ class CrawlerFactory:
 async def main():
     # define command line params ...
     parser = argparse.ArgumentParser(description='Media crawler program.')
-    parser.add_argument('--platform', type=str, help='Media platform select (xhs | dy)', choices=["xhs", "dy"],
-                        default=config.PLATFORM)
+    parser.add_argument('--platform', type=str, help='Media platform select (xhs | dy)',
+                        choices=["xhs", "dy"], default=config.PLATFORM)
     parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)',
                         choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE)
-    parser.add_argument('--type', type=str, help='crawler type (search | detail)',
-                        choices=["search","detail"],default=config.CRAWLER_TYPE)
+    parser.add_argument('--type', type=str, help='crawler type (search | detail)',
+                        choices=["search", "detail"], default=config.CRAWLER_TYPE)
+
     # init account pool
     account_pool = proxy_account_pool.create_account_pool()
diff --git a/media_platform/douyin/client.py b/media_platform/douyin/client.py
index 818694f..de4ed6d 100644
--- a/media_platform/douyin/client.py
+++ b/media_platform/douyin/client.py
@@ -1,7 +1,7 @@
 import asyncio
 import copy
 import urllib.parse
-from typing import Callable, Dict, Optional
+from typing import Any, Callable, Dict, Optional
 
 import execjs
 import httpx
@@ -129,7 +129,7 @@ class DOUYINClient:
         headers["Referer"] = urllib.parse.quote(referer_url, safe=':/')
         return await self.get("/aweme/v1/web/general/search/single/", params, headers=headers)
 
-    async def get_video_by_id(self, aweme_id: str):
+    async def get_video_by_id(self, aweme_id: str) -> Any:
         """
         DouYin Video Detail API
         :param aweme_id:
@@ -139,9 +139,10 @@
             "aweme_id": aweme_id
         }
         headers = copy.copy(self.headers)
-        headers["Cookie"] = "s_v_web_id=verify_leytkxgn_kvO5kOmO_SdMs_4t1o_B5ml_BUqtWM1mP6BF;"
+        # headers["Cookie"] = "s_v_web_id=verify_lol4a8dv_wpQ1QMyP_xemd_4wON_8Yzr_FJa8DN1vdY2m;"
         del headers["Origin"]
-        return await self.get("/aweme/v1/web/aweme/detail/", params, headers)
+        res = await self.get("/aweme/v1/web/aweme/detail/", params, headers)
+        return res.get("aweme_detail", {})
 
     async def get_aweme_comments(self, aweme_id: str, cursor: int = 0):
         """get note comments
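The `get_video_by_id` refactor above stops returning the raw response envelope: the detail endpoint nests the post under an `aweme_detail` key, and the method now unwraps it with a safe default. A minimal sketch of that unwrap, using hypothetical payloads:

```python
from typing import Any, Dict


def unwrap_detail(res: Dict[str, Any]) -> Dict[str, Any]:
    # Same shape as the diff: a missing "aweme_detail" key yields {} so
    # callers can truth-test the result instead of catching KeyError.
    return res.get("aweme_detail", {})


# Hypothetical payloads exercising both branches:
print(unwrap_detail({"aweme_detail": {"aweme_id": "7280854932641664319"}}))  # -> the post dict
print(unwrap_detail({"status_code": 0}))                                     # -> {}
```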
diff --git a/media_platform/douyin/core.py b/media_platform/douyin/core.py
index 35d7af5..a4e2ac4 100644
--- a/media_platform/douyin/core.py
+++ b/media_platform/douyin/core.py
@@ -1,7 +1,7 @@
 import asyncio
 import os
 from asyncio import Task
-from typing import Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple
 
 from playwright.async_api import (BrowserContext, BrowserType, Page,
                                   async_playwright)
@@ -11,7 +11,7 @@
 from base.base_crawler import AbstractCrawler
 from base.proxy_account_pool import AccountPool
 from models import douyin
 from tools import utils
-from var import request_keyword_var
+from var import crawler_type_var
 
 from .client import DOUYINClient
 from .exception import DataFetchError
@@ -64,20 +64,19 @@
         )
         await login_obj.begin()
         await self.dy_client.update_cookies(browser_context=self.browser_context)
-
+        crawler_type_var.set(self.crawler_type)
         if self.crawler_type == "search":
             # Search for notes and retrieve their comment information.
             await self.search()
         elif self.crawler_type == "detail":
             # Get the information and comments of the specified post
-            await self.get_specified_notes()
+            await self.get_specified_awemes()
 
         utils.logger.info("Douyin Crawler finished ...")
 
     async def search(self) -> None:
         utils.logger.info("Begin search douyin keywords")
         for keyword in config.KEYWORDS.split(","):
-            request_keyword_var.set(keyword)
             utils.logger.info(f"Current keyword: {keyword}")
             aweme_list: List[str] = []
             dy_limit_count = 10
@@ -101,10 +100,29 @@
             utils.logger.info(f"keyword:{keyword}, aweme_list:{aweme_list}")
             await self.batch_get_note_comments(aweme_list)
 
-    async def get_specified_notes(self):
+    async def get_specified_awemes(self):
         """Get the information and comments of the specified post"""
-        # todo douyin support
-        pass
+        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
+        task_list = [
+            self.get_aweme_detail(aweme_id=aweme_id, semaphore=semaphore) for aweme_id in config.DY_SPECIFIED_ID_LIST
+        ]
+        aweme_details = await asyncio.gather(*task_list)
+        for aweme_detail in aweme_details:
+            if aweme_detail is not None:
+                await douyin.update_douyin_aweme(aweme_detail)
+        await self.batch_get_note_comments(config.DY_SPECIFIED_ID_LIST)
+
+    async def get_aweme_detail(self, aweme_id: str, semaphore: asyncio.Semaphore) -> Any:
+        """Get aweme detail"""
+        async with semaphore:
+            try:
+                return await self.dy_client.get_video_by_id(aweme_id)
+            except DataFetchError as ex:
+                utils.logger.error(f"Get aweme detail error: {ex}")
+                return None
+            except KeyError as ex:
+                utils.logger.error(f"aweme detail not found, aweme_id: {aweme_id}, err: {ex}")
+                return None
 
     async def batch_get_note_comments(self, aweme_list: List[str]) -> None:
         task_list: List[Task] = []
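The new `get_specified_awemes` / `get_aweme_detail` pair above fans out one coroutine per configured post ID, caps concurrency with a semaphore, and maps failures to `None` so a single bad ID cannot abort the whole batch. A self-contained sketch of that bounded-gather pattern, assuming a `MAX_CONCURRENCY_NUM` value as in the config and with `asyncio.sleep` standing in for the real API call:

```python
import asyncio
from typing import Dict, List, Optional

MAX_CONCURRENCY_NUM = 4  # stand-in for config.MAX_CONCURRENCY_NUM


async def get_detail(aweme_id: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
    async with semaphore:  # at most MAX_CONCURRENCY_NUM fetches run at once
        try:
            await asyncio.sleep(0.1)  # placeholder for the real detail request
            return {"aweme_id": aweme_id}
        except Exception as ex:  # the diff narrows this to DataFetchError / KeyError
            print(f"fetch failed for {aweme_id}: {ex}")
            return None


async def get_all(aweme_ids: List[str]) -> List[Optional[Dict]]:
    semaphore = asyncio.Semaphore(MAX_CONCURRENCY_NUM)
    tasks = [get_detail(aweme_id, semaphore) for aweme_id in aweme_ids]
    return await asyncio.gather(*tasks)  # order preserved; None marks a failure


if __name__ == "__main__":
    print(asyncio.run(get_all(["7280854932641664319", "7202432992642387233"])))
```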
diff --git a/media_platform/xhs/core.py b/media_platform/xhs/core.py
index ce210fe..91611bf 100644
--- a/media_platform/xhs/core.py
+++ b/media_platform/xhs/core.py
@@ -74,13 +74,12 @@
         await login_obj.begin()
         await self.xhs_client.update_cookies(browser_context=self.browser_context)
 
+        crawler_type_var.set(self.crawler_type)
         if self.crawler_type == "search":
             # Search for notes and retrieve their comment information.
-            crawler_type_var.set("search")
             await self.search()
         elif self.crawler_type == "detail":
             # Get the information and comments of the specified post
-            crawler_type_var.set("detail")
             await self.get_specified_notes()
         else:
             pass
diff --git a/models/douyin.py b/models/douyin.py
index 6181133..69b9f19 100644
--- a/models/douyin.py
+++ b/models/douyin.py
@@ -8,7 +8,7 @@
 from tortoise.models import Model
 
 import config
 from tools import utils
-from var import request_keyword_var
+from var import crawler_type_var
 
 class DouyinBaseModel(Model):
@@ -104,9 +104,9 @@
         await DouyinAweme.filter(aweme_id=aweme_id).update(**douyin_data.dict())
     else:
         # Below is a simple way to save it in CSV format.
-        source_keywords = request_keyword_var.get()
         pathlib.Path(f"data/dy").mkdir(parents=True, exist_ok=True)
-        with open(f"data/dy/aweme_{source_keywords}.csv", mode='a+', encoding="utf-8-sig", newline="") as f:
+        save_file_name = f"data/dy/{crawler_type_var.get()}_awemes_{utils.get_current_date()}.csv"
+        with open(save_file_name, mode='a+', encoding="utf-8-sig", newline="") as f:
             writer = csv.writer(f)
             if f.tell() == 0:
                 writer.writerow(local_db_item.keys())
@@ -161,9 +161,10 @@
         comment_pydantic.validate(comment_data)
         await DouyinAwemeComment.filter(comment_id=comment_id).update(**comment_data.dict())
     else:
-        source_keywords = request_keyword_var.get()
+
         pathlib.Path(f"data/dy").mkdir(parents=True, exist_ok=True)
-        with open(f"data/dy/comment_{source_keywords}.csv", mode='a+', encoding="utf-8-sig", newline="") as f:
+        save_file_name = f"data/dy/{crawler_type_var.get()}_comments_{utils.get_current_date()}.csv"
+        with open(save_file_name, mode='a+', encoding="utf-8-sig", newline="") as f:
             writer = csv.writer(f)
             if f.tell() == 0:
                 writer.writerow(local_db_item.keys())
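The thread running through these last hunks is `crawler_type_var`: each crawler's `run()` now sets it once, and `models/douyin.py` reads it when building CSV file names, so search and detail runs land in separate date-stamped files. A sketch of that `ContextVar` flow; the declaration mirrors what `var.py` presumably provides (its default here is an assumption), and the date string stands in for `utils.get_current_date()`:

```python
import contextvars

# Assumed to mirror the crawler_type_var defined in var.py; the default is a guess.
crawler_type_var: contextvars.ContextVar[str] = contextvars.ContextVar("crawler_type", default="")


def aweme_csv_name(date_str: str) -> str:
    # Same f-string shape as models/douyin.py: <crawler type>_awemes_<date>.csv
    return f"data/dy/{crawler_type_var.get()}_awemes_{date_str}.csv"


crawler_type_var.set("detail")       # run() does this once, per the diff
print(aweme_csv_name("2023-09-20"))  # -> data/dy/detail_awemes_2023-09-20.csv
```

Setting the value once in `run()` rather than in each branch also means any future crawler type is covered automatically, which is the point of the xhs/core.py simplification above.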