feat: Douyin supports crawling a specified list of videos

Relakkes 2023-11-18 22:07:30 +08:00
parent 098923d74d
commit 81bc8b51e2
7 changed files with 52 additions and 25 deletions

View File

@@ -22,6 +22,7 @@
 - [x] Douyin login (QR code, phone number, cookies)
 - [x] Douyin slider CAPTCHA drag simulation (accuracy is not great yet)
 - [x] Douyin crawling by specified keywords
+- [x] Douyin crawling of specified posts
 - [x] Keep the logged-in browser context after a successful login
 - [x] Proxy pool (phone number + IP)
 - [x] Concurrent crawler requests
@@ -29,7 +30,7 @@
 - [x] Save data to a database (optional)

 ## To do
-- [ ] Douyin crawling of specified posts
 - [ ] Kuaishou crawler

 ## Usage
@@ -70,7 +71,7 @@
 ## Common runtime errors

 ```shell
 # Q: Crawling Douyin fails with: `execjs._exceptions.ProgramError: SyntaxError: 缺少 ';'`
 # A: That error means the Node.js runtime is missing; install Node.js (version `v16.8.0`) and it goes away
 # Q: Can I crawl by specified keywords?

View File

@@ -33,3 +33,10 @@ XHS_SPECIFIED_ID_LIST = [
     "64ca1b73000000000b028dd2",
     "630d5b85000000001203ab41",
 ]
+
+# douyin specified note id list
+DY_SPECIFIED_ID_LIST = [
+    "7280854932641664319",
+    "7202432992642387233"
+]
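The entries added above are Douyin aweme IDs, i.e. the numeric segment of a video URL such as `https://www.douyin.com/video/7280854932641664319`. A hypothetical helper (not part of this commit or the repository) for turning a video URL into a `DY_SPECIFIED_ID_LIST` entry might look like this:

```python
# Illustrative sketch only: extract the aweme id from a douyin.com/video/<id> style URL.
import re
from typing import Optional


def extract_aweme_id(url: str) -> Optional[str]:
    """Return the numeric aweme id from a Douyin video URL, or None if absent."""
    match = re.search(r"/video/(\d+)", url)
    return match.group(1) if match else None


print(extract_aweme_id("https://www.douyin.com/video/7280854932641664319"))  # 7280854932641664319
```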

View File

@@ -23,13 +23,13 @@ class CrawlerFactory:
 async def main():
     # define command line params ...
     parser = argparse.ArgumentParser(description='Media crawler program.')
-    parser.add_argument('--platform', type=str, help='Media platform select (xhs | dy)', choices=["xhs", "dy"],
-                        default=config.PLATFORM)
+    parser.add_argument('--platform', type=str, help='Media platform select (xhs | dy)',
+                        choices=["xhs", "dy"], default=config.PLATFORM)
     parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)',
                         choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE)
     parser.add_argument('--type', type=str, help='crawler type (search | detail)',
-                        choices=["search","detail"],default=config.CRAWLER_TYPE)
+                        choices=["search", "detail"], default=config.CRAWLER_TYPE)

     # init account pool
     account_pool = proxy_account_pool.create_account_pool()
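The hunk above only re-wraps the existing arguments; crawling the posts listed in `DY_SPECIFIED_ID_LIST` corresponds to running the program with `--platform dy --type detail`. A self-contained sketch of what the parser accepts (the defaults here are placeholders standing in for `config.PLATFORM`, `config.LOGIN_TYPE`, and `config.CRAWLER_TYPE`):

```python
# Stand-alone reproduction of the argument parsing above, with placeholder defaults.
import argparse

parser = argparse.ArgumentParser(description='Media crawler program.')
parser.add_argument('--platform', type=str, help='Media platform select (xhs | dy)',
                    choices=["xhs", "dy"], default="dy")
parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)',
                    choices=["qrcode", "phone", "cookie"], default="qrcode")
parser.add_argument('--type', type=str, help='crawler type (search | detail)',
                    choices=["search", "detail"], default="search")

# Equivalent to: python main.py --platform dy --lt qrcode --type detail
args = parser.parse_args(["--platform", "dy", "--lt", "qrcode", "--type", "detail"])
print(args.platform, args.lt, args.type)  # dy qrcode detail
```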

View File

@@ -1,7 +1,7 @@
 import asyncio
 import copy
 import urllib.parse
-from typing import Callable, Dict, Optional
+from typing import Any, Callable, Dict, Optional

 import execjs
 import httpx
@@ -129,7 +129,7 @@ class DOUYINClient:
         headers["Referer"] = urllib.parse.quote(referer_url, safe=':/')
         return await self.get("/aweme/v1/web/general/search/single/", params, headers=headers)

-    async def get_video_by_id(self, aweme_id: str):
+    async def get_video_by_id(self, aweme_id: str) -> Any:
         """
         DouYin Video Detail API
         :param aweme_id:
@@ -139,9 +139,10 @@
             "aweme_id": aweme_id
         }
         headers = copy.copy(self.headers)
-        headers["Cookie"] = "s_v_web_id=verify_leytkxgn_kvO5kOmO_SdMs_4t1o_B5ml_BUqtWM1mP6BF;"
+        # headers["Cookie"] = "s_v_web_id=verify_lol4a8dv_wpQ1QMyP_xemd_4wON_8Yzr_FJa8DN1vdY2m;"
         del headers["Origin"]
-        return await self.get("/aweme/v1/web/aweme/detail/", params, headers)
+        res = await self.get("/aweme/v1/web/aweme/detail/", params, headers)
+        return res.get("aweme_detail", {})

     async def get_aweme_comments(self, aweme_id: str, cursor: int = 0):
         """get note comments

View File

@@ -1,7 +1,7 @@
 import asyncio
 import os
 from asyncio import Task
-from typing import Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple

 from playwright.async_api import (BrowserContext, BrowserType, Page,
                                   async_playwright)
@@ -11,7 +11,7 @@ from base.base_crawler import AbstractCrawler
 from base.proxy_account_pool import AccountPool
 from models import douyin
 from tools import utils
-from var import request_keyword_var
+from var import crawler_type_var

 from .client import DOUYINClient
 from .exception import DataFetchError
@@ -64,20 +64,19 @@ class DouYinCrawler(AbstractCrawler):
             )
             await login_obj.begin()
             await self.dy_client.update_cookies(browser_context=self.browser_context)

+            crawler_type_var.set(self.crawler_type)
             if self.crawler_type == "search":
                 # Search for notes and retrieve their comment information.
                 await self.search()
             elif self.crawler_type == "detail":
                 # Get the information and comments of the specified post
-                await self.get_specified_notes()
+                await self.get_specified_awemes()

             utils.logger.info("Douyin Crawler finished ...")

     async def search(self) -> None:
         utils.logger.info("Begin search douyin keywords")
         for keyword in config.KEYWORDS.split(","):
-            request_keyword_var.set(keyword)
             utils.logger.info(f"Current keyword: {keyword}")
             aweme_list: List[str] = []
             dy_limit_count = 10
@@ -101,10 +100,29 @@ class DouYinCrawler(AbstractCrawler):
             utils.logger.info(f"keyword:{keyword}, aweme_list:{aweme_list}")
             await self.batch_get_note_comments(aweme_list)

-    async def get_specified_notes(self):
+    async def get_specified_awemes(self):
         """Get the information and comments of the specified post"""
-        # todo douyin support
-        pass
+        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
+        task_list = [
+            self.get_aweme_detail(aweme_id=aweme_id, semaphore=semaphore) for aweme_id in config.DY_SPECIFIED_ID_LIST
+        ]
+        aweme_details = await asyncio.gather(*task_list)
+        for aweme_detail in aweme_details:
+            if aweme_detail is not None:
+                await douyin.update_douyin_aweme(aweme_detail)
+        await self.batch_get_note_comments(config.DY_SPECIFIED_ID_LIST)
+
+    async def get_aweme_detail(self, aweme_id: str, semaphore: asyncio.Semaphore) -> Any:
+        """Get aweme (note) detail"""
+        async with semaphore:
+            try:
+                return await self.dy_client.get_video_by_id(aweme_id)
+            except DataFetchError as ex:
+                utils.logger.error(f"Get aweme detail error: {ex}")
+                return None
+            except KeyError as ex:
+                utils.logger.error(f"have not found note detail aweme_id:{aweme_id}, err: {ex}")
+                return None

     async def batch_get_note_comments(self, aweme_list: List[str]) -> None:
         task_list: List[Task] = []
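The new `get_specified_awemes` follows a gather-with-semaphore pattern: one coroutine per aweme ID, the semaphore caps how many detail requests run at once, and failed fetches come back as `None` and are skipped. A self-contained sketch of the same pattern (the fake fetcher and constant below are illustrative stand-ins, not the project's code):

```python
# Sketch of the concurrency pattern used above: semaphore-bounded fetches gathered at once.
import asyncio
from typing import List, Optional

MAX_CONCURRENCY_NUM = 4  # stand-in for config.MAX_CONCURRENCY_NUM


async def fetch_detail(aweme_id: str) -> Optional[dict]:
    """Stand-in for DOUYINClient.get_video_by_id(); just simulates a network call."""
    await asyncio.sleep(0.1)
    return {"aweme_id": aweme_id}


async def get_specified_awemes(aweme_ids: List[str]) -> List[dict]:
    semaphore = asyncio.Semaphore(MAX_CONCURRENCY_NUM)

    async def bounded(aweme_id: str) -> Optional[dict]:
        async with semaphore:
            return await fetch_detail(aweme_id)

    results = await asyncio.gather(*(bounded(i) for i in aweme_ids))
    # Failed fetches would come back as None and are dropped, as in the crawler above.
    return [r for r in results if r is not None]


if __name__ == "__main__":
    ids = ["7280854932641664319", "7202432992642387233"]
    print(asyncio.run(get_specified_awemes(ids)))
```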

View File

@@ -74,13 +74,12 @@ class XiaoHongShuCrawler(AbstractCrawler):
             await login_obj.begin()
             await self.xhs_client.update_cookies(browser_context=self.browser_context)

+            crawler_type_var.set(self.crawler_type)
             if self.crawler_type == "search":
                 # Search for notes and retrieve their comment information.
-                crawler_type_var.set("search")
                 await self.search()
             elif self.crawler_type == "detail":
                 # Get the information and comments of the specified post
-                crawler_type_var.set("detail")
                 await self.get_specified_notes()
             else:
                 pass
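Both crawlers now call `crawler_type_var.set(...)` once, right after login, instead of inside each branch. Assuming `crawler_type_var` is a `contextvars.ContextVar` defined in `var.py` (that module is not shown in this diff), a minimal sketch of why this works: any downstream code running in the same task context, such as the CSV writers in the models file below, can read the value without it being passed through every call.

```python
# Minimal sketch; the ContextVar declaration here is an assumption about var.py.
import asyncio
from contextvars import ContextVar

crawler_type_var: ContextVar[str] = ContextVar("crawler_type", default="search")


def build_save_path() -> str:
    # Stands in for the storage code below, which derives file names from the crawler type.
    return f"data/dy/{crawler_type_var.get()}_awemes.csv"


async def start(crawler_type: str) -> None:
    crawler_type_var.set(crawler_type)  # set once, before the search/detail branching
    print(build_save_path())            # data/dy/detail_awemes.csv


asyncio.run(start("detail"))
```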

View File

@@ -8,7 +8,7 @@ from tortoise.models import Model

 import config
 from tools import utils
-from var import request_keyword_var
+from var import crawler_type_var


 class DouyinBaseModel(Model):
@@ -104,9 +104,9 @@ async def update_douyin_aweme(aweme_item: Dict):
         await DouyinAweme.filter(aweme_id=aweme_id).update(**douyin_data.dict())
     else:
         # Below is a simple way to save it in CSV format.
-        source_keywords = request_keyword_var.get()
         pathlib.Path(f"data/dy").mkdir(parents=True, exist_ok=True)
-        with open(f"data/dy/aweme_{source_keywords}.csv", mode='a+', encoding="utf-8-sig", newline="") as f:
+        save_file_name = f"data/dy/{crawler_type_var.get()}_awemes_{utils.get_current_date()}.csv"
+        with open(save_file_name, mode='a+', encoding="utf-8-sig", newline="") as f:
             writer = csv.writer(f)
             if f.tell() == 0:
                 writer.writerow(local_db_item.keys())
@@ -161,9 +161,10 @@ async def update_dy_aweme_comment(aweme_id: str, comment_item: Dict):
         comment_pydantic.validate(comment_data)
         await DouyinAwemeComment.filter(comment_id=comment_id).update(**comment_data.dict())
     else:
-        source_keywords = request_keyword_var.get()
         pathlib.Path(f"data/dy").mkdir(parents=True, exist_ok=True)
-        with open(f"data/dy/comment_{source_keywords}.csv", mode='a+', encoding="utf-8-sig", newline="") as f:
+        save_file_name = f"data/dy/{crawler_type_var.get()}_comments_{utils.get_current_date()}.csv"
+        with open(save_file_name, mode='a+', encoding="utf-8-sig", newline="") as f:
             writer = csv.writer(f)
             if f.tell() == 0:
                 writer.writerow(local_db_item.keys())
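With these two hunks, CSV output files are named by crawler type and date rather than by keyword, e.g. `data/dy/detail_awemes_<date>.csv`. A self-contained sketch of the resulting save pattern (the ISO date below is an assumed stand-in for `utils.get_current_date()`, whose exact format is not shown in this diff):

```python
# Sketch of the new CSV naming + append-with-header pattern used above.
import csv
import pathlib
from datetime import date


def save_row(crawler_type: str, kind: str, row: dict) -> None:
    pathlib.Path("data/dy").mkdir(parents=True, exist_ok=True)
    # e.g. data/dy/detail_awemes_2023-11-18.csv (date format is an assumption).
    save_file_name = f"data/dy/{crawler_type}_{kind}_{date.today().isoformat()}.csv"
    with open(save_file_name, mode="a+", encoding="utf-8-sig", newline="") as f:
        writer = csv.writer(f)
        if f.tell() == 0:  # empty file -> write the header row first
            writer.writerow(row.keys())
        writer.writerow(row.values())


save_row("detail", "awemes", {"aweme_id": "7280854932641664319", "desc": "demo"})
```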