feat: support crawling a specified list of Douyin videos

parent 098923d74d
commit 81bc8b51e2
@@ -22,6 +22,7 @@
 - [x] Douyin login (QR code, phone number, cookies)
 - [x] Douyin slider captcha (implemented via simulated sliding; accuracy is not great)
 - [x] Crawl Douyin by specified keywords
+- [x] Crawl specified Douyin posts
 - [x] Keep the browser context after a successful login
 - [x] Proxy pool implementation (phone number + IP)
 - [x] Execute crawler requests concurrently
@@ -29,7 +30,7 @@
 - [x] Save data to a database (optional)

 ## To be implemented
-- [ ] Crawl specified Douyin posts
 - [ ] Kuaishou crawler implementation

 ## Usage
@@ -70,7 +71,7 @@

 ## Common runtime errors

 ```shell
-# Q: 爬去抖音报错: `execjs._exceptions.ProgramError: SyntaxError: 缺少 ';'`
+# Q: 爬取抖音报错: `execjs._exceptions.ProgramError: SyntaxError: 缺少 ';'`
 # A: This error means the Node.js runtime is missing; installing Node.js (version `v16.8.0`) fixes it.

 # Q: Can crawling be restricted to specified keywords?
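On that first FAQ entry: `execjs` uses whatever JS runtime it can find, and without Node.js it may fall back to a runtime that rejects modern JavaScript, which is what produces the `缺少 ';'` (missing ';') error. A quick check that Node.js is on the PATH:

```shell
node -v   # should print a version such as v16.8.0; an error here means Node.js is not installed
```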
@@ -33,3 +33,10 @@ XHS_SPECIFIED_ID_LIST = [
     "64ca1b73000000000b028dd2",
     "630d5b85000000001203ab41",
 ]
+
+
+# douyin specified note id list
+DY_SPECIFIED_ID_LIST = [
+    "7280854932641664319",
+    "7202432992642387233"
+]
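The new `DY_SPECIFIED_ID_LIST` mirrors `XHS_SPECIFIED_ID_LIST`: each entry is the numeric aweme ID of one video. Assuming share links follow the usual `https://www.douyin.com/video/<id>` shape, a helper like this (illustrative, not part of the commit) could turn copied URLs into list entries:

```python
import re

def extract_aweme_id(url: str) -> str:
    """Illustrative helper (not in this commit): pull the numeric aweme ID
    out of a URL like https://www.douyin.com/video/7280854932641664319."""
    match = re.search(r"/video/(\d+)", url)
    if match is None:
        raise ValueError(f"no aweme ID found in: {url}")
    return match.group(1)

print(extract_aweme_id("https://www.douyin.com/video/7280854932641664319"))
# -> 7280854932641664319
```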
main.py
@@ -23,13 +23,13 @@ class CrawlerFactory:
 async def main():
     # define command line params ...
     parser = argparse.ArgumentParser(description='Media crawler program.')
-    parser.add_argument('--platform', type=str, help='Media platform select (xhs | dy)', choices=["xhs", "dy"],
-                        default=config.PLATFORM)
+    parser.add_argument('--platform', type=str, help='Media platform select (xhs | dy)',
+                        choices=["xhs", "dy"], default=config.PLATFORM)
     parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)',
                         choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE)

     parser.add_argument('--type', type=str, help='crawler type (search | detail)',
-                        choices=["search","detail"],default=config.CRAWLER_TYPE)
+                        choices=["search", "detail"], default=config.CRAWLER_TYPE)

     # init account pool
     account_pool = proxy_account_pool.create_account_pool()
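Given these flags, detail mode would presumably be launched like so (a sketch; unset flags fall back to the `config` defaults):

```shell
# crawl the posts listed in DY_SPECIFIED_ID_LIST, logging in via QR code
python main.py --platform dy --lt qrcode --type detail
```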
@@ -1,7 +1,7 @@
 import asyncio
 import copy
 import urllib.parse
-from typing import Callable, Dict, Optional
+from typing import Any, Callable, Dict, Optional

 import execjs
 import httpx
@@ -129,7 +129,7 @@ class DOUYINClient:
         headers["Referer"] = urllib.parse.quote(referer_url, safe=':/')
         return await self.get("/aweme/v1/web/general/search/single/", params, headers=headers)

-    async def get_video_by_id(self, aweme_id: str):
+    async def get_video_by_id(self, aweme_id: str) -> Any:
         """
         DouYin Video Detail API
         :param aweme_id:
@@ -139,9 +139,10 @@ class DOUYINClient:
             "aweme_id": aweme_id
         }
         headers = copy.copy(self.headers)
-        headers["Cookie"] = "s_v_web_id=verify_leytkxgn_kvO5kOmO_SdMs_4t1o_B5ml_BUqtWM1mP6BF;"
+        # headers["Cookie"] = "s_v_web_id=verify_lol4a8dv_wpQ1QMyP_xemd_4wON_8Yzr_FJa8DN1vdY2m;"
         del headers["Origin"]
-        return await self.get("/aweme/v1/web/aweme/detail/", params, headers)
+        res = await self.get("/aweme/v1/web/aweme/detail/", params, headers)
+        return res.get("aweme_detail", {})

     async def get_aweme_comments(self, aweme_id: str, cursor: int = 0):
         """get note comments
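`get_video_by_id` now unwraps the response envelope and returns the `aweme_detail` payload directly, or `{}` when it is absent. A runnable sketch of the calling pattern, with a stub standing in for the real client (the `desc` caption key is an assumption about Douyin's payload):

```python
import asyncio
from typing import Any, Dict

class StubDouyinClient:
    """Stand-in for DOUYINClient: returns the unwrapped aweme_detail dict."""
    async def get_video_by_id(self, aweme_id: str) -> Dict[str, Any]:
        # the real method fetches /aweme/v1/web/aweme/detail/ and returns
        # res.get("aweme_detail", {})
        return {"aweme_id": aweme_id, "desc": "example caption"}

async def main() -> None:
    dy_client = StubDouyinClient()
    aweme_detail = await dy_client.get_video_by_id("7280854932641664319")
    if aweme_detail:                     # {} would mean no "aweme_detail" in the response
        print(aweme_detail.get("desc"))  # caption field (assumption)

asyncio.run(main())
```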
@@ -1,7 +1,7 @@
 import asyncio
 import os
 from asyncio import Task
-from typing import Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple

 from playwright.async_api import (BrowserContext, BrowserType, Page,
                                   async_playwright)
@@ -11,7 +11,7 @@ from base.base_crawler import AbstractCrawler
 from base.proxy_account_pool import AccountPool
 from models import douyin
 from tools import utils
-from var import request_keyword_var
+from var import crawler_type_var

 from .client import DOUYINClient
 from .exception import DataFetchError
@@ -64,20 +64,19 @@ class DouYinCrawler(AbstractCrawler):
             )
             await login_obj.begin()
             await self.dy_client.update_cookies(browser_context=self.browser_context)

+            crawler_type_var.set(self.crawler_type)
             if self.crawler_type == "search":
                 # Search for notes and retrieve their comment information.
                 await self.search()
             elif self.crawler_type == "detail":
                 # Get the information and comments of the specified post
-                await self.get_specified_notes()
+                await self.get_specified_awemes()

             utils.logger.info("Douyin Crawler finished ...")

     async def search(self) -> None:
         utils.logger.info("Begin search douyin keywords")
         for keyword in config.KEYWORDS.split(","):
-            request_keyword_var.set(keyword)
             utils.logger.info(f"Current keyword: {keyword}")
             aweme_list: List[str] = []
             dy_limit_count = 10
@@ -101,10 +100,29 @@ class DouYinCrawler(AbstractCrawler):
             utils.logger.info(f"keyword:{keyword}, aweme_list:{aweme_list}")
             await self.batch_get_note_comments(aweme_list)

-    async def get_specified_notes(self):
+    async def get_specified_awemes(self):
         """Get the information and comments of the specified post"""
-        # todo douyin support
-        pass
+        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
+        task_list = [
+            self.get_aweme_detail(aweme_id=aweme_id, semaphore=semaphore) for aweme_id in config.DY_SPECIFIED_ID_LIST
+        ]
+        aweme_details = await asyncio.gather(*task_list)
+        for aweme_detail in aweme_details:
+            if aweme_detail is not None:
+                await douyin.update_douyin_aweme(aweme_detail)
+        await self.batch_get_note_comments(config.DY_SPECIFIED_ID_LIST)
+
+    async def get_aweme_detail(self, aweme_id: str, semaphore: asyncio.Semaphore) -> Any:
+        """Get aweme detail"""
+        async with semaphore:
+            try:
+                return await self.dy_client.get_video_by_id(aweme_id)
+            except DataFetchError as ex:
+                utils.logger.error(f"Get aweme detail error: {ex}")
+                return None
+            except KeyError as ex:
+                utils.logger.error(f"Aweme detail not found, aweme_id: {aweme_id}, err: {ex}")
+                return None

     async def batch_get_note_comments(self, aweme_list: List[str]) -> None:
         task_list: List[Task] = []
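The new `get_specified_awemes` fans out one task per ID with `asyncio.gather` while an `asyncio.Semaphore` caps how many run at once. A self-contained sketch of that bounded-concurrency pattern (names and the sleep are stand-ins for the real client calls):

```python
import asyncio
from typing import List, Optional

MAX_CONCURRENCY_NUM = 2  # stand-in for config.MAX_CONCURRENCY_NUM

async def get_detail(aweme_id: str, semaphore: asyncio.Semaphore) -> Optional[str]:
    # the semaphore lets at most MAX_CONCURRENCY_NUM bodies run at the same time
    async with semaphore:
        await asyncio.sleep(0.1)  # stand-in for the real HTTP request
        return f"detail-of-{aweme_id}"

async def main() -> None:
    semaphore = asyncio.Semaphore(MAX_CONCURRENCY_NUM)
    aweme_ids: List[str] = ["7280854932641664319", "7202432992642387233"]
    details = await asyncio.gather(*(get_detail(i, semaphore) for i in aweme_ids))
    for detail in details:
        if detail is not None:  # mirrors the crawler, where failed fetches return None
            print(detail)

asyncio.run(main())
```

`asyncio.gather` preserves input order, so each result lines up with its ID.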
@@ -74,13 +74,12 @@ class XiaoHongShuCrawler(AbstractCrawler):
             await login_obj.begin()
             await self.xhs_client.update_cookies(browser_context=self.browser_context)

+            crawler_type_var.set(self.crawler_type)
             if self.crawler_type == "search":
                 # Search for notes and retrieve their comment information.
-                crawler_type_var.set("search")
                 await self.search()
             elif self.crawler_type == "detail":
                 # Get the information and comments of the specified post
-                crawler_type_var.set("detail")
                 await self.get_specified_notes()
             else:
                 pass
@@ -8,7 +8,7 @@ from tortoise.models import Model

 import config
 from tools import utils
-from var import request_keyword_var
+from var import crawler_type_var


 class DouyinBaseModel(Model):
@@ -104,9 +104,9 @@ async def update_douyin_aweme(aweme_item: Dict):
         await DouyinAweme.filter(aweme_id=aweme_id).update(**douyin_data.dict())
     else:
         # Below is a simple way to save it in CSV format.
-        source_keywords = request_keyword_var.get()
         pathlib.Path(f"data/dy").mkdir(parents=True, exist_ok=True)
-        with open(f"data/dy/aweme_{source_keywords}.csv", mode='a+', encoding="utf-8-sig", newline="") as f:
+        save_file_name = f"data/dy/{crawler_type_var.get()}_awemes_{utils.get_current_date()}.csv"
+        with open(save_file_name, mode='a+', encoding="utf-8-sig", newline="") as f:
             writer = csv.writer(f)
             if f.tell() == 0:
                 writer.writerow(local_db_item.keys())
@@ -161,9 +161,10 @@ async def update_dy_aweme_comment(aweme_id: str, comment_item: Dict):
         comment_pydantic.validate(comment_data)
         await DouyinAwemeComment.filter(comment_id=comment_id).update(**comment_data.dict())
     else:
-        source_keywords = request_keyword_var.get()
         pathlib.Path(f"data/dy").mkdir(parents=True, exist_ok=True)
-        with open(f"data/dy/comment_{source_keywords}.csv", mode='a+', encoding="utf-8-sig", newline="") as f:
+        save_file_name = f"data/dy/{crawler_type_var.get()}_comments_{utils.get_current_date()}.csv"
+        with open(save_file_name, mode='a+', encoding="utf-8-sig", newline="") as f:
             writer = csv.writer(f)
             if f.tell() == 0:
                 writer.writerow(local_db_item.keys())
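The CSV files are now named by crawler type and date instead of by keyword. `crawler_type_var` is a `contextvars.ContextVar` (imported from the project's `var` module): the crawler sets it once near its entry point and the model layer reads it here without the value being threaded through every call, which is also why `request_keyword_var` could be dropped from the save path. A minimal sketch of the mechanism (the date literal stands in for `utils.get_current_date()`):

```python
import asyncio
from contextvars import ContextVar

# mirrors the crawler_type_var imported from var.py in the diff
crawler_type_var: ContextVar[str] = ContextVar("crawler_type", default="")

async def save_rows() -> None:
    # deep in the save path: read the value without it being passed as an argument
    print(f"data/dy/{crawler_type_var.get()}_awemes_2023-09-24.csv")

async def main() -> None:
    crawler_type_var.set("detail")  # set once, near the entry point
    await save_rows()               # -> data/dy/detail_awemes_2023-09-24.csv

asyncio.run(main())
```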