parent f24c892471
commit 700946b28a

README.md (30 changed lines)
@@ -16,15 +16,21 @@
 - [x] Xiaohongshu login (QR code, phone number, cookies)
 - [x] Xiaohongshu request signing (Sign)
+- [x] Xiaohongshu crawling by specified keywords
+- [x] Xiaohongshu crawling of specified posts
 - [x] Douyin request signing (Sign)
-- [x] Proxy pool (phone number + IP)
-- [x] Concurrent crawler requests
 - [x] Douyin login (QR code, phone number, cookies)
 - [x] Douyin slider captcha (simulated sliding; accuracy is not great)
+- [x] Douyin crawling by specified keywords
 - [x] Preserve the browser context after a successful login
+- [x] Proxy pool (phone number + IP)
+- [x] Concurrent crawler requests
 - [x] Save data to CSV (default)
 - [x] Save data to a database (optional)
 
+## To be implemented
+
+- [ ] Douyin crawling of specified posts
+- [ ] Kuaishou crawler
+
 ## Usage
@@ -51,13 +57,31 @@
 4. Run the crawler
 
 ```shell
-python main.py --platform xhs --lt qrcode
+# Read keywords from the config file, search for related posts, and crawl their details and comments
+python main.py --platform xhs --lt qrcode --type search
+
+# Read the specified post ID list from the config file and crawl those posts' details and comments
+python main.py --platform xhs --lt qrcode --type detail
 ```
 
 5. Open the corresponding app and scan the QR code to log in
 
 6. Wait for the crawler to finish; the data will be saved under the `data/xhs` directory
 
+## Common runtime errors
+
+```shell
+# Q: Crawling Douyin fails with: `execjs._exceptions.ProgramError: SyntaxError: 缺少 ';'`
+# A: This error means the Node.js environment is missing; install Node.js (version: `v16.8.0`)
+
+# Q: Can I crawl by specified keywords?
+# A: The KEYWORDS parameter in config/base_config.py controls the keywords to crawl
+
+# Q: Can I crawl specified posts?
+# A: The SPECIFIED_ID_LIST parameter in config/base_config.py controls the list of post IDs to crawl
+
+# Q: Crawling works at first but stops returning data after a while?
+# A: This usually means your account has triggered the platform's risk control. ❗️❗️ Do not crawl the platform at scale or disrupt it.
+```
+
 ## Project code structure
base/base_crawler.py

@@ -5,7 +5,7 @@ from base.proxy_account_pool import AccountPool
 
 class AbstractCrawler(ABC):
     @abstractmethod
-    def init_config(self, platform: str, login_type: str, account_pool: AccountPool):
+    def init_config(self, platform: str, login_type: str, account_pool: AccountPool, crawler_type: str):
         pass
 
     @abstractmethod
config/base_config.py

@@ -3,6 +3,7 @@ PLATFORM = "xhs"
 KEYWORDS = "python,golang"
 LOGIN_TYPE = "qrcode"  # qrcode or phone or cookie
 COOKIES = ""  # login by cookie, if login_type is cookie, you must set this value
+CRAWLER_TYPE = "search"
 
 # enable ip proxy
 ENABLE_IP_PROXY = False

@@ -24,3 +25,11 @@ CRAWLER_MAX_NOTES_COUNT = 20
 
 # max concurrency num
 MAX_CONCURRENCY_NUM = 10
+
+
+# specified note id list
+SPECIFIED_ID_LIST = [
+    "6422c2750000000027000d88",
+    "64ca1b73000000000b028dd2",
+    "630d5b85000000001203ab41",
+]
main.py (7 changed lines)
@@ -23,11 +23,13 @@ class CrawlerFactory:
 
 async def main():
     # define command line params ...
     parser = argparse.ArgumentParser(description='Media crawler program.')
-    parser.add_argument('--platform', type=str, help='Media platform select (xhs|dy)', choices=["xhs", "dy"],
+    parser.add_argument('--platform', type=str, help='Media platform select (xhs | dy)', choices=["xhs", "dy"],
                         default=config.PLATFORM)
     parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)',
                         choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE)
 
+    parser.add_argument('--type', type=str, help='crawler type (search | detail)',
+                        choices=["search", "detail"], default=config.CRAWLER_TYPE)
     # init account pool
     account_pool = proxy_account_pool.create_account_pool()

@@ -40,7 +42,8 @@ async def main():
     crawler.init_config(
         platform=args.platform,
         login_type=args.lt,
-        account_pool=account_pool
+        account_pool=account_pool,
+        crawler_type=args.type
     )
     await crawler.start()
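Aside: a minimal, standalone sketch of the new `--type` flag (the same argparse call as in the hunk above; `CRAWLER_TYPE` here stands in for `config.CRAWLER_TYPE`), showing that omitting the flag falls back to the config default:

```python
import argparse

CRAWLER_TYPE = "search"  # stand-in for config.CRAWLER_TYPE

parser = argparse.ArgumentParser(description='Media crawler program.')
parser.add_argument('--type', type=str, help='crawler type (search | detail)',
                    choices=["search", "detail"], default=CRAWLER_TYPE)

print(parser.parse_args(["--type", "detail"]).type)  # -> detail
print(parser.parse_args([]).type)                    # -> search (the config default)
```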
media_platform/douyin/core.py

@@ -21,6 +21,7 @@ from .login import DouYinLogin
 class DouYinCrawler(AbstractCrawler):
     platform: str
     login_type: str
+    crawler_type: str
     context_page: Page
     dy_client: DOUYINClient
     account_pool: AccountPool

@@ -30,10 +31,11 @@ class DouYinCrawler(AbstractCrawler):
         self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"  # fixed
         self.index_url = "https://www.douyin.com"
 
-    def init_config(self, platform: str, login_type: str, account_pool: AccountPool) -> None:
+    def init_config(self, platform: str, login_type: str, account_pool: AccountPool, crawler_type: str) -> None:
         self.platform = platform
         self.login_type = login_type
         self.account_pool = account_pool
+        self.crawler_type = crawler_type
 
     async def start(self) -> None:
         account_phone, playwright_proxy, httpx_proxy = self.create_proxy_info()

@@ -63,8 +65,12 @@ class DouYinCrawler(AbstractCrawler):
             await login_obj.begin()
             await self.dy_client.update_cookies(browser_context=self.browser_context)
 
-        # search_posts
-        await self.search()
+        if self.crawler_type == "search":
+            # Search for notes and retrieve their comment information.
+            await self.search()
+        elif self.crawler_type == "detail":
+            # Get the information and comments of the specified post
+            await self.get_specified_notes()
 
         utils.logger.info("Douyin Crawler finished ...")

@@ -74,7 +80,7 @@ class DouYinCrawler(AbstractCrawler):
             request_keyword_var.set(keyword)
             utils.logger.info(f"Current keyword: {keyword}")
             aweme_list: List[str] = []
-            dy_limit_count = 10  # douyin fixed limit page 10
+            dy_limit_count = 10
             page = 0
             while (page + 1) * dy_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
                 try:

@@ -95,6 +101,11 @@ class DouYinCrawler(AbstractCrawler):
             utils.logger.info(f"keyword:{keyword}, aweme_list:{aweme_list}")
             await self.batch_get_note_comments(aweme_list)
 
+    async def get_specified_notes(self):
+        """Get the information and comments of the specified post"""
+        # TODO: Douyin support
+        pass
+
     async def batch_get_note_comments(self, aweme_list: List[str]) -> None:
         task_list: List[Task] = []
         semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
media_platform/xhs/core.py

@@ -12,7 +12,7 @@ from base.base_crawler import AbstractCrawler
 from base.proxy_account_pool import AccountPool
 from models import xiaohongshu as xhs_model
 from tools import utils
-from var import request_keyword_var
+from var import crawler_type_var
 
 from .client import XHSClient
 from .exception import DataFetchError

@@ -22,6 +22,7 @@ from .login import XHSLogin
 class XiaoHongShuCrawler(AbstractCrawler):
     platform: str
     login_type: str
+    crawler_type: str
     context_page: Page
     xhs_client: XHSClient
     account_pool: AccountPool

@@ -31,10 +32,11 @@ class XiaoHongShuCrawler(AbstractCrawler):
         self.index_url = "https://www.xiaohongshu.com"
         self.user_agent = utils.get_user_agent()
 
-    def init_config(self, platform: str, login_type: str, account_pool: AccountPool) -> None:
+    def init_config(self, platform: str, login_type: str, account_pool: AccountPool, crawler_type: str) -> None:
         self.platform = platform
         self.login_type = login_type
         self.account_pool = account_pool
+        self.crawler_type = crawler_type
 
     async def start(self) -> None:
         account_phone, playwright_proxy, httpx_proxy = self.create_proxy_info()

@@ -72,8 +74,16 @@ class XiaoHongShuCrawler(AbstractCrawler):
             await login_obj.begin()
             await self.xhs_client.update_cookies(browser_context=self.browser_context)
 
-        # Search for notes and retrieve their comment information.
-        await self.search()
+        if self.crawler_type == "search":
+            # Search for notes and retrieve their comment information.
+            crawler_type_var.set("search")
+            await self.search()
+        elif self.crawler_type == "detail":
+            # Get the information and comments of the specified post
+            crawler_type_var.set("detail")
+            await self.get_specified_notes()
+        else:
+            pass
 
         utils.logger.info("Xhs Crawler finished ...")

@@ -82,8 +92,6 @@ class XiaoHongShuCrawler(AbstractCrawler):
         utils.logger.info("Begin search xiaohongshu keywords")
         xhs_limit_count = 20  # xhs limit page fixed value
         for keyword in config.KEYWORDS.split(","):
-            # set keyword to context var
-            request_keyword_var.set(keyword)
             utils.logger.info(f"Current search keyword: {keyword}")
             page = 1
             while page * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:

@@ -107,6 +115,19 @@ class XiaoHongShuCrawler(AbstractCrawler):
             utils.logger.info(f"Note details: {note_details}")
             await self.batch_get_note_comments(note_id_list)
 
+    async def get_specified_notes(self):
+        """Get the information and comments of the specified post"""
+        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
+        task_list = [
+            self.get_note_detail(note_id=note_id, semaphore=semaphore) for note_id in config.SPECIFIED_ID_LIST
+        ]
+        note_details = await asyncio.gather(*task_list)
+        for note_detail in note_details:
+            if note_detail is not None:
+                await xhs_model.update_xhs_note(note_detail)
+        await self.batch_get_note_comments(config.SPECIFIED_ID_LIST)
+
     async def get_note_detail(self, note_id: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
         """Get note detail"""
         async with semaphore:

@@ -115,6 +136,9 @@ class XiaoHongShuCrawler(AbstractCrawler):
             except DataFetchError as ex:
                 utils.logger.error(f"Get note detail error: {ex}")
                 return None
+            except KeyError as ex:
+                utils.logger.error(f"have not found note detail note_id:{note_id}, err: {ex}")
+                return None
 
     async def batch_get_note_comments(self, note_list: List[str]):
         """Batch get note comments"""
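Aside: `get_specified_notes` above uses the bounded-concurrency pattern of `asyncio.gather` over coroutines that share an `asyncio.Semaphore`, so at most `MAX_CONCURRENCY_NUM` note-detail requests are in flight at once. A minimal standalone sketch, where `fetch` and the sleep are stand-ins for the real `get_note_detail` HTTP call:

```python
import asyncio

MAX_CONCURRENCY_NUM = 10  # mirrors config.MAX_CONCURRENCY_NUM

async def fetch(note_id: str, semaphore: asyncio.Semaphore) -> str:
    async with semaphore:          # at most MAX_CONCURRENCY_NUM fetches run concurrently
        await asyncio.sleep(0.1)   # placeholder for the real HTTP request
        return note_id

async def main() -> None:
    semaphore = asyncio.Semaphore(MAX_CONCURRENCY_NUM)
    note_ids = ["6422c2750000000027000d88", "64ca1b73000000000b028dd2"]
    results = await asyncio.gather(*(fetch(note_id, semaphore) for note_id in note_ids))
    print(results)

asyncio.run(main())
```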
models/xiaohongshu.py

@@ -8,7 +8,7 @@ from tortoise.models import Model
 
 import config
 from tools import utils
-from var import request_keyword_var
+from var import crawler_type_var
 
 
 class XhsBaseModel(Model):

@@ -101,9 +101,9 @@ async def update_xhs_note(note_item: Dict):
         await XHSNote.filter(note_id=note_id).update(**note_data.dict())
     else:
         # Below is a simple way to save it in CSV format.
-        source_keywords = request_keyword_var.get()
         pathlib.Path(f"data/xhs").mkdir(parents=True, exist_ok=True)
-        with open(f"data/xhs/notes_{source_keywords}.csv", mode='a+', encoding="utf-8-sig", newline="") as f:
+        save_file_name = f"data/xhs/{crawler_type_var.get()}_notes_{utils.get_current_date()}.csv"
+        with open(save_file_name, mode='a+', encoding="utf-8-sig", newline="") as f:
             writer = csv.writer(f)
             if f.tell() == 0:
                 writer.writerow(local_db_item.keys())

@@ -141,9 +141,9 @@ async def update_xhs_note_comment(note_id: str, comment_item: Dict):
         await XHSNoteComment.filter(comment_id=comment_id).update(**comment_data.dict())
     else:
         # Below is a simple way to save it in CSV format.
-        source_keywords = request_keyword_var.get()
         pathlib.Path(f"data/xhs").mkdir(parents=True, exist_ok=True)
-        with open(f"data/xhs/comment_{source_keywords}.csv", mode='a+', encoding="utf-8-sig", newline="") as f:
+        save_file_name = f"data/xhs/{crawler_type_var.get()}_comment_{utils.get_current_date()}.csv"
+        with open(save_file_name, mode='a+', encoding="utf-8-sig", newline="") as f:
             writer = csv.writer(f)
             if f.tell() == 0:
                 writer.writerow(local_db_item.keys())
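Aside: the CSV naming and append logic introduced above, condensed into a standalone sketch. Assumptions: `append_row` is not a project function, the example file name and date are illustrative, and the inline `time.strftime` call mirrors what `utils.get_current_date()` (added in tools/utils.py below) returns:

```python
import csv
import pathlib
import time

def append_row(row: dict, crawler_type: str = "search") -> None:
    pathlib.Path("data/xhs").mkdir(parents=True, exist_ok=True)
    # e.g. data/xhs/search_notes_2023-08-15.csv -- one file per crawler type and day
    save_file_name = f"data/xhs/{crawler_type}_notes_{time.strftime('%Y-%m-%d')}.csv"
    with open(save_file_name, mode='a+', encoding="utf-8-sig", newline="") as f:
        writer = csv.writer(f)
        if f.tell() == 0:                 # empty file: write the header row once
            writer.writerow(row.keys())
        writer.writerow(row.values())

append_row({"note_id": "6422c2750000000027000d88", "title": "demo"})
```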
tools/utils.py

@@ -267,3 +267,12 @@ def get_tracks(distance: int, level: str = "easy") -> List[int]:
         from . import easing
         _, tricks = easing.get_tracks(distance, seconds=2, ease_func="ease_out_expo")
         return tricks
+
+
+def get_current_time():
+    ISOTIMEFORMAT = '%Y-%m-%d %X'
+    return time.strftime(ISOTIMEFORMAT, time.localtime())
+
+
+def get_current_date():
+    ISOTIMEFORMAT = '%Y-%m-%d'
+    return time.strftime(ISOTIMEFORMAT, time.localtime())
var.py (1 changed line)
@@ -1,3 +1,4 @@
 from contextvars import ContextVar
 
 request_keyword_var: ContextVar[str] = ContextVar("request_keyword", default="")
+crawler_type_var: ContextVar[str] = ContextVar("crawler_type", default="")
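Aside: why a `ContextVar` fits here — a value set once in the crawler coroutine is visible to everything awaited from it, so models/xiaohongshu.py can read the crawler type when building the CSV file name without extra parameter plumbing. A minimal standalone sketch (`save_note` is a stand-in for the model-layer writer):

```python
import asyncio
from contextvars import ContextVar

crawler_type_var: ContextVar[str] = ContextVar("crawler_type", default="")

async def save_note(note_id: str) -> None:
    # Reads the value set by the crawler, with no parameter threading.
    print(f"saving {note_id} in {crawler_type_var.get()} mode")

async def start(crawler_type: str) -> None:
    crawler_type_var.set(crawler_type)
    await save_note("6422c2750000000027000d88")

asyncio.run(start("search"))  # -> saving 6422c2750000000027000d88 in search mode
```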