feat: support saving data to CSV
parent c1a3f06c7a
commit 9177c38521
.gitignore
@@ -164,3 +164,4 @@ cython_debug/
 .idea
 /temp_image/
 /browser_data/
+/data/

README.md
@@ -21,7 +21,8 @@
 - [x] Douyin login (QR code, phone number, cookies)
 - [x] Douyin slider captcha (simulated sliding; accuracy is not great)
 - [x] Preserve the browser context after a successful login
-- [x] Persist data to disk (relational database)
+- [x] Save data to CSV files (default)
+- [x] Save data to a database (optional)
 
 ## Usage

@@ -54,6 +55,9 @@
 
 5. Open the corresponding app and scan the QR code to log in
 
+6. Wait for the crawler to finish; the data is saved under the `data/xhs` directory
+
+
 ## Project structure
 
 ```

@@ -61,10 +65,12 @@ MediaCrawler
 ├── base
 │   ├── base_crawler.py        # abstract crawler class for the project
 │   └── proxy_account_pool.py  # account and IP proxy pool
+├── browser_data               # browser data directory
 ├── config
 │   ├── account_config.py      # account/proxy pool config
 │   ├── base_config.py         # base config
 │   └── db_config.py           # database config
+├── data                       # data output directory
 ├── libs
 │   ├── douyin.js              # Douyin sign function
 │   └── stealth.min.js         # JS that hides browser-automation fingerprints

config/base_config.py
@@ -1,6 +1,6 @@
 # Desc: base config
 PLATFORM = "xhs"
-KEYWORDS = "健身,旅游"
+KEYWORDS = "python,golang"
 LOGIN_TYPE = "qrcode"  # qrcode or phone or cookie
 COOKIES = ""  # login by cookie, if login_type is cookie, you must set this value
 
config/db_config.py
@@ -9,4 +9,4 @@ RELATION_DB_PWD = os.getenv("RELATION_DB_PWD", "123456")  # your relation db password
 RELATION_DB_URL = f"mysql://root:{RELATION_DB_PWD}@localhost:3306/media_crawler"
 
 # save data to database option
-IS_SAVED_DATABASED = True  # if you want to save data to database, set True
+IS_SAVED_DATABASED = False  # if you want to save data to database, set True

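With the new default (`False`), writes go to CSV and the database path stays available behind the flag. A minimal sketch of how storage code branches on this toggle, assuming `config` re-exports the `db_config` values the way the models modules use it (`save_item` and the file names are illustrative, not part of this commit):

```python
import csv
import pathlib
from typing import Dict

import config  # assumed to expose IS_SAVED_DATABASED, as in models/douyin.py


async def save_item(item: Dict) -> None:
    if config.IS_SAVED_DATABASED:
        ...  # tortoise-orm create/update path, as in the models modules
    else:
        # CSV fallback, mirroring the pattern this commit adds to the models.
        pathlib.Path("data/demo").mkdir(parents=True, exist_ok=True)
        with open("data/demo/items.csv", mode="a+", encoding="utf-8-sig", newline="") as f:
            writer = csv.writer(f)
            if f.tell() == 0:  # empty file: write the header row first
                writer.writerow(item.keys())
            writer.writerow(item.values())
```
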
media_platform/douyin/client.py
@@ -8,6 +8,7 @@ import httpx
 from playwright.async_api import BrowserContext, Page
 
 from tools import utils
+from var import request_keyword_var
 
 from .exception import *
 from .field import *

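The client now reads the keyword through `contextvars` instead of a `keywords` parameter. The `var` module itself is not part of this diff; a plausible definition, assuming it is a plain `ContextVar`, would be:

```python
# var.py (assumed contents; the module is imported above but not shown in this diff)
from contextvars import ContextVar

# Search keyword for the current crawl; each asyncio task reads the value
# that was set() in the context it was created from.
request_keyword_var: ContextVar[str] = ContextVar("request_keyword", default="")
```
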
@@ -142,7 +143,7 @@ class DOUYINClient:
         del headers["Origin"]
         return await self.get("/aweme/v1/web/aweme/detail/", params, headers)
 
-    async def get_aweme_comments(self, aweme_id: str, cursor: int = 0, keywords: str = ""):
+    async def get_aweme_comments(self, aweme_id: str, cursor: int = 0):
         """get note comments
 
         """

@@ -153,6 +154,7 @@ class DOUYINClient:
             "count": 20,
             "item_type": 0
         }
+        keywords = request_keyword_var.get()
         referer_url = "https://www.douyin.com/search/" + keywords + '?aid=3a3cec5a-9e27-4040-b6aa-ef548c2c1138&publish_time=0&sort_type=0&source=search_history&type=general'
         headers = copy.copy(self.headers)
         headers["Referer"] = urllib.parse.quote(referer_url, safe=':/')

@@ -164,7 +166,6 @@ class DOUYINClient:
             crawl_interval: float = 1.0,
             is_fetch_sub_comments=False,
             callback: Optional[Callable] = None,
-            keywords: str = ""
     ):
         """
         get note all comments include sub comments

@@ -172,14 +173,13 @@ class DOUYINClient:
         :param crawl_interval:
         :param is_fetch_sub_comments:
         :param callback:
-        :param keywords:
         :return:
         """
         result = []
         comments_has_more = 1
         comments_cursor = 0
         while comments_has_more:
-            comments_res = await self.get_aweme_comments(aweme_id, comments_cursor, keywords)
+            comments_res = await self.get_aweme_comments(aweme_id, comments_cursor)
             comments_has_more = comments_res.get("has_more", 0)
             comments_cursor = comments_res.get("cursor", comments_cursor + 20)
             comments = comments_res.get("comments")

media_platform/douyin/core.py
@@ -11,6 +11,7 @@ from base.base_crawler import AbstractCrawler
 from base.proxy_account_pool import AccountPool
 from models import douyin
 from tools import utils
+from var import request_keyword_var
 
 from .client import DOUYINClient
 from .exception import DataFetchError

@@ -70,14 +71,15 @@ class DouYinCrawler(AbstractCrawler):
     async def search(self) -> None:
         utils.logger.info("Begin search douyin keywords")
         for keyword in config.KEYWORDS.split(","):
+            request_keyword_var.set(keyword)
             utils.logger.info(f"Current keyword: {keyword}")
             aweme_list: List[str] = []
-            dy_limite_count = 10  # douyin fixed limit page 10
+            dy_limit_count = 10  # douyin fixed limit page 10
             page = 0
-            while (page + 1) * dy_limite_count <= config.CRAWLER_MAX_NOTES_COUNT:
+            while (page + 1) * dy_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
                 try:
                     posts_res = await self.dy_client.search_info_by_keyword(keyword=keyword,
-                                                                            offset=page * dy_limite_count)
+                                                                            offset=page * dy_limit_count)
                 except DataFetchError:
                     utils.logger.error(f"search douyin keyword: {keyword} failed")
                     break

@@ -91,23 +93,22 @@ class DouYinCrawler(AbstractCrawler):
                 aweme_list.append(aweme_info.get("aweme_id", ""))
                 await douyin.update_douyin_aweme(aweme_item=aweme_info)
             utils.logger.info(f"keyword:{keyword}, aweme_list:{aweme_list}")
-            await self.batch_get_note_comments(aweme_list, keyword)
+            await self.batch_get_note_comments(aweme_list)
 
-    async def batch_get_note_comments(self, aweme_list: List[str], keywords: str) -> None:
+    async def batch_get_note_comments(self, aweme_list: List[str]) -> None:
         task_list: List[Task] = []
         semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
         for aweme_id in aweme_list:
-            task = asyncio.create_task(self.get_comments(aweme_id, semaphore, keywords), name=aweme_id)
+            task = asyncio.create_task(self.get_comments(aweme_id, semaphore), name=aweme_id)
             task_list.append(task)
         await asyncio.wait(task_list)
 
-    async def get_comments(self, aweme_id: str, semaphore: asyncio.Semaphore, keywords: str) -> None:
+    async def get_comments(self, aweme_id: str, semaphore: asyncio.Semaphore) -> None:
         async with semaphore:
             try:
                 await self.dy_client.get_aweme_all_comments(
                     aweme_id=aweme_id,
                     callback=douyin.batch_update_dy_aweme_comments,
-                    keywords=keywords
                 )
                 utils.logger.info(f"aweme_id: {aweme_id} comments have all been obtained completed ...")
             except DataFetchError as e:

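The keyword no longer has to be threaded through `batch_get_note_comments` and `get_comments`: `asyncio.create_task` snapshots the current `contextvars` context, so each comment task inherits whatever `request_keyword_var.set()` stored before the task was created, and `DOUYINClient` can read it deep in the call stack. A standalone sketch of that mechanism (names here are illustrative):

```python
import asyncio
from contextvars import ContextVar

request_keyword_var: ContextVar[str] = ContextVar("request_keyword", default="")


async def worker(item_id: str) -> None:
    # Reads the keyword from the task's own context -- no parameter threading.
    print(item_id, "->", request_keyword_var.get())


async def main() -> None:
    for keyword in ["python", "golang"]:
        request_keyword_var.set(keyword)
        # create_task() copies the current context, so each task keeps
        # the keyword that was active when it was created.
        tasks = [asyncio.create_task(worker(f"{keyword}-{i}")) for i in range(2)]
        await asyncio.wait(tasks)


asyncio.run(main())
```
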
media_platform/xhs/core.py
@@ -12,6 +12,7 @@ from base.base_crawler import AbstractCrawler
 from base.proxy_account_pool import AccountPool
 from models import xiaohongshu as xhs_model
 from tools import utils
+from var import request_keyword_var
 
 from .client import XHSClient
 from .exception import DataFetchError

@@ -81,6 +82,8 @@ class XiaoHongShuCrawler(AbstractCrawler):
         utils.logger.info("Begin search xiaohongshu keywords")
         xhs_limit_count = 20  # xhs limit page fixed value
         for keyword in config.KEYWORDS.split(","):
+            # set keyword to context var
+            request_keyword_var.set(keyword)
             utils.logger.info(f"Current search keyword: {keyword}")
             page = 1
             while page * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:

models/douyin.py
@@ -1,4 +1,5 @@
 import json
+import csv
+import pathlib
 from typing import Dict, List
 
 from tortoise import fields

@@ -7,6 +8,7 @@ from tortoise.contrib.pydantic import pydantic_model_creator
 
 import config
 from tools import utils
+from var import request_keyword_var
 
 
 class DouyinBaseModel(Model):

@@ -98,6 +100,15 @@ async def update_douyin_aweme(aweme_item: Dict):
             douyin_data = douyin_aweme_pydantic(**local_db_item)
             douyin_aweme_pydantic.validate(douyin_data)
             await DouyinAweme.filter(aweme_id=aweme_id).update(**douyin_data.dict())
+    else:
+        # Below is a simple way to save it in CSV format.
+        source_keywords = request_keyword_var.get()
+        pathlib.Path(f"data/dy").mkdir(parents=True, exist_ok=True)
+        with open(f"data/dy/aweme_{source_keywords}.csv", mode='a+', encoding="utf-8-sig", newline="") as f:
+            writer = csv.writer(f)
+            if f.tell() == 0:
+                writer.writerow(local_db_item.keys())
+            writer.writerow(local_db_item.values())
 
 
 async def batch_update_dy_aweme_comments(aweme_id: str, comments: List[Dict]):

@@ -147,3 +158,11 @@ async def update_dy_aweme_comment(aweme_id: str, comment_item: Dict):
             comment_data = comment_pydantic(**local_db_item)
             comment_pydantic.validate(comment_data)
             await DouyinAwemeComment.filter(comment_id=comment_id).update(**comment_data.dict())
+    else:
+        source_keywords = request_keyword_var.get()
+        pathlib.Path(f"data/dy").mkdir(parents=True, exist_ok=True)
+        with open(f"data/dy/comment_{source_keywords}.csv", mode='a+', encoding="utf-8-sig", newline="") as f:
+            writer = csv.writer(f)
+            if f.tell() == 0:
+                writer.writerow(local_db_item.keys())
+            writer.writerow(local_db_item.values())

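These per-keyword files can be loaded back with the standard library alone. A small sketch — the file name is one the code above would produce for the `python` keyword, and the column names are assumed from the model fields visible in this diff:

```python
import csv

# utf-8-sig on read strips the BOM that was written on save; the BOM is what
# lets Excel auto-detect UTF-8 and render Chinese text correctly.
with open("data/dy/comment_python.csv", encoding="utf-8-sig", newline="") as f:
    for row in csv.DictReader(f):
        print(row["comment_id"], row.get("content", ""))
```
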
models/xiaohongshu.py
@@ -1,3 +1,5 @@
+import csv
+import pathlib
 from typing import Dict, List
 
 from tortoise import fields

@@ -6,6 +8,7 @@ from tortoise.models import Model
 
 import config
 from tools import utils
+from var import request_keyword_var
 
 
 class XhsBaseModel(Model):

@@ -94,6 +97,15 @@ async def update_xhs_note(note_item: Dict):
             note_data = note_pydantic(**local_db_item)
             note_pydantic.validate(note_data)
             await XHSNote.filter(note_id=note_id).update(**note_data.dict())
+    else:
+        # Below is a simple way to save it in CSV format.
+        source_keywords = request_keyword_var.get()
+        pathlib.Path(f"data/xhs").mkdir(parents=True, exist_ok=True)
+        with open(f"data/xhs/notes_{source_keywords}.csv", mode='a+', encoding="utf-8-sig", newline="") as f:
+            writer = csv.writer(f)
+            if f.tell() == 0:
+                writer.writerow(local_db_item.keys())
+            writer.writerow(local_db_item.values())
 
 
 async def update_xhs_note_comment(note_id: str, comment_item: Dict):

@@ -125,3 +137,12 @@ async def update_xhs_note_comment(note_id: str, comment_item: Dict):
             comment_data = comment_pydantic(**local_db_item)
             comment_pydantic.validate(comment_data)
             await XHSNoteComment.filter(comment_id=comment_id).update(**comment_data.dict())
+    else:
+        # Below is a simple way to save it in CSV format.
+        source_keywords = request_keyword_var.get()
+        pathlib.Path(f"data/xhs").mkdir(parents=True, exist_ok=True)
+        with open(f"data/xhs/comment_{source_keywords}.csv", mode='a+', encoding="utf-8-sig", newline="") as f:
+            writer = csv.writer(f)
+            if f.tell() == 0:
+                writer.writerow(local_db_item.keys())
+            writer.writerow(local_db_item.values())

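The same CSV-append block now appears four times across `models/douyin.py` and `models/xiaohongshu.py`; a follow-up could factor it into a small helper. A sketch (`append_csv_row` is a hypothetical name, not part of this commit):

```python
import csv
import pathlib
from typing import Dict


def append_csv_row(platform: str, prefix: str, keyword: str, item: Dict) -> None:
    """Append one record to data/<platform>/<prefix>_<keyword>.csv,
    writing the header row only when the file is newly created."""
    save_dir = pathlib.Path("data") / platform
    save_dir.mkdir(parents=True, exist_ok=True)
    with open(save_dir / f"{prefix}_{keyword}.csv", mode="a+",
              encoding="utf-8-sig", newline="") as f:
        writer = csv.writer(f)
        if f.tell() == 0:  # 'a+' positions at end-of-file; 0 means a brand-new file
            writer.writerow(item.keys())
        writer.writerow(item.values())
```

For example, `append_csv_row("dy", "comment", request_keyword_var.get(), local_db_item)` would replace the block in `update_dy_aweme_comment`.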