diff --git a/.gitignore b/.gitignore
index 03f9900..8996d3e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -164,3 +164,4 @@ cython_debug/
 .idea
 /temp_image/
 /browser_data/
+/data/
diff --git a/README.md b/README.md
index 4fdb488..b7be1e0 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,8 @@
 - [x] Douyin login (QR code, phone number, cookies)
 - [x] Douyin slider captcha (implemented via simulated sliding; accuracy is not great)
 - [x] Keep the logged-in browser context after a successful login
-- [x] Persist data to disk (relational database)
+- [x] Save data to CSV files (default)
+- [x] Save data to a database (optional)
 
 ## Usage
 
@@ -54,6 +55,9 @@
 
 5. Open the corresponding app and scan the QR code to log in
 
+6. Wait for the crawler to finish; the data is saved under the `data/xhs` directory
+
+
 ## Project structure
 
 ```
@@ -61,10 +65,12 @@ MediaCrawler
 ├── base
 │   ├── base_crawler.py        # abstract base classes of the project
 │   └── proxy_account_pool.py  # account and IP proxy pool
+├── browser_data               # browser data directory
 ├── config
 │   ├── account_config.py      # account proxy pool config
 │   ├── base_config.py         # base config
 │   └── db_config.py           # database config
+├── data                       # data output directory
 ├── libs
 │   ├── douyin.js              # Douyin sign function
 │   └── stealth.min.js         # JS that removes browser-automation fingerprints
diff --git a/config/base_config.py b/config/base_config.py
index 56873b1..b8f20c3 100644
--- a/config/base_config.py
+++ b/config/base_config.py
@@ -1,6 +1,6 @@
 # Desc: base config
 PLATFORM = "xhs"
-KEYWORDS = "健身,旅游"
+KEYWORDS = "python,golang"
 LOGIN_TYPE = "qrcode"  # qrcode or phone or cookie
 COOKIES = ""  # login by cookie, if login_type is cookie, you must set this value
 
diff --git a/config/db_config.py b/config/db_config.py
index 1e9d267..270edd3 100644
--- a/config/db_config.py
+++ b/config/db_config.py
@@ -9,4 +9,4 @@ RELATION_DB_PWD = os.getenv("RELATION_DB_PWD", "123456")  # your relation db pas
 RELATION_DB_URL = f"mysql://root:{RELATION_DB_PWD}@localhost:3306/media_crawler"
 
 # save data to database option
-IS_SAVED_DATABASED = True  # if you want to save data to database, set True
+IS_SAVED_DATABASED = False  # if you want to save data to database, set True
diff --git a/media_platform/douyin/client.py b/media_platform/douyin/client.py
index d4f2f3f..818694f 100644
--- a/media_platform/douyin/client.py
+++ b/media_platform/douyin/client.py
@@ -8,6 +8,7 @@ import httpx
 from playwright.async_api import BrowserContext, Page
 
 from tools import utils
+from var import request_keyword_var
 
 from .exception import *
 from .field import *
@@ -142,7 +143,7 @@ class DOUYINClient:
             del headers["Origin"]
         return await self.get("/aweme/v1/web/aweme/detail/", params, headers)
 
-    async def get_aweme_comments(self, aweme_id: str, cursor: int = 0, keywords: str = ""):
+    async def get_aweme_comments(self, aweme_id: str, cursor: int = 0):
         """get note comments
 
         """
@@ -153,6 +154,7 @@
             "count": 20,
             "item_type": 0
         }
+        keywords = request_keyword_var.get()
         referer_url = "https://www.douyin.com/search/" + keywords + '?aid=3a3cec5a-9e27-4040-b6aa-ef548c2c1138&publish_time=0&sort_type=0&source=search_history&type=general'
         headers = copy.copy(self.headers)
         headers["Referer"] = urllib.parse.quote(referer_url, safe=':/')
@@ -164,7 +166,6 @@
                                       crawl_interval: float = 1.0,
                                       is_fetch_sub_comments=False,
                                       callback: Optional[Callable] = None,
-                                      keywords: str = ""
                                       ):
         """
         get note all comments include sub comments
@@ -172,14 +173,13 @@
         :param crawl_interval:
         :param is_fetch_sub_comments:
         :param callback:
-        :param keywords:
         :return:
         """
         result = []
         comments_has_more = 1
         comments_cursor = 0
         while comments_has_more:
-            comments_res = await self.get_aweme_comments(aweme_id, comments_cursor, keywords)
+            comments_res = await self.get_aweme_comments(aweme_id, comments_cursor)
            comments_has_more = comments_res.get("has_more", 0)
             comments_cursor = comments_res.get("cursor", comments_cursor + 20)
             comments = comments_res.get("comments")
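Note: the client.py hunks above drop the `keywords` argument that was threaded through every call and instead read the current keyword from a `ContextVar`. A minimal runnable sketch of that pattern (not part of the patch; `crawl` and `build_referer_url` are hypothetical stand-ins for the crawler loop and `get_aweme_comments`, while `request_keyword_var` mirrors `var.py` at the end of this diff):

```python
import asyncio
from contextvars import ContextVar

# Mirrors var.py from this patch.
request_keyword_var: ContextVar[str] = ContextVar("request_keyword", default="")

async def build_referer_url() -> str:
    # Deep inside the client: read the keyword from the context
    # instead of receiving it as a parameter.
    keyword = request_keyword_var.get()
    return "https://www.douyin.com/search/" + keyword

async def crawl(keyword: str) -> None:
    request_keyword_var.set(keyword)   # set once at the top of the flow
    print(await build_referer_url())   # nested calls see the same value

asyncio.run(crawl("python"))
```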
diff --git a/media_platform/douyin/core.py b/media_platform/douyin/core.py
index 19117f0..92685b2 100644
--- a/media_platform/douyin/core.py
+++ b/media_platform/douyin/core.py
@@ -11,6 +11,7 @@ from base.base_crawler import AbstractCrawler
 from base.proxy_account_pool import AccountPool
 from models import douyin
 from tools import utils
+from var import request_keyword_var
 
 from .client import DOUYINClient
 from .exception import DataFetchError
@@ -70,14 +71,15 @@ class DouYinCrawler(AbstractCrawler):
     async def search(self) -> None:
         utils.logger.info("Begin search douyin keywords")
         for keyword in config.KEYWORDS.split(","):
+            request_keyword_var.set(keyword)
             utils.logger.info(f"Current keyword: {keyword}")
             aweme_list: List[str] = []
-            dy_limite_count = 10  # douyin fixed limit page 10
+            dy_limit_count = 10  # douyin fixed limit page 10
             page = 0
-            while (page + 1) * dy_limite_count <= config.CRAWLER_MAX_NOTES_COUNT:
+            while (page + 1) * dy_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
                 try:
                     posts_res = await self.dy_client.search_info_by_keyword(keyword=keyword,
-                                                                            offset=page * dy_limite_count)
+                                                                            offset=page * dy_limit_count)
                 except DataFetchError:
                     utils.logger.error(f"search douyin keyword: {keyword} failed")
                     break
@@ -91,23 +93,22 @@ class DouYinCrawler(AbstractCrawler):
                 aweme_list.append(aweme_info.get("aweme_id", ""))
                 await douyin.update_douyin_aweme(aweme_item=aweme_info)
             utils.logger.info(f"keyword:{keyword}, aweme_list:{aweme_list}")
-            await self.batch_get_note_comments(aweme_list, keyword)
+            await self.batch_get_note_comments(aweme_list)
 
-    async def batch_get_note_comments(self, aweme_list: List[str], keywords: str) -> None:
+    async def batch_get_note_comments(self, aweme_list: List[str]) -> None:
         task_list: List[Task] = []
         semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
         for aweme_id in aweme_list:
-            task = asyncio.create_task(self.get_comments(aweme_id, semaphore, keywords), name=aweme_id)
+            task = asyncio.create_task(self.get_comments(aweme_id, semaphore), name=aweme_id)
             task_list.append(task)
         await asyncio.wait(task_list)
 
-    async def get_comments(self, aweme_id: str, semaphore: asyncio.Semaphore, keywords: str) -> None:
+    async def get_comments(self, aweme_id: str, semaphore: asyncio.Semaphore) -> None:
         async with semaphore:
             try:
                 await self.dy_client.get_aweme_all_comments(
                     aweme_id=aweme_id,
                     callback=douyin.batch_update_dy_aweme_comments,
-                    keywords=keywords
                 )
                 utils.logger.info(f"aweme_id: {aweme_id} comments have all been obtained completed ...")
             except DataFetchError as e:
diff --git a/media_platform/xhs/core.py b/media_platform/xhs/core.py
index 0e6acdf..d62ac1d 100644
--- a/media_platform/xhs/core.py
+++ b/media_platform/xhs/core.py
@@ -12,6 +12,7 @@ from base.base_crawler import AbstractCrawler
 from base.proxy_account_pool import AccountPool
 from models import xiaohongshu as xhs_model
 from tools import utils
+from var import request_keyword_var
 
 from .client import XHSClient
 from .exception import DataFetchError
@@ -81,6 +82,8 @@ class XiaoHongShuCrawler(AbstractCrawler):
         utils.logger.info("Begin search xiaohongshu keywords")
         xhs_limit_count = 20  # xhs limit page fixed value
         for keyword in config.KEYWORDS.split(","):
+            # set keyword to context var
+            request_keyword_var.set(keyword)
             utils.logger.info(f"Current search keyword: {keyword}")
             page = 1
             while page * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
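Note: both crawlers now call `request_keyword_var.set(keyword)` before spawning the comment-fetching tasks. This stays correct under concurrency because `asyncio.create_task()` runs the coroutine in a copy of the context captured at creation time: each task keeps the keyword that was current when it was created, even after the loop has moved on to the next keyword. A small demo with hypothetical names (not part of the patch):

```python
import asyncio
from contextvars import ContextVar

request_keyword_var: ContextVar[str] = ContextVar("request_keyword", default="")

async def fetch_comments(note_id: str) -> None:
    await asyncio.sleep(0.01)  # simulate network I/O
    # The task sees the keyword captured when it was created.
    print(note_id, "->", request_keyword_var.get())

async def main() -> None:
    tasks = []
    for keyword in "python,golang".split(","):
        request_keyword_var.set(keyword)
        # create_task() snapshots the current context for the new task.
        tasks.append(asyncio.create_task(fetch_comments(f"note-{keyword}")))
    await asyncio.gather(*tasks)  # note-python -> python, note-golang -> golang

asyncio.run(main())
```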
csv +import pathlib from typing import Dict, List from tortoise import fields @@ -7,6 +8,7 @@ from tortoise.contrib.pydantic import pydantic_model_creator import config from tools import utils +from var import request_keyword_var class DouyinBaseModel(Model): @@ -98,6 +100,15 @@ async def update_douyin_aweme(aweme_item: Dict): douyin_data = douyin_aweme_pydantic(**local_db_item) douyin_aweme_pydantic.validate(douyin_data) await DouyinAweme.filter(aweme_id=aweme_id).update(**douyin_data.dict()) + else: + # Below is a simple way to save it in CSV format. + source_keywords = request_keyword_var.get() + pathlib.Path(f"data/dy").mkdir(parents=True, exist_ok=True) + with open(f"data/dy/aweme_{source_keywords}.csv", mode='a+', encoding="utf-8-sig", newline="") as f: + writer = csv.writer(f) + if f.tell() == 0: + writer.writerow(local_db_item.keys()) + writer.writerow(local_db_item.values()) async def batch_update_dy_aweme_comments(aweme_id: str, comments: List[Dict]): @@ -147,3 +158,11 @@ async def update_dy_aweme_comment(aweme_id: str, comment_item: Dict): comment_data = comment_pydantic(**local_db_item) comment_pydantic.validate(comment_data) await DouyinAwemeComment.filter(comment_id=comment_id).update(**comment_data.dict()) + else: + source_keywords = request_keyword_var.get() + pathlib.Path(f"data/dy").mkdir(parents=True, exist_ok=True) + with open(f"data/dy/comment_{source_keywords}.csv", mode='a+', encoding="utf-8-sig", newline="") as f: + writer = csv.writer(f) + if f.tell() == 0: + writer.writerow(local_db_item.keys()) + writer.writerow(local_db_item.values()) diff --git a/models/xiaohongshu.py b/models/xiaohongshu.py index aead421..6ed6f9f 100644 --- a/models/xiaohongshu.py +++ b/models/xiaohongshu.py @@ -1,3 +1,5 @@ +import csv +import pathlib from typing import Dict, List from tortoise import fields @@ -6,6 +8,7 @@ from tortoise.models import Model import config from tools import utils +from var import request_keyword_var class XhsBaseModel(Model): @@ -94,6 +97,15 @@ async def update_xhs_note(note_item: Dict): note_data = note_pydantic(**local_db_item) note_pydantic.validate(note_data) await XHSNote.filter(note_id=note_id).update(**note_data.dict()) + else: + # Below is a simple way to save it in CSV format. + source_keywords = request_keyword_var.get() + pathlib.Path(f"data/xhs").mkdir(parents=True, exist_ok=True) + with open(f"data/xhs/notes_{source_keywords}.csv", mode='a+', encoding="utf-8-sig", newline="") as f: + writer = csv.writer(f) + if f.tell() == 0: + writer.writerow(local_db_item.keys()) + writer.writerow(local_db_item.values()) async def update_xhs_note_comment(note_id: str, comment_item: Dict): @@ -125,3 +137,12 @@ async def update_xhs_note_comment(note_id: str, comment_item: Dict): comment_data = comment_pydantic(**local_db_item) comment_pydantic.validate(comment_data) await XHSNoteComment.filter(comment_id=comment_id).update(**comment_data.dict()) + else: + # Below is a simple way to save it in CSV format. 
diff --git a/models/xiaohongshu.py b/models/xiaohongshu.py
index aead421..6ed6f9f 100644
--- a/models/xiaohongshu.py
+++ b/models/xiaohongshu.py
@@ -1,3 +1,5 @@
+import csv
+import pathlib
 from typing import Dict, List
 
 from tortoise import fields
@@ -6,6 +8,7 @@ from tortoise.models import Model
 
 import config
 from tools import utils
+from var import request_keyword_var
 
 
 class XhsBaseModel(Model):
@@ -94,6 +97,15 @@ async def update_xhs_note(note_item: Dict):
         note_data = note_pydantic(**local_db_item)
         note_pydantic.validate(note_data)
         await XHSNote.filter(note_id=note_id).update(**note_data.dict())
+    else:
+        # Below is a simple way to save it in CSV format.
+        source_keywords = request_keyword_var.get()
+        pathlib.Path(f"data/xhs").mkdir(parents=True, exist_ok=True)
+        with open(f"data/xhs/notes_{source_keywords}.csv", mode='a+', encoding="utf-8-sig", newline="") as f:
+            writer = csv.writer(f)
+            if f.tell() == 0:
+                writer.writerow(local_db_item.keys())
+            writer.writerow(local_db_item.values())
 
 
 async def update_xhs_note_comment(note_id: str, comment_item: Dict):
@@ -125,3 +137,12 @@ async def update_xhs_note_comment(note_id: str, comment_item: Dict):
         comment_data = comment_pydantic(**local_db_item)
         comment_pydantic.validate(comment_data)
         await XHSNoteComment.filter(comment_id=comment_id).update(**comment_data.dict())
+    else:
+        # Below is a simple way to save it in CSV format.
+        source_keywords = request_keyword_var.get()
+        pathlib.Path(f"data/xhs").mkdir(parents=True, exist_ok=True)
+        with open(f"data/xhs/comment_{source_keywords}.csv", mode='a+', encoding="utf-8-sig", newline="") as f:
+            writer = csv.writer(f)
+            if f.tell() == 0:
+                writer.writerow(local_db_item.keys())
+            writer.writerow(local_db_item.values())
diff --git a/var.py b/var.py
new file mode 100644
index 0000000..11c2974
--- /dev/null
+++ b/var.py
@@ -0,0 +1,3 @@
+from contextvars import ContextVar
+
+request_keyword_var: ContextVar[str] = ContextVar("request_keyword", default="")
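Note on `default=""` in var.py: `ContextVar.get()` raises `LookupError` when no value has been set and no default was declared, so the default keeps code paths that run outside a keyword search (and thus never call `set()`) from crashing — they simply see an empty string:

```python
from contextvars import ContextVar

request_keyword_var: ContextVar[str] = ContextVar("request_keyword", default="")
no_default_var: ContextVar[str] = ContextVar("no_default")

print(repr(request_keyword_var.get()))  # '' -- falls back to the declared default

try:
    no_default_var.get()
except LookupError:
    print("get() without a default raises LookupError")
```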