From 700946b28aa06d694b08bfa9137f65aec632396c Mon Sep 17 00:00:00 2001
From: Relakkes
Date: Sat, 18 Nov 2023 13:38:11 +0800
Subject: [PATCH] feat: add crawling of specified posts for Xiaohongshu;
 fix: fix some program bugs; refactor: clean up parts of the code logic
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md                     | 30 +++++++++++++++++++++++++++++---
 base/base_crawler.py          |  2 +-
 config/base_config.py         |  9 +++++++++
 main.py                       |  7 +++++--
 media_platform/douyin/core.py | 19 +++++++++++++++----
 media_platform/xhs/core.py    | 36 ++++++++++++++++++++++++++++++------
 models/xiaohongshu.py         | 10 +++++-----
 tools/utils.py                |  9 +++++++++
 var.py                        |  1 +
 9 files changed, 102 insertions(+), 21 deletions(-)

diff --git a/README.md b/README.md
index 2e9421e..99becb5 100644
--- a/README.md
+++ b/README.md
@@ -16,15 +16,21 @@

 - [x] Xiaohongshu login (QR code, phone number, cookies)
 - [x] Xiaohongshu request signing (Sign)
+- [x] Xiaohongshu crawling by specified keywords
+- [x] Xiaohongshu crawling of specified posts
 - [x] Douyin request signing (Sign)
-- [x] Proxy pool (phone number + IP)
-- [x] Concurrent crawler requests
 - [x] Douyin login (QR code, phone number, cookies)
 - [x] Douyin slider captcha (simulated sliding; accuracy is not great)
+- [x] Douyin crawling by specified keywords
 - [x] Keeping the browser context after a successful login
+- [x] Proxy pool (phone number + IP)
+- [x] Concurrent crawler requests
 - [x] Saving data to CSV (default)
 - [x] Saving data to a database (optional)
+## To do
+- [ ] Douyin crawling of specified posts
+- [ ] Kuaishou crawler

 ## Usage

@@ -51,13 +57,31 @@

 4. Run the crawler

 ```shell
-python main.py --platform xhs --lt qrcode
+# Read keywords from the config file, search for matching posts, and crawl their details and comments
+python main.py --platform xhs --lt qrcode --type search
+
+# Read the specified post ID list from the config file and crawl those posts' details and comments
+python main.py --platform xhs --lt qrcode --type detail
 ```

 5. Open the corresponding app and scan the QR code to log in

 6. Wait for the crawler to finish; the data is saved to the `data/xhs` directory

+## Common runtime errors
+```shell
+# Q: Crawling Douyin fails with `execjs._exceptions.ProgramError: SyntaxError: missing ';'`
+# A: This error means the Node.js runtime is missing; install Node.js (version `v16.8.0`) to fix it
+
+# Q: Can I crawl by specified keywords?
+# A: Yes, the KEYWORDS parameter in config/base_config.py controls which keywords are crawled
+
+# Q: Can I crawl specified posts?
+# A: Yes, the SPECIFIED_ID_LIST parameter in config/base_config.py controls the list of post IDs to crawl
+
+# Q: Crawling works at first but stops returning data after a while?
+# A: That usually means your account has triggered the platform's risk control. ❗️❗️ Please do not crawl the platforms at scale; it puts a real burden on them.
+```

 ## Project structure
diff --git a/base/base_crawler.py b/base/base_crawler.py
index 52ce177..f05ae67 100644
--- a/base/base_crawler.py
+++ b/base/base_crawler.py
@@ -5,7 +5,7 @@ from base.proxy_account_pool import AccountPool

 class AbstractCrawler(ABC):
     @abstractmethod
-    def init_config(self, platform: str, login_type: str, account_pool: AccountPool):
+    def init_config(self, platform: str, login_type: str, account_pool: AccountPool, crawler_type: str):
         pass

     @abstractmethod
diff --git a/config/base_config.py b/config/base_config.py
index b8f20c3..318c58d 100644
--- a/config/base_config.py
+++ b/config/base_config.py
@@ -3,6 +3,7 @@ PLATFORM = "xhs"
 KEYWORDS = "python,golang"
 LOGIN_TYPE = "qrcode" # qrcode or phone or cookie
 COOKIES = "" # login by cookie, if login_type is cookie, you must set this value
+CRAWLER_TYPE = "search" # search or detail

 # enable ip proxy
 ENABLE_IP_PROXY = False
@@ -24,3 +25,11 @@ CRAWLER_MAX_NOTES_COUNT = 20

 # max concurrency num
 MAX_CONCURRENCY_NUM = 10
+
+
+# specified note id list
+SPECIFIED_ID_LIST = [
+    "6422c2750000000027000d88",
+    "64ca1b73000000000b028dd2",
+    "630d5b85000000001203ab41",
+]
diff --git a/main.py b/main.py
index 46c94cf..2b3c711 100644
--- a/main.py
+++ b/main.py
@@ -23,11 +23,13 @@ class CrawlerFactory:

 async def main():
     # define command line params ...
     parser = argparse.ArgumentParser(description='Media crawler program.')
-    parser.add_argument('--platform', type=str, help='Media platform select (xhs|dy)', choices=["xhs", "dy"],
+    parser.add_argument('--platform', type=str, help='Media platform select (xhs | dy)', choices=["xhs", "dy"],
                         default=config.PLATFORM)
     parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)',
                         choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE)
+    parser.add_argument('--type', type=str, help='Crawler type (search | detail)',
+                        choices=["search", "detail"], default=config.CRAWLER_TYPE)

     # init account pool
     account_pool = proxy_account_pool.create_account_pool()
@@ -40,7 +42,8 @@ async def main():
     crawler.init_config(
         platform=args.platform,
         login_type=args.lt,
-        account_pool=account_pool
+        account_pool=account_pool,
+        crawler_type=args.type
     )
     await crawler.start()

diff --git a/media_platform/douyin/core.py b/media_platform/douyin/core.py
index 92685b2..35d7af5 100644
--- a/media_platform/douyin/core.py
+++ b/media_platform/douyin/core.py
@@ -21,6 +21,7 @@ from .login import DouYinLogin
 class DouYinCrawler(AbstractCrawler):
     platform: str
     login_type: str
+    crawler_type: str
     context_page: Page
     dy_client: DOUYINClient
     account_pool: AccountPool
@@ -30,10 +31,11 @@ class DouYinCrawler(AbstractCrawler):
         self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36" # fixed
         self.index_url = "https://www.douyin.com"

-    def init_config(self, platform: str, login_type: str, account_pool: AccountPool) -> None:
+    def init_config(self, platform: str, login_type: str, account_pool: AccountPool, crawler_type: str) -> None:
         self.platform = platform
         self.login_type = login_type
         self.account_pool = account_pool
+        self.crawler_type = crawler_type

     async def start(self) -> None:
         account_phone, playwright_proxy, httpx_proxy = self.create_proxy_info()
@@ -63,8 +65,12 @@ class DouYinCrawler(AbstractCrawler):
             await login_obj.begin()
             await self.dy_client.update_cookies(browser_context=self.browser_context)

-        # search_posts
-        await self.search()
+        if self.crawler_type == "search":
+            # Search for notes and retrieve their comment information.
+            await self.search()
+        elif self.crawler_type == "detail":
+            # Get the information and comments of the specified posts
+            await self.get_specified_notes()

         utils.logger.info("Douyin Crawler finished ...")

@@ -74,7 +80,7 @@ class DouYinCrawler(AbstractCrawler):
             request_keyword_var.set(keyword)
             utils.logger.info(f"Current keyword: {keyword}")
             aweme_list: List[str] = []
-            dy_limit_count = 10 # douyin fixed limit page 10
+            dy_limit_count = 10
             page = 0
             while (page + 1) * dy_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
                 try:
@@ -95,6 +101,11 @@ class DouYinCrawler(AbstractCrawler):
             utils.logger.info(f"keyword:{keyword}, aweme_list:{aweme_list}")
             await self.batch_get_note_comments(aweme_list)

+    async def get_specified_notes(self):
+        """Get the information and comments of the specified posts"""
+        # TODO: not implemented for Douyin yet
+        pass
+
     async def batch_get_note_comments(self, aweme_list: List[str]) -> None:
         task_list: List[Task] = []
         semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)

diff --git a/media_platform/xhs/core.py b/media_platform/xhs/core.py
index 60136cf..87e4c03 100644
--- a/media_platform/xhs/core.py
+++ b/media_platform/xhs/core.py
@@ -12,7 +12,7 @@ from base.base_crawler import AbstractCrawler
 from base.proxy_account_pool import AccountPool
 from models import xiaohongshu as xhs_model
 from tools import utils
-from var import request_keyword_var
+from var import crawler_type_var

 from .client import XHSClient
 from .exception import DataFetchError
@@ -22,6 +22,7 @@ from .login import XHSLogin
 class XiaoHongShuCrawler(AbstractCrawler):
     platform: str
     login_type: str
+    crawler_type: str
     context_page: Page
     xhs_client: XHSClient
     account_pool: AccountPool
@@ -31,10 +32,11 @@ class XiaoHongShuCrawler(AbstractCrawler):
         self.index_url = "https://www.xiaohongshu.com"
         self.user_agent = utils.get_user_agent()

-    def init_config(self, platform: str, login_type: str, account_pool: AccountPool) -> None:
+    def init_config(self, platform: str, login_type: str, account_pool: AccountPool, crawler_type: str) -> None:
         self.platform = platform
         self.login_type = login_type
         self.account_pool = account_pool
+        self.crawler_type = crawler_type

     async def start(self) -> None:
         account_phone, playwright_proxy, httpx_proxy = self.create_proxy_info()
@@ -72,8 +74,16 @@ class XiaoHongShuCrawler(AbstractCrawler):
             await login_obj.begin()
             await self.xhs_client.update_cookies(browser_context=self.browser_context)

-        # Search for notes and retrieve their comment information.
-        await self.search()
+        if self.crawler_type == "search":
+            # Search for notes and retrieve their comment information.
+            crawler_type_var.set("search")
+            await self.search()
+        elif self.crawler_type == "detail":
+            # Get the information and comments of the specified posts
+            crawler_type_var.set("detail")
+            await self.get_specified_notes()
+        else:
+            pass

         utils.logger.info("Xhs Crawler finished ...")

     async def search(self) -> None:
@@ -82,8 +92,6 @@
         utils.logger.info("Begin search xiaohongshu keywords")
         xhs_limit_count = 20 # xhs limit page fixed value
         for keyword in config.KEYWORDS.split(","):
-            # set keyword to context var
-            request_keyword_var.set(keyword)
             utils.logger.info(f"Current search keyword: {keyword}")
             page = 1
             while page * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
@@ -107,6 +115,19 @@
             utils.logger.info(f"Note details: {note_details}")
             await self.batch_get_note_comments(note_id_list)

+    async def get_specified_notes(self):
+        """Get the information and comments of the specified posts"""
+        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
+        task_list = [
+            self.get_note_detail(note_id=note_id, semaphore=semaphore) for note_id in config.SPECIFIED_ID_LIST
+        ]
+        note_details = await asyncio.gather(*task_list)
+        for note_detail in note_details:
+            if note_detail is not None:
+                await xhs_model.update_xhs_note(note_detail)
+        await self.batch_get_note_comments(config.SPECIFIED_ID_LIST)
+
+
     async def get_note_detail(self, note_id: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
         """Get note detail"""
         async with semaphore:
             try:
                 return await self.xhs_client.get_note_by_id(note_id)
             except DataFetchError as ex:
                 utils.logger.error(f"Get note detail error: {ex}")
                 return None
+            except KeyError as ex:
+                utils.logger.error(f"Note detail not found, note_id: {note_id}, err: {ex}")
+                return None

     async def batch_get_note_comments(self, note_list: List[str]):
         """Batch get note comments"""

diff --git a/models/xiaohongshu.py b/models/xiaohongshu.py
index a598948..3c7c5a3 100644
--- a/models/xiaohongshu.py
+++ b/models/xiaohongshu.py
@@ -8,7 +8,7 @@ from tortoise.models import Model

 import config
 from tools import utils
-from var import request_keyword_var
+from var import crawler_type_var


 class XhsBaseModel(Model):
@@ -101,9 +101,9 @@ async def update_xhs_note(note_item: Dict):
         await XHSNote.filter(note_id=note_id).update(**note_data.dict())
     else:
         # Below is a simple way to save it in CSV format.
-        source_keywords = request_keyword_var.get()
         pathlib.Path(f"data/xhs").mkdir(parents=True, exist_ok=True)
-        with open(f"data/xhs/notes_{source_keywords}.csv", mode='a+', encoding="utf-8-sig", newline="") as f:
+        save_file_name = f"data/xhs/{crawler_type_var.get()}_notes_{utils.get_current_date()}.csv"
+        with open(save_file_name, mode='a+', encoding="utf-8-sig", newline="") as f:
             writer = csv.writer(f)
             if f.tell() == 0:
                 writer.writerow(local_db_item.keys())
@@ -141,9 +141,9 @@ async def update_xhs_note_comment(note_id: str, comment_item: Dict):
         await XHSNoteComment.filter(comment_id=comment_id).update(**comment_data.dict())
     else:
         # Below is a simple way to save it in CSV format.
-        source_keywords = request_keyword_var.get()
         pathlib.Path(f"data/xhs").mkdir(parents=True, exist_ok=True)
-        with open(f"data/xhs/comment_{source_keywords}.csv", mode='a+', encoding="utf-8-sig", newline="") as f:
+        save_file_name = f"data/xhs/{crawler_type_var.get()}_comment_{utils.get_current_date()}.csv"
+        with open(save_file_name, mode='a+', encoding="utf-8-sig", newline="") as f:
             writer = csv.writer(f)
             if f.tell() == 0:
                 writer.writerow(local_db_item.keys())
diff --git a/tools/utils.py b/tools/utils.py
index 7bddfc4..250604a 100644
--- a/tools/utils.py
+++ b/tools/utils.py
@@ -267,3 +267,12 @@ def get_tracks(distance: int, level: str = "easy") -> List[int]:
     from . import easing
     _, tricks = easing.get_tracks(distance, seconds=2, ease_func="ease_out_expo")
     return tricks
+
+
+def get_current_time():
+    ISOTIMEFORMAT = '%Y-%m-%d %X'
+    return time.strftime(ISOTIMEFORMAT, time.localtime())
+
+def get_current_date():
+    ISOTIMEFORMAT = '%Y-%m-%d'
+    return time.strftime(ISOTIMEFORMAT, time.localtime())
\ No newline at end of file
diff --git a/var.py b/var.py
index 11c2974..e1b6368 100644
--- a/var.py
+++ b/var.py
@@ -1,3 +1,4 @@
 from contextvars import ContextVar

 request_keyword_var: ContextVar[str] = ContextVar("request_keyword", default="")
+crawler_type_var: ContextVar[str] = ContextVar("crawler_type", default="")
\ No newline at end of file
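For context, here is a minimal, self-contained sketch (illustration only, not part of the patch) of how the pieces added above fit together: the `crawler_type_var` context variable from `var.py` and the new `get_current_date()` helper from `tools/utils.py` combine to build the date-stamped CSV paths used in `models/xiaohongshu.py`. The concrete values ("detail", the printed date) are assumptions for demonstration.

```python
# Illustration only, not part of the patch: how the new CSV file names are
# derived from crawler_type_var and get_current_date().
import time
from contextvars import ContextVar

crawler_type_var: ContextVar[str] = ContextVar("crawler_type", default="")


def get_current_date() -> str:
    # Same format string the patch adds in tools/utils.py
    return time.strftime('%Y-%m-%d', time.localtime())


def note_csv_path() -> str:
    # Mirrors the save_file_name expression in models/xiaohongshu.py,
    # e.g. "data/xhs/detail_notes_2023-11-18.csv" when running in detail mode.
    return f"data/xhs/{crawler_type_var.get()}_notes_{get_current_date()}.csv"


crawler_type_var.set("detail")  # set by XiaoHongShuCrawler.start() in the real code
print(note_csv_path())
```

Because the file name now depends on the crawler type and the current date rather than the search keyword, search and detail runs write to separate files, and repeated runs on the same day append to the same CSV.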