From 700946b28aa06d694b08bfa9137f65aec632396c Mon Sep 17 00:00:00 2001
From: Relakkes
Date: Sat, 18 Nov 2023 13:38:11 +0800
Subject: [PATCH] feat: add crawling of specified posts for Xiaohongshu;
 fix: fix some program bugs; refactor: clean up parts of the code logic
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md                     | 30 +++++++++++++++++++++++++++++---
 base/base_crawler.py          |  2 +-
 config/base_config.py         |  9 +++++++++
 main.py                       |  7 +++++--
 media_platform/douyin/core.py | 19 +++++++++++++++----
 media_platform/xhs/core.py    | 36 ++++++++++++++++++++++++++++++------
 models/xiaohongshu.py         | 10 +++++-----
 tools/utils.py                |  9 +++++++++
 var.py                        |  1 +
 9 files changed, 102 insertions(+), 21 deletions(-)

diff --git a/README.md b/README.md
index 2e9421e..99becb5 100644
--- a/README.md
+++ b/README.md
@@ -16,15 +16,21 @@

 - [x] Xiaohongshu login (QR code, phone number, cookies)
 - [x] Xiaohongshu request signing (Sign)
+- [x] Xiaohongshu crawling by specified keywords
+- [x] Xiaohongshu crawling of specified posts
 - [x] Douyin request signing (Sign)
-- [x] Proxy pool (phone number + IP)
-- [x] Concurrent crawler requests
 - [x] Douyin login (QR code, phone number, cookies)
 - [x] Douyin slider captcha (simulated sliding; accuracy is not great)
+- [x] Douyin crawling by specified keywords
 - [x] Keeping the browser context after a successful login
+- [x] Proxy pool (phone number + IP)
+- [x] Concurrent crawler requests
 - [x] Saving data to CSV (default)
 - [x] Saving data to a database (optional)
+## To do
+- [ ] Douyin crawling of specified posts
+- [ ] Kuaishou crawler

 ## Usage

@@ -51,13 +57,31 @@

 4. Run the crawler

 ```shell
-python main.py --platform xhs --lt qrcode
+# Read keywords from the config file, search for matching posts, and crawl their details and comments
+python main.py --platform xhs --lt qrcode --type search
+
+# Read the specified post ID list from the config file and crawl those posts' details and comments
+python main.py --platform xhs --lt qrcode --type detail
 ```

 5. Open the corresponding app and scan the QR code to log in

 6. Wait for the crawler to finish; the data is saved to the `data/xhs` directory

+## Common runtime errors
+```shell
+# Q: Crawling Douyin fails with `execjs._exceptions.ProgramError: SyntaxError: missing ';'`
+# A: This error means the Node.js runtime is missing; install Node.js (version `v16.8.0`) to fix it
+
+# Q: Can I crawl by specified keywords?
+# A: Yes, the KEYWORDS parameter in config/base_config.py controls which keywords are crawled
+
+# Q: Can I crawl specified posts?
+# A: Yes, the SPECIFIED_ID_LIST parameter in config/base_config.py controls the list of post IDs to crawl
+
+# Q: Crawling works at first but stops returning data after a while?
+# A: That usually means your account has triggered the platform's risk control. ❗️❗️ Please do not crawl the platforms at scale; it puts a real burden on them.
+```

 ## Project structure
diff --git a/base/base_crawler.py b/base/base_crawler.py
index 52ce177..f05ae67 100644
--- a/base/base_crawler.py
+++ b/base/base_crawler.py
@@ -5,7 +5,7 @@ from base.proxy_account_pool import AccountPool

 class AbstractCrawler(ABC):
     @abstractmethod
-    def init_config(self, platform: str, login_type: str, account_pool: AccountPool):
+    def init_config(self, platform: str, login_type: str, account_pool: AccountPool, crawler_type: str):
         pass

     @abstractmethod
diff --git a/config/base_config.py b/config/base_config.py
index b8f20c3..318c58d 100644
--- a/config/base_config.py
+++ b/config/base_config.py
@@ -3,6 +3,7 @@ PLATFORM = "xhs"
 KEYWORDS = "python,golang"
 LOGIN_TYPE = "qrcode" # qrcode or phone or cookie
 COOKIES = "" # login by cookie, if login_type is cookie, you must set this value
+CRAWLER_TYPE = "search" # search or detail

 # enable ip proxy
 ENABLE_IP_PROXY = False
@@ -24,3 +25,11 @@ CRAWLER_MAX_NOTES_COUNT = 20

 # max concurrency num
 MAX_CONCURRENCY_NUM = 10
+
+
+# specified note id list
+SPECIFIED_ID_LIST = [
+    "6422c2750000000027000d88",
+    "64ca1b73000000000b028dd2",
+    "630d5b85000000001203ab41",
+]
diff --git a/main.py b/main.py
index 46c94cf..2b3c711 100644
--- a/main.py
+++ b/main.py
@@ -23,11 +23,13 @@ class CrawlerFactory:

 async def main():
     # define command line params ...
     parser = argparse.ArgumentParser(description='Media crawler program.')
-    parser.add_argument('--platform', type=str, help='Media platform select (xhs|dy)', choices=["xhs", "dy"],
+    parser.add_argument('--platform', type=str, help='Media platform select (xhs | dy)', choices=["xhs", "dy"],
                         default=config.PLATFORM)
     parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)',
                         choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE)
+    parser.add_argument('--type', type=str, help='Crawler type (search | detail)',
+                        choices=["search", "detail"], default=config.CRAWLER_TYPE)

     # init account pool
     account_pool = proxy_account_pool.create_account_pool()
@@ -40,7 +42,8 @@ async def main():
     crawler.init_config(
         platform=args.platform,
         login_type=args.lt,
-        account_pool=account_pool
+        account_pool=account_pool,
+        crawler_type=args.type
     )
     await crawler.start()

diff --git a/media_platform/douyin/core.py b/media_platform/douyin/core.py
index 92685b2..35d7af5 100644
--- a/media_platform/douyin/core.py
+++ b/media_platform/douyin/core.py
@@ -21,6 +21,7 @@ from .login import DouYinLogin
 class DouYinCrawler(AbstractCrawler):
     platform: str
     login_type: str
+    crawler_type: str
     context_page: Page
     dy_client: DOUYINClient
     account_pool: AccountPool
@@ -30,10 +31,11 @@ class DouYinCrawler(AbstractCrawler):
         self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36" # fixed
         self.index_url = "https://www.douyin.com"

-    def init_config(self, platform: str, login_type: str, account_pool: AccountPool) -> None:
+    def init_config(self, platform: str, login_type: str, account_pool: AccountPool, crawler_type: str) -> None:
         self.platform = platform
         self.login_type = login_type
         self.account_pool = account_pool
+        self.crawler_type = crawler_type

     async def start(self) -> None:
         account_phone, playwright_proxy, httpx_proxy = self.create_proxy_info()
@@ -63,8 +65,12 @@ class DouYinCrawler(AbstractCrawler):
             await login_obj.begin()
             await self.dy_client.update_cookies(browser_context=self.browser_context)

-        # search_posts
-        await self.search()
+        if self.crawler_type == "search":
+            # Search for notes and retrieve their comment information.
+            await self.search()
+        elif self.crawler_type == "detail":
+            # Get the information and comments of the specified posts
+            await self.get_specified_notes()

         utils.logger.info("Douyin Crawler finished ...")

@@ -74,7 +80,7 @@ class DouYinCrawler(AbstractCrawler):
             request_keyword_var.set(keyword)
             utils.logger.info(f"Current keyword: {keyword}")
             aweme_list: List[str] = []
-            dy_limit_count = 10 # douyin fixed limit page 10
+            dy_limit_count = 10
             page = 0
             while (page + 1) * dy_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
                 try:
@@ -95,6 +101,11 @@ class DouYinCrawler(AbstractCrawler):
             utils.logger.info(f"keyword:{keyword}, aweme_list:{aweme_list}")
             await self.batch_get_note_comments(aweme_list)

+    async def get_specified_notes(self):
+        """Get the information and comments of the specified posts"""
+        # TODO: not implemented for Douyin yet
+        pass
+
     async def batch_get_note_comments(self, aweme_list: List[str]) -> None:
         task_list: List[Task] = []
         semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)

diff --git a/media_platform/xhs/core.py b/media_platform/xhs/core.py
index 60136cf..87e4c03 100644
--- a/media_platform/xhs/core.py
+++ b/media_platform/xhs/core.py
@@ -12,7 +12,7 @@ from base.base_crawler import AbstractCrawler
 from base.proxy_account_pool import AccountPool
 from models import xiaohongshu as xhs_model
 from tools import utils
-from var import request_keyword_var
+from var import crawler_type_var

 from .client import XHSClient
 from .exception import DataFetchError
@@ -22,6 +22,7 @@ from .login import XHSLogin
 class XiaoHongShuCrawler(AbstractCrawler):
     platform: str
     login_type: str
+    crawler_type: str
     context_page: Page
     xhs_client: XHSClient
     account_pool: AccountPool
@@ -31,10 +32,11 @@ class XiaoHongShuCrawler(AbstractCrawler):
         self.index_url = "https://www.xiaohongshu.com"
         self.user_agent = utils.get_user_agent()

-    def init_config(self, platform: str, login_type: str, account_pool: AccountPool) -> None:
+    def init_config(self, platform: str, login_type: str, account_pool: AccountPool, crawler_type: str) -> None:
         self.platform = platform
         self.login_type = login_type
         self.account_pool = account_pool
+        self.crawler_type = crawler_type

     async def start(self) -> None:
         account_phone, playwright_proxy, httpx_proxy = self.create_proxy_info()
@@ -72,8 +74,16 @@ class XiaoHongShuCrawler(AbstractCrawler):
             await login_obj.begin()
             await self.xhs_client.update_cookies(browser_context=self.browser_context)

-        # Search for notes and retrieve their comment information.
-        await self.search()
+        if self.crawler_type == "search":
+            # Search for notes and retrieve their comment information.
+            crawler_type_var.set("search")
+            await self.search()
+        elif self.crawler_type == "detail":
+            # Get the information and comments of the specified posts
+            crawler_type_var.set("detail")
+            await self.get_specified_notes()
+        else:
+            pass

         utils.logger.info("Xhs Crawler finished ...")

     async def search(self) -> None:
@@ -82,8 +92,6 @@
         utils.logger.info("Begin search xiaohongshu keywords")
         xhs_limit_count = 20 # xhs limit page fixed value
         for keyword in config.KEYWORDS.split(","):
-            # set keyword to context var
-            request_keyword_var.set(keyword)
             utils.logger.info(f"Current search keyword: {keyword}")
             page = 1
             while page * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
@@ -107,6 +115,19 @@
             utils.logger.info(f"Note details: {note_details}")
             await self.batch_get_note_comments(note_id_list)

+    async def get_specified_notes(self):
+        """Get the information and comments of the specified posts"""
+        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
+        task_list = [
+            self.get_note_detail(note_id=note_id, semaphore=semaphore) for note_id in config.SPECIFIED_ID_LIST
+        ]
+        note_details = await asyncio.gather(*task_list)
+        for note_detail in note_details:
+            if note_detail is not None:
+                await xhs_model.update_xhs_note(note_detail)
+        await self.batch_get_note_comments(config.SPECIFIED_ID_LIST)
+
+
     async def get_note_detail(self, note_id: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
         """Get note detail"""
         async with semaphore:
             try:
                 return await self.xhs_client.get_note_by_id(note_id)
             except DataFetchError as ex:
                 utils.logger.error(f"Get note detail error: {ex}")
                 return None
+            except KeyError as ex:
+                utils.logger.error(f"Note detail not found, note_id: {note_id}, err: {ex}")
+                return None

     async def batch_get_note_comments(self, note_list: List[str]):
         """Batch get note comments"""

diff --git a/models/xiaohongshu.py b/models/xiaohongshu.py
index a598948..3c7c5a3 100644
--- a/models/xiaohongshu.py
+++ b/models/xiaohongshu.py
@@ -8,7 +8,7 @@ from tortoise.models import Model

 import config
 from tools import utils
-from var import request_keyword_var
+from var import crawler_type_var


 class XhsBaseModel(Model):
@@ -101,9 +101,9 @@ async def update_xhs_note(note_item: Dict):
         await XHSNote.filter(note_id=note_id).update(**note_data.dict())
     else:
         # Below is a simple way to save it in CSV format.
-        source_keywords = request_keyword_var.get()
         pathlib.Path(f"data/xhs").mkdir(parents=True, exist_ok=True)
-        with open(f"data/xhs/notes_{source_keywords}.csv", mode='a+', encoding="utf-8-sig", newline="") as f:
+        save_file_name = f"data/xhs/{crawler_type_var.get()}_notes_{utils.get_current_date()}.csv"
+        with open(save_file_name, mode='a+', encoding="utf-8-sig", newline="") as f:
             writer = csv.writer(f)
             if f.tell() == 0:
                 writer.writerow(local_db_item.keys())
@@ -141,9 +141,9 @@ async def update_xhs_note_comment(note_id: str, comment_item: Dict):
         await XHSNoteComment.filter(comment_id=comment_id).update(**comment_data.dict())
     else:
         # Below is a simple way to save it in CSV format.
-        source_keywords = request_keyword_var.get()
         pathlib.Path(f"data/xhs").mkdir(parents=True, exist_ok=True)
-        with open(f"data/xhs/comment_{source_keywords}.csv", mode='a+', encoding="utf-8-sig", newline="") as f:
+        save_file_name = f"data/xhs/{crawler_type_var.get()}_comment_{utils.get_current_date()}.csv"
+        with open(save_file_name, mode='a+', encoding="utf-8-sig", newline="") as f:
             writer = csv.writer(f)
             if f.tell() == 0:
                 writer.writerow(local_db_item.keys())
diff --git a/tools/utils.py b/tools/utils.py
index 7bddfc4..250604a 100644
--- a/tools/utils.py
+++ b/tools/utils.py
@@ -267,3 +267,12 @@ def get_tracks(distance: int, level: str = "easy") -> List[int]:
     from . import easing
     _, tricks = easing.get_tracks(distance, seconds=2, ease_func="ease_out_expo")
     return tricks
+
+
+def get_current_time():
+    ISOTIMEFORMAT = '%Y-%m-%d %X'
+    return time.strftime(ISOTIMEFORMAT, time.localtime())
+
+def get_current_date():
+    ISOTIMEFORMAT = '%Y-%m-%d'
+    return time.strftime(ISOTIMEFORMAT, time.localtime())
\ No newline at end of file
diff --git a/var.py b/var.py
index 11c2974..e1b6368 100644
--- a/var.py
+++ b/var.py
@@ -1,3 +1,4 @@
 from contextvars import ContextVar

 request_keyword_var: ContextVar[str] = ContextVar("request_keyword", default="")
+crawler_type_var: ContextVar[str] = ContextVar("crawler_type", default="")
\ No newline at end of file
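For context, here is a minimal, self-contained sketch (illustration only, not part of the patch) of how the pieces added above fit together: the `crawler_type_var` context variable from `var.py` and the new `get_current_date()` helper from `tools/utils.py` combine to build the date-stamped CSV paths used in `models/xiaohongshu.py`. The concrete values ("detail", the printed date) are assumptions for demonstration.

```python
# Illustration only, not part of the patch: how the new CSV file names are
# derived from crawler_type_var and get_current_date().
import time
from contextvars import ContextVar

crawler_type_var: ContextVar[str] = ContextVar("crawler_type", default="")


def get_current_date() -> str:
    # Same format string the patch adds in tools/utils.py
    return time.strftime('%Y-%m-%d', time.localtime())


def note_csv_path() -> str:
    # Mirrors the save_file_name expression in models/xiaohongshu.py,
    # e.g. "data/xhs/detail_notes_2023-11-18.csv" when running in detail mode.
    return f"data/xhs/{crawler_type_var.get()}_notes_{get_current_date()}.csv"


crawler_type_var.set("detail")  # set by XiaoHongShuCrawler.start() in the real code
print(note_csv_path())
```

Because the file name now depends on the crawler type and the current date rather than the search keyword, search and detail runs write to separate files, and repeated runs on the same day append to the same CSV.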