From 1115b0d90c96be2c001c3cfcb90ff0ebda07959a Mon Sep 17 00:00:00 2001
From: Tianci-King <1641523786@qq.com>
Date: Fri, 12 Apr 2024 00:52:47 +0800
Subject: [PATCH] feat(core): add a start_page parameter so the crawler can be
 told which page to start from; perf(argparse): add start page and keyword
 arguments to the command-line parser
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore                      |  4 ++-
 base/base_crawler.py            |  2 +-
 config/base_config.py           |  3 ++
 main.py                         | 10 ++++--
 media_platform/bilibili/core.py | 14 ++++++--
 media_platform/douyin/core.py   | 13 +++++--
 media_platform/kuaishou/core.py | 14 ++++++--
 media_platform/weibo/core.py    | 14 ++++++--
 media_platform/xhs/core.py      | 60 ++++++++++++++++++++-------------
 9 files changed, 94 insertions(+), 40 deletions(-)

diff --git a/.gitignore b/.gitignore
index 8402108..4b088ab 100644
--- a/.gitignore
+++ b/.gitignore
@@ -165,4 +165,6 @@ cython_debug/
 /temp_image/
 /browser_data/
 /data/
-/cache
\ No newline at end of file
+/cache
+
+*/.DS_Store
\ No newline at end of file
diff --git a/base/base_crawler.py b/base/base_crawler.py
index 2a5b69f..6817c69 100644
--- a/base/base_crawler.py
+++ b/base/base_crawler.py
@@ -6,7 +6,7 @@ from playwright.async_api import BrowserContext, BrowserType
 
 class AbstractCrawler(ABC):
     @abstractmethod
-    def init_config(self, platform: str, login_type: str, crawler_type: str):
+    def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
         pass
 
     @abstractmethod
diff --git a/config/base_config.py b/config/base_config.py
index 26761f8..309654d 100644
--- a/config/base_config.py
+++ b/config/base_config.py
@@ -27,6 +27,9 @@ SAVE_DATA_OPTION = "json"  # csv or db or json
 # 用户浏览器缓存的浏览器文件配置
 USER_DATA_DIR = "%s_user_data_dir"  # %s will be replaced by platform name
 
+# Start page for crawling (defaults to the first page)
+START_PAGE = 1
+
 # 爬取视频/帖子的数量控制
 CRAWLER_MAX_NOTES_COUNT = 20
 
diff --git a/main.py b/main.py
index 7c5902a..94e4d20 100644
--- a/main.py
+++ b/main.py
@@ -38,7 +38,11 @@ async def main():
                         choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE)
     parser.add_argument('--type', type=str, help='crawler type (search | detail | creator)',
                         choices=["search", "detail", "creator"], default=config.CRAWLER_TYPE)
-
+    parser.add_argument('--start', type=int, help='start page number',
+                        default=config.START_PAGE)
+    parser.add_argument('--keyword', type=str, help='search keywords (comma separated)',
+                        default=config.KEYWORDS)
+
     # init db
     if config.SAVE_DATA_OPTION == "db":
         await db.init_db()
@@ -48,7 +52,9 @@ async def main():
     crawler.init_config(
         platform=args.platform,
         login_type=args.lt,
-        crawler_type=args.type
+        crawler_type=args.type,
+        start_page=args.start,
+        keyword=args.keyword
     )
     await crawler.start()
 
diff --git a/media_platform/bilibili/core.py b/media_platform/bilibili/core.py
index 758ae0f..4a30e5f 100644
--- a/media_platform/bilibili/core.py
+++ b/media_platform/bilibili/core.py
@@ -37,10 +37,12 @@ class BilibiliCrawler(AbstractCrawler):
         self.index_url = "https://www.bilibili.com"
         self.user_agent = utils.get_user_agent()
 
-    def init_config(self, platform: str, login_type: str, crawler_type: str):
+    def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
         self.platform = platform
         self.login_type = login_type
         self.crawler_type = crawler_type
+        self.start_page = start_page
+        self.keyword = keyword
 
     async def start(self):
         playwright_proxy_format, httpx_proxy_format = None, None
@@ -96,10 +98,16 @@ class BilibiliCrawler(AbstractCrawler):
         bili_limit_count =20 # bilibili limit page fixed value
         if config.CRAWLER_MAX_NOTES_COUNT < bili_limit_count:
             config.CRAWLER_MAX_NOTES_COUNT = bili_limit_count
-        for keyword in config.KEYWORDS.split(","):
+        start_page = self.start_page  # start page number
+        for keyword in self.keyword.split(","):
             utils.logger.info(f"[BilibiliCrawler.search] Current search keyword: {keyword}")
             page = 1
-            while page * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
+            while (page - start_page + 1) * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
+                if page < start_page:
+                    utils.logger.info(f"[BilibiliCrawler.search] Skip page: {page}")
+                    page += 1
+                    continue
+
                 video_id_list: List[str] = []
                 videos_res = await self.bili_client.search_video_by_keyword(
                     keyword=keyword,
diff --git a/media_platform/douyin/core.py b/media_platform/douyin/core.py
index 50c2a18..fb7936d 100644
--- a/media_platform/douyin/core.py
+++ b/media_platform/douyin/core.py
@@ -32,10 +32,12 @@ class DouYinCrawler(AbstractCrawler):
         self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"  # fixed
         self.index_url = "https://www.douyin.com"
 
-    def init_config(self, platform: str, login_type: str, crawler_type: str) -> None:
+    def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str) -> None:
         self.platform = platform
         self.login_type = login_type
         self.crawler_type = crawler_type
+        self.start_page = start_page
+        self.keyword = keyword
 
     async def start(self) -> None:
         playwright_proxy_format, httpx_proxy_format = None, None
@@ -84,11 +86,16 @@ class DouYinCrawler(AbstractCrawler):
         dy_limit_count = 10  # douyin limit page fixed value
         if config.CRAWLER_MAX_NOTES_COUNT < dy_limit_count:
             config.CRAWLER_MAX_NOTES_COUNT = dy_limit_count
-        for keyword in config.KEYWORDS.split(","):
+        start_page = self.start_page  # start page number
+        for keyword in self.keyword.split(","):
             utils.logger.info(f"[DouYinCrawler.search] Current keyword: {keyword}")
             aweme_list: List[str] = []
             page = 0
-            while (page + 1) * dy_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
+            while (page - start_page + 1) * dy_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
+                if page < start_page:
+                    utils.logger.info(f"[DouYinCrawler.search] Skip {page}")
+                    page += 1
+                    continue
                 try:
                     posts_res = await self.dy_client.search_info_by_keyword(keyword=keyword,
                                                                             offset=page * dy_limit_count,
diff --git a/media_platform/kuaishou/core.py b/media_platform/kuaishou/core.py
index 4eb7796..df39374 100644
--- a/media_platform/kuaishou/core.py
+++ b/media_platform/kuaishou/core.py
@@ -32,10 +32,12 @@ class KuaishouCrawler(AbstractCrawler):
         self.index_url = "https://www.kuaishou.com"
         self.user_agent = utils.get_user_agent()
 
-    def init_config(self, platform: str, login_type: str, crawler_type: str):
+    def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
         self.platform = platform
         self.login_type = login_type
         self.crawler_type = crawler_type
+        self.start_page = start_page
+        self.keyword = keyword
 
     async def start(self):
         playwright_proxy_format, httpx_proxy_format = None, None
@@ -88,10 +90,16 @@ class KuaishouCrawler(AbstractCrawler):
         ks_limit_count = 20  # kuaishou limit page fixed value
         if config.CRAWLER_MAX_NOTES_COUNT < ks_limit_count:
             config.CRAWLER_MAX_NOTES_COUNT = ks_limit_count
-        for keyword in config.KEYWORDS.split(","):
+        start_page = self.start_page
+        for keyword in self.keyword.split(","):
             utils.logger.info(f"[KuaishouCrawler.search] Current search keyword: {keyword}")
             page = 1
-            while page * ks_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
+            while (page - start_page + 1) * ks_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
+                if page < start_page:
+                    utils.logger.info(f"[KuaishouCrawler.search] Skip page: {page}")
+                    page += 1
+                    continue
+
                 video_id_list: List[str] = []
                 videos_res = await self.ks_client.search_info_by_keyword(
                     keyword=keyword,
diff --git a/media_platform/weibo/core.py b/media_platform/weibo/core.py
index 4395c8b..1f12ec7 100644
--- a/media_platform/weibo/core.py
+++ b/media_platform/weibo/core.py
@@ -40,10 +40,12 @@ class WeiboCrawler(AbstractCrawler):
         self.user_agent = utils.get_user_agent()
         self.mobile_user_agent = utils.get_mobile_user_agent()
 
-    def init_config(self, platform: str, login_type: str, crawler_type: str):
+    def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
         self.platform = platform
         self.login_type = login_type
         self.crawler_type = crawler_type
+        self.start_page = start_page
+        self.keyword = keyword
 
     async def start(self):
         playwright_proxy_format, httpx_proxy_format = None, None
@@ -106,10 +108,16 @@ class WeiboCrawler(AbstractCrawler):
         weibo_limit_count = 10  # weibo limit page fixed value
         if config.CRAWLER_MAX_NOTES_COUNT < weibo_limit_count:
             config.CRAWLER_MAX_NOTES_COUNT = weibo_limit_count
-        for keyword in config.KEYWORDS.split(","):
+        start_page = self.start_page
+        for keyword in self.keyword.split(","):
             utils.logger.info(f"[WeiboCrawler.search] Current search keyword: {keyword}")
             page = 1
-            while page * weibo_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
+            while (page - start_page + 1) * weibo_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
+                if page < start_page:
+                    utils.logger.info(f"[WeiboCrawler.search] Skip page: {page}")
+                    page += 1
+                    continue
+
                 search_res = await self.wb_client.get_note_by_keyword(
                     keyword=keyword,
                     page=page,
diff --git a/media_platform/xhs/core.py b/media_platform/xhs/core.py
index 5b67d6d..6947083 100644
--- a/media_platform/xhs/core.py
+++ b/media_platform/xhs/core.py
@@ -32,10 +32,12 @@ class XiaoHongShuCrawler(AbstractCrawler):
         self.index_url = "https://www.xiaohongshu.com"
         self.user_agent = utils.get_user_agent()
 
-    def init_config(self, platform: str, login_type: str, crawler_type: str) -> None:
+    def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str) -> None:
         self.platform = platform
         self.login_type = login_type
         self.crawler_type = crawler_type
+        self.start_page = start_page
+        self.keyword = keyword
 
     async def start(self) -> None:
         playwright_proxy_format, httpx_proxy_format = None, None
@@ -99,31 +101,41 @@ class XiaoHongShuCrawler(AbstractCrawler):
         xhs_limit_count = 20  # xhs limit page fixed value
         if config.CRAWLER_MAX_NOTES_COUNT < xhs_limit_count:
             config.CRAWLER_MAX_NOTES_COUNT = xhs_limit_count
-        for keyword in config.KEYWORDS.split(","):
+        start_page = self.start_page
+        for keyword in self.keyword.split(","):
             utils.logger.info(f"[XiaoHongShuCrawler.search] Current search keyword: {keyword}")
             page = 1
-            while page * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
-                note_id_list: List[str] = []
-                notes_res = await self.xhs_client.get_note_by_keyword(
-                    keyword=keyword,
-                    page=page,
-                    sort=SearchSortType(config.SORT_TYPE) if config.SORT_TYPE != '' else SearchSortType.GENERAL,
-                )
-                utils.logger.info(f"[XiaoHongShuCrawler.search] Search notes res:{notes_res}")
-                semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
-                task_list = [
-                    self.get_note_detail(post_item.get("id"), semaphore)
-                    for post_item in notes_res.get("items", {})
-                    if post_item.get('model_type') not in ('rec_query', 'hot_query')
-                ]
-                note_details = await asyncio.gather(*task_list)
-                for note_detail in note_details:
-                    if note_detail is not None:
-                        await xhs_store.update_xhs_note(note_detail)
-                        note_id_list.append(note_detail.get("note_id"))
-                page += 1
-                utils.logger.info(f"[XiaoHongShuCrawler.search] Note details: {note_details}")
-                await self.batch_get_note_comments(note_id_list)
+            while (page - start_page + 1) * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
+                if page < start_page:
+                    utils.logger.info(f"[XiaoHongShuCrawler.search] Skip page {page}")
+                    page += 1
+                    continue
+
+                try:
+                    note_id_list: List[str] = []
+                    notes_res = await self.xhs_client.get_note_by_keyword(
+                        keyword=keyword,
+                        page=page,
+                        sort=SearchSortType(config.SORT_TYPE) if config.SORT_TYPE != '' else SearchSortType.GENERAL,
+                    )
+                    utils.logger.info(f"[XiaoHongShuCrawler.search] Search notes res:{notes_res}")
+                    semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
+                    task_list = [
+                        self.get_note_detail(post_item.get("id"), semaphore)
+                        for post_item in notes_res.get("items", {})
+                        if post_item.get('model_type') not in ('rec_query', 'hot_query')
+                    ]
+                    note_details = await asyncio.gather(*task_list)
+                    for note_detail in note_details:
+                        if note_detail is not None:
+                            await xhs_store.update_xhs_note(note_detail)
+                            note_id_list.append(note_detail.get("note_id"))
+                    page += 1
+                    utils.logger.info(f"[XiaoHongShuCrawler.search] Note details: {note_details}")
+                    await self.batch_get_note_comments(note_id_list)
+                except DataFetchError:
+                    utils.logger.error("[XiaoHongShuCrawler.search] Get note detail error")
+                    break
 
     async def get_creators_and_notes(self) -> None:
         """Get creator's notes and retrieve their comment information."""
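
Example use of the new arguments (an illustrative invocation, not part of the patch; --platform, --lt and --type already exist in main.py, the platform and keyword values below are placeholders):

    python main.py --platform xhs --lt qrcode --type search --start 2 --keyword "python,playwright"

--start is passed to init_config() as start_page and defaults to config.START_PAGE; --keyword is passed as keyword and defaults to config.KEYWORDS. Each crawler skips pages below start_page and still bounds the pages it actually fetches by CRAWLER_MAX_NOTES_COUNT.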