feat(core): 新增控制爬虫参数起始页面的页数start_page;perf(argparse): 向命令行解析器添加程序参数起始页面页数和关键字

2024-04-12 00:52:47 +08:00 · 2024-04-12 00:52:47 +08:00 · 1115b0d90c
parent bba9841c26
commit 1115b0d90c
9 changed files with 94 additions and 40 deletions
--- a/.gitignore
+++ b/.gitignore
@ -165,4 +165,6 @@ cython_debug/
 /temp_image/
 /browser_data/
 /data/
-/cache
+/cache
+# macOS Finder metadata; a slash-free pattern matches at every directory depth
+.DS_Store
--- a/base/base_crawler.py
+++ b/base/base_crawler.py
@ -6,7 +6,7 @@ from playwright.async_api import BrowserContext, BrowserType

 class AbstractCrawler(ABC):
    @abstractmethod
-    def init_config(self, platform: str, login_type: str, crawler_type: str):
+    def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):  # start_page / keyword are passed in by main() from the --start / --keyword options
        pass

    @abstractmethod
--- a/config/base_config.py
+++ b/config/base_config.py
@ -27,6 +27,9 @@ SAVE_DATA_OPTION = "json"  # csv or db or json
 # 用户浏览器缓存的浏览器文件配置
 USER_DATA_DIR = "%s_user_data_dir"  # %s will be replaced by platform name

+# Page number at which crawling starts; defaults to the first page
+START_PAGE = 1
+
 # 爬取视频/帖子的数量控制
 CRAWLER_MAX_NOTES_COUNT = 20

--- a/main.py
+++ b/main.py
@ -38,7 +38,11 @@ async def main():
                        choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE)
    parser.add_argument('--type', type=str, help='crawler type (search | detail | creator)',
                        choices=["search", "detail", "creator"], default=config.CRAWLER_TYPE)
-
+    parser.add_argument('--start', type=int, help='number of the page to start crawling from',
+                        default=config.START_PAGE)
+    parser.add_argument('--keyword', type=str, help='search keywords, separated by commas',
+                        default=config.KEYWORDS)
+
    # init db
    if config.SAVE_DATA_OPTION == "db":
        await db.init_db()
@ -48,7 +52,9 @@ async def main():
    crawler.init_config(
        platform=args.platform,
        login_type=args.lt,
-        crawler_type=args.type
+        crawler_type=args.type,
+        start_page=args.start,
+        keyword=args.keyword  # argparse stores --keyword under its full dest name; args.key does not exist
    )
    await crawler.start()
    
--- a/media_platform/bilibili/core.py
+++ b/media_platform/bilibili/core.py
@ -37,10 +37,12 @@ class BilibiliCrawler(AbstractCrawler):
        self.index_url = "https://www.bilibili.com"
        self.user_agent = utils.get_user_agent()

-    def init_config(self, platform: str, login_type: str, crawler_type: str):
+    def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
        self.platform = platform
        self.login_type = login_type
        self.crawler_type = crawler_type
+        self.start_page = start_page  # first result page to crawl (1-based)
+        self.keyword = keyword  # comma-separated search keywords

    async def start(self):
        playwright_proxy_format, httpx_proxy_format = None, None
@ -96,10 +98,16 @@ class BilibiliCrawler(AbstractCrawler):
        bili_limit_count =20  # bilibili limit page fixed value
        if config.CRAWLER_MAX_NOTES_COUNT < bili_limit_count:
            config.CRAWLER_MAX_NOTES_COUNT = bili_limit_count
-        for keyword in config.KEYWORDS.split(","):
+        start_page = max(self.start_page, 1)  # clamp: a start below 1 would shrink or empty the page window below
+        for keyword in self.keyword.split(","):
            utils.logger.info(f"[BilibiliCrawler.search] Current search keyword: {keyword}")
            page = 1
-            while page * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
+            while (page - start_page + 1) * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
+                if page < start_page:
+                    utils.logger.info(f"[BilibiliCrawler.search] Skip page: {page}")
+                    page += 1
+                    continue
+
                video_id_list: List[str] = []
                videos_res = await self.bili_client.search_video_by_keyword(
                    keyword=keyword,
--- a/media_platform/douyin/core.py
+++ b/media_platform/douyin/core.py
@ -32,10 +32,12 @@ class DouYinCrawler(AbstractCrawler):
        self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"  # fixed
        self.index_url = "https://www.douyin.com"

-    def init_config(self, platform: str, login_type: str, crawler_type: str) -> None:
+    def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str) -> None:
        self.platform = platform
        self.login_type = login_type
        self.crawler_type = crawler_type
+        self.start_page = start_page  # first result page to crawl (1-based)
+        self.keyword = keyword  # comma-separated search keywords

    async def start(self) -> None:
        playwright_proxy_format, httpx_proxy_format = None, None
@ -84,11 +86,16 @@ class DouYinCrawler(AbstractCrawler):
        dy_limit_count = 10  # douyin limit page fixed value
        if config.CRAWLER_MAX_NOTES_COUNT < dy_limit_count:
            config.CRAWLER_MAX_NOTES_COUNT = dy_limit_count
-        for keyword in config.KEYWORDS.split(","):
+        start_page = max(self.start_page, 1) - 1  # this loop is 0-based (page 0 = offset 0); without the shift the default start of 1 skips the first page
+        for keyword in self.keyword.split(","):
            utils.logger.info(f"[DouYinCrawler.search] Current keyword: {keyword}")
            aweme_list: List[str] = []
            page = 0
-            while (page + 1) * dy_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
+            while (page - start_page + 1) * dy_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
+                if page < start_page:
+                    utils.logger.info(f"[DouYinCrawler.search] Skip {page}")
+                    page += 1
+                    continue
                try:
                    posts_res = await self.dy_client.search_info_by_keyword(keyword=keyword,
                                                                            offset=page * dy_limit_count,
--- a/media_platform/kuaishou/core.py
+++ b/media_platform/kuaishou/core.py
@ -32,10 +32,12 @@ class KuaishouCrawler(AbstractCrawler):
        self.index_url = "https://www.kuaishou.com"
        self.user_agent = utils.get_user_agent()

-    def init_config(self, platform: str, login_type: str, crawler_type: str):
+    def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
        self.platform = platform
        self.login_type = login_type
        self.crawler_type = crawler_type
+        self.start_page = start_page  # first result page to crawl (1-based)
+        self.keyword = keyword  # comma-separated search keywords

    async def start(self):
        playwright_proxy_format, httpx_proxy_format = None, None
@ -88,10 +90,16 @@ class KuaishouCrawler(AbstractCrawler):
        ks_limit_count = 20  # kuaishou limit page fixed value
        if config.CRAWLER_MAX_NOTES_COUNT < ks_limit_count:
            config.CRAWLER_MAX_NOTES_COUNT = ks_limit_count
-        for keyword in config.KEYWORDS.split(","):
+        start_page = max(self.start_page, 1)  # clamp: a start below 1 would shrink or empty the page window below
+        for keyword in self.keyword.split(","):
            utils.logger.info(f"[KuaishouCrawler.search] Current search keyword: {keyword}")
            page = 1
-            while page * ks_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
+            while (page - start_page + 1) * ks_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
+                if page < start_page:
+                    utils.logger.info(f"[KuaishouCrawler.search] Skip page: {page}")
+                    page += 1
+                    continue
+
                video_id_list: List[str] = []
                videos_res = await self.ks_client.search_info_by_keyword(
                    keyword=keyword,
--- a/media_platform/weibo/core.py
+++ b/media_platform/weibo/core.py
@ -40,10 +40,12 @@ class WeiboCrawler(AbstractCrawler):
        self.user_agent = utils.get_user_agent()
        self.mobile_user_agent = utils.get_mobile_user_agent()

-    def init_config(self, platform: str, login_type: str, crawler_type: str):
+    def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
        self.platform = platform
        self.login_type = login_type
        self.crawler_type = crawler_type
+        self.start_page = start_page  # first result page to crawl (1-based)
+        self.keyword = keyword  # comma-separated search keywords

    async def start(self):
        playwright_proxy_format, httpx_proxy_format = None, None
@ -106,10 +108,16 @@ class WeiboCrawler(AbstractCrawler):
        weibo_limit_count = 10  # weibo limit page fixed value
        if config.CRAWLER_MAX_NOTES_COUNT < weibo_limit_count:
            config.CRAWLER_MAX_NOTES_COUNT = weibo_limit_count
-        for keyword in config.KEYWORDS.split(","):
+        start_page = max(self.start_page, 1)  # clamp: a start below 1 would shrink or empty the page window below
+        for keyword in self.keyword.split(","):
            utils.logger.info(f"[WeiboCrawler.search] Current search keyword: {keyword}")
            page = 1
-            while page * weibo_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
+            while (page - start_page + 1) * weibo_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
+                if page < start_page:
+                    utils.logger.info(f"[WeiboCrawler.search] Skip page: {page}")
+                    page += 1
+                    continue
+
                search_res = await self.wb_client.get_note_by_keyword(
                    keyword=keyword,
                    page=page,
--- a/media_platform/xhs/core.py
+++ b/media_platform/xhs/core.py
@ -32,10 +32,12 @@ class XiaoHongShuCrawler(AbstractCrawler):
        self.index_url = "https://www.xiaohongshu.com"
        self.user_agent = utils.get_user_agent()

-    def init_config(self, platform: str, login_type: str, crawler_type: str) -> None:
+    def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str) -> None:
        self.platform = platform
        self.login_type = login_type
        self.crawler_type = crawler_type
+        self.start_page = start_page  # first result page to crawl (1-based)
+        self.keyword = keyword  # comma-separated search keywords

    async def start(self) -> None:
        playwright_proxy_format, httpx_proxy_format = None, None
@ -99,31 +101,41 @@ class XiaoHongShuCrawler(AbstractCrawler):
        xhs_limit_count = 20  # xhs limit page fixed value
        if config.CRAWLER_MAX_NOTES_COUNT < xhs_limit_count:
            config.CRAWLER_MAX_NOTES_COUNT = xhs_limit_count
-        for keyword in config.KEYWORDS.split(","):
+        start_page = max(self.start_page, 1)  # clamp: a start below 1 would shrink or empty the page window below
+        for keyword in self.keyword.split(","):
            utils.logger.info(f"[XiaoHongShuCrawler.search] Current search keyword: {keyword}")
            page = 1
-            while page * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
-                note_id_list: List[str] = []
-                notes_res = await self.xhs_client.get_note_by_keyword(
-                    keyword=keyword,
-                    page=page,
-                    sort=SearchSortType(config.SORT_TYPE) if config.SORT_TYPE != '' else SearchSortType.GENERAL,
-                )
-                utils.logger.info(f"[XiaoHongShuCrawler.search] Search notes res:{notes_res}")
-                semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
-                task_list = [
-                    self.get_note_detail(post_item.get("id"), semaphore)
-                    for post_item in notes_res.get("items", {})
-                    if post_item.get('model_type') not in ('rec_query', 'hot_query')
-                ]
-                note_details = await asyncio.gather(*task_list)
-                for note_detail in note_details:
-                    if note_detail is not None:
-                        await xhs_store.update_xhs_note(note_detail)
-                        note_id_list.append(note_detail.get("note_id"))
-                page += 1
-                utils.logger.info(f"[XiaoHongShuCrawler.search] Note details: {note_details}")
-                await self.batch_get_note_comments(note_id_list)
+            while (page - start_page + 1) * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
+                if page < start_page:
+                    utils.logger.info(f"[XiaoHongShuCrawler.search] Skip page {page}")
+                    page += 1
+                    continue
+
+                try:
+                    note_id_list: List[str] = []
+                    notes_res = await self.xhs_client.get_note_by_keyword(
+                        keyword=keyword,
+                        page=page,
+                        sort=SearchSortType(config.SORT_TYPE) if config.SORT_TYPE != '' else SearchSortType.GENERAL,
+                    )
+                    utils.logger.info(f"[XiaoHongShuCrawler.search] Search notes res:{notes_res}")
+                    semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
+                    task_list = [
+                        self.get_note_detail(post_item.get("id"), semaphore)
+                        for post_item in notes_res.get("items", {})
+                        if post_item.get('model_type') not in ('rec_query', 'hot_query')
+                    ]
+                    note_details = await asyncio.gather(*task_list)
+                    for note_detail in note_details:
+                        if note_detail is not None:
+                            await xhs_store.update_xhs_note(note_detail)
+                            note_id_list.append(note_detail.get("note_id"))
+                    page += 1
+                    utils.logger.info(f"[XiaoHongShuCrawler.search] Note details: {note_details}")
+                    await self.batch_get_note_comments(note_id_list)
+                except DataFetchError:  # NOTE(review): assumes DataFetchError is already imported in this module - confirm
+                    utils.logger.error("[XiaoHongShuCrawler.search] Get note detail error")
+                    break  # stop paging this keyword; the outer loop moves on to the next keyword

    async def get_creators_and_notes(self) -> None:
        """Get creator's notes and retrieve their comment information."""