Merge branch 'main' into main
commit 131e68334d
@@ -166,4 +166,5 @@ cython_debug/
 /browser_data/
 /data/

 */.DS_Store
+.vscode
@@ -17,15 +17,17 @@

 ## Feature List

 > For the features marked as unsupported below, the code skeleton is already in place; you only need to implement the corresponding method. PRs are welcome.


 | Platform | Keyword search | Crawl by post ID | Second-level comments | Creator homepage | Login state cache | IP proxy pool | Comment word cloud |
 |-----|-------|----------|-----|--------|-------|-------|-------|
 | 小红书 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
 | 抖音 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
 | 快手 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ |
-| B 站 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ |
+| B 站 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
 | 微博 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ |



 ## Usage

 ### Create and activate a python virtual environment
@@ -5,10 +5,6 @@ from playwright.async_api import BrowserContext, BrowserType


 class AbstractCrawler(ABC):
-    @abstractmethod
-    def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
-        pass
-
     @abstractmethod
     async def start(self):
         pass
@@ -0,0 +1 @@
+from .arg import *
@@ -0,0 +1,39 @@
+import argparse
+import config
+from tools.utils import str2bool
+
+
+async def parse_cmd():
+    # read the command line arguments
+    parser = argparse.ArgumentParser(description='Media crawler program.')
+    parser.add_argument('--platform', type=str, help='Media platform select (xhs | dy | ks | bili | wb)',
+                        choices=["xhs", "dy", "ks", "bili", "wb"], default=config.PLATFORM)
+    parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)',
+                        choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE)
+    parser.add_argument('--type', type=str, help='crawler type (search | detail | creator)',
+                        choices=["search", "detail", "creator"], default=config.CRAWLER_TYPE)
+    parser.add_argument('--start', type=int,
+                        help='number of start page', default=config.START_PAGE)
+    parser.add_argument('--keywords', type=str,
+                        help='please input keywords', default=config.KEYWORDS)
+    parser.add_argument('--get_comment', type=str2bool,
+                        help='''whether to crawl level one comment, supported values case insensitive ('yes', 'true', 't', 'y', '1', 'no', 'false', 'f', 'n', '0')''', default=config.ENABLE_GET_COMMENTS)
+    parser.add_argument('--get_sub_comment', type=str2bool,
+                        help='''whether to crawl level two comment, supported values case insensitive ('yes', 'true', 't', 'y', '1', 'no', 'false', 'f', 'n', '0')''', default=config.ENABLE_GET_SUB_COMMENTS)
+    parser.add_argument('--save_data_option', type=str,
+                        help='where to save the data (csv or db or json)', choices=['csv', 'db', 'json'], default=config.SAVE_DATA_OPTION)
+    parser.add_argument('--cookies', type=str,
+                        help='cookies used for cookie login type', default=config.COOKIES)
+
+    args = parser.parse_args()
+
+    # override config
+    config.PLATFORM = args.platform
+    config.LOGIN_TYPE = args.lt
+    config.CRAWLER_TYPE = args.type
+    config.START_PAGE = args.start
+    config.KEYWORDS = args.keywords
+    config.ENABLE_GET_COMMENTS = args.get_comment
+    config.ENABLE_GET_SUB_COMMENTS = args.get_sub_comment
+    config.SAVE_DATA_OPTION = args.save_data_option
+    config.COOKIES = args.cookies
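The new parse_cmd() coroutine writes every parsed flag back onto the config module, which is why the crawler classes further down can drop their init_config() plumbing and simply read config.PLATFORM, config.KEYWORDS and friends. After this change a run would be started along the lines of `python main.py --platform bili --lt qrcode --type creator --get_sub_comment yes` (the flag values here are illustrative, not taken from the commit).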
@@ -3,7 +3,8 @@ PLATFORM = "xhs"
 KEYWORDS = "python,golang"
 LOGIN_TYPE = "qrcode"  # qrcode or phone or cookie
 COOKIES = ""
-SORT_TYPE = "popularity_descending"  # for possible values see the enums under media_platform.xxx.field; currently only supported for 小红书
+# for possible values see the enums under media_platform.xxx.field; currently only supported for 小红书
+SORT_TYPE = "popularity_descending"
 CRAWLER_TYPE = "search"  # crawl type: search (keyword search) | detail (post detail) | creator (creator homepage data)

 # whether to enable IP proxy

@@ -45,9 +46,9 @@ ENABLE_GET_IMAGES = False
 # whether to crawl comments; disabled by default
 ENABLE_GET_COMMENTS = False

-# whether to crawl second-level comments; disabled by default, currently only xhs is supported
+# whether to crawl second-level comments; disabled by default, currently only xhs and bilibili are supported
 # if an older deployment already uses the db, add the table column per schema/tables.sql line 287
-ENABLE_GET_SUB_COMMENTS = True
+ENABLE_GET_SUB_COMMENTS = False

 # list of 小红书 note IDs to crawl
 XHS_SPECIFIED_ID_LIST = [

@@ -96,6 +97,12 @@ DY_CREATOR_ID_LIST = [
     # ........................
 ]

+# list of bilibili creator IDs to crawl (sec_id)
+BILI_CREATOR_ID_LIST = [
+    "20813884",
+    # ........................
+]
+
 # word cloud settings
 # whether to generate a word cloud from comments
 ENABLE_GET_WORDCLOUD = False

@@ -110,4 +117,6 @@ CUSTOM_WORDS = {
 STOP_WORDS_FILE = "./docs/hit_stopwords.txt"

 # path to a Chinese font file
 FONT_PATH= "./docs/STZHONGS.TTF"
+
+
main.py
@@ -1,7 +1,7 @@
-import argparse
 import asyncio
 import sys

+import cmd_arg
 import config
 import db
 from base.base_crawler import AbstractCrawler

@@ -28,34 +28,15 @@ class CrawlerFactory:
            raise ValueError("Invalid Media Platform Currently only supported xhs or dy or ks or bili ...")
        return crawler_class()


 async def main():
-    # define command line params ...
-    parser = argparse.ArgumentParser(description='Media crawler program.')
-    parser.add_argument('--platform', type=str, help='Media platform select (xhs | dy | ks | bili | wb)',
-                        choices=["xhs", "dy", "ks", "bili", "wb"], default=config.PLATFORM)
-    parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)',
-                        choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE)
-    parser.add_argument('--type', type=str, help='crawler type (search | detail | creator)',
-                        choices=["search", "detail", "creator"], default=config.CRAWLER_TYPE)
-    parser.add_argument('--start', type=int, help='crawler type (number of start page)',
-                        default=config.START_PAGE)
-    parser.add_argument('--keywords', type=str, help='crawler type (please input keywords)',
-                        default=config.KEYWORDS)
-
+    # parse cmd
+    await cmd_arg.parse_cmd()
     # init db
     if config.SAVE_DATA_OPTION == "db":
         await db.init_db()

-    args = parser.parse_args()
-    crawler = CrawlerFactory.create_crawler(platform=args.platform)
-    crawler.init_config(
-        platform=args.platform,
-        login_type=args.lt,
-        crawler_type=args.type,
-        start_page=args.start,
-        keyword=args.keywords
-    )
+    crawler = CrawlerFactory.create_crawler(platform=config.PLATFORM)
     await crawler.start()

     if config.SAVE_DATA_OPTION == "db":
@@ -208,7 +208,6 @@ class BilibiliClient(AbstractApiClient):
            if not is_fetch_sub_comments:
                result.extend(comment_list)
                continue
-           # todo handle get sub comments
        return result

    async def get_video_all_level_two_comments(self,

@@ -230,15 +229,15 @@ class BilibiliClient(AbstractApiClient):
        :return:
        """

-       pn = 0
+       pn = 1
        while True:
            result = await self.get_video_level_two_comments(
-               video_id, level_one_comment_id, 0, ps, order_mode)
+               video_id, level_one_comment_id, pn, ps, order_mode)
            comment_list: List[Dict] = result.get("replies", [])
            if callback:  # invoke the callback if one was provided
                await callback(video_id, comment_list)
            await asyncio.sleep(crawl_interval)
-           if (int(result["page"]["count"]) <= (pn+1) * ps):
+           if (int(result["page"]["count"]) <= pn * ps):
                break

            pn += 1

@@ -268,3 +267,21 @@ class BilibiliClient(AbstractApiClient):
        }
        result = await self.get(uri, post_data)
        return result
+
+   async def get_creator_videos(self, creator_id: str, pn: int, ps: int = 30, order_mode: SearchOrderType = SearchOrderType.LAST_PUBLISH) -> Dict:
+       """get all videos for a creator
+       :param creator_id: creator ID
+       :param pn: page number
+       :param ps: number of videos per page
+       :param order_mode: sort order
+
+       :return:
+       """
+       uri = "/x/space/wbi/arc/search"
+       post_data = {
+           "mid": creator_id,
+           "pn": pn,
+           "ps": ps,
+           "order": order_mode,
+       }
+       return await self.get(uri, post_data)
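The small edits in get_video_all_level_two_comments are a paging fix: the old code initialised pn = 0 and then passed a literal 0 into every request, so it kept refetching the first page of replies. Starting pn at 1, passing pn through, and stopping once count <= pn * ps walks every page exactly once. A minimal sketch of the corrected loop shape, with a stand-in fetch coroutine rather than the project's client (names are illustrative):

    import asyncio

    async def fetch_all_pages(fetch, ps: int = 10, crawl_interval: float = 1.0):
        # fetch(pn, ps) is assumed to return {"replies": [...], "page": {"count": N}}
        pn = 1                                    # bilibili reply pages are 1-based
        items = []
        while True:
            result = await fetch(pn, ps)          # the old code passed a literal 0 here
            items.extend(result.get("replies") or [])
            if int(result["page"]["count"]) <= pn * ps:
                break                             # pages 1..pn already cover every reply
            await asyncio.sleep(crawl_interval)
            pn += 1
        return items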
@@ -26,9 +26,6 @@ from .login import BilibiliLogin


 class BilibiliCrawler(AbstractCrawler):
-    platform: str
-    login_type: str
-    crawler_type: str
     context_page: Page
     bili_client: BilibiliClient
     browser_context: BrowserContext

@@ -37,13 +34,6 @@ class BilibiliCrawler(AbstractCrawler):
        self.index_url = "https://www.bilibili.com"
        self.user_agent = utils.get_user_agent()

-   def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
-       self.platform = platform
-       self.login_type = login_type
-       self.crawler_type = crawler_type
-       self.start_page = start_page
-       self.keyword = keyword
-
    async def start(self):
        playwright_proxy_format, httpx_proxy_format = None, None
        if config.ENABLE_IP_PROXY:

@@ -70,7 +60,7 @@ class BilibiliCrawler(AbstractCrawler):
            self.bili_client = await self.create_bilibili_client(httpx_proxy_format)
            if not await self.bili_client.pong():
                login_obj = BilibiliLogin(
-                   login_type=self.login_type,
+                   login_type=config.LOGIN_TYPE,
                    login_phone="",  # your phone number
                    browser_context=self.browser_context,
                    context_page=self.context_page,

@@ -79,13 +69,16 @@ class BilibiliCrawler(AbstractCrawler):
                await login_obj.begin()
                await self.bili_client.update_cookies(browser_context=self.browser_context)

-           crawler_type_var.set(self.crawler_type)
-           if self.crawler_type == "search":
+           crawler_type_var.set(config.CRAWLER_TYPE)
+           if config.CRAWLER_TYPE == "search":
                # Search for video and retrieve their comment information.
                await self.search()
-           elif self.crawler_type == "detail":
+           elif config.CRAWLER_TYPE == "detail":
                # Get the information and comments of the specified post
-               await self.get_specified_videos()
+               await self.get_specified_videos(config.BILI_SPECIFIED_ID_LIST)
+           elif config.CRAWLER_TYPE == "creator":
+               for creator_id in config.BILI_CREATOR_ID_LIST:
+                   await self.get_creator_videos(int(creator_id))
            else:
                pass
            utils.logger.info(

@@ -101,8 +94,8 @@ class BilibiliCrawler(AbstractCrawler):
        bili_limit_count = 20  # bilibili limit page fixed value
        if config.CRAWLER_MAX_NOTES_COUNT < bili_limit_count:
            config.CRAWLER_MAX_NOTES_COUNT = bili_limit_count
-       start_page = self.start_page  # start page number
-       for keyword in self.keyword.split(","):
+       start_page = config.START_PAGE  # start page number
+       for keyword in config.KEYWORDS.split(","):
            utils.logger.info(
                f"[BilibiliCrawler.search] Current search keyword: {keyword}")
            page = 1

@@ -183,7 +176,25 @@ class BilibiliCrawler(AbstractCrawler):
            utils.logger.error(
                f"[BilibiliCrawler.get_comments] may be been blocked, err:{e}")

-   async def get_specified_videos(self):
+   async def get_creator_videos(self, creator_id: int):
+       """
+       get videos for a creator
+       :return:
+       """
+       ps = 30
+       pn = 1
+       video_bvids_list = []
+       while True:
+           result = await self.bili_client.get_creator_videos(creator_id, pn, ps)
+           for video in result["list"]["vlist"]:
+               video_bvids_list.append(video["bvid"])
+           if (int(result["page"]["count"]) <= pn * ps):
+               break
+           await asyncio.sleep(random.random())
+           pn += 1
+       await self.get_specified_videos(video_bvids_list)
+
+   async def get_specified_videos(self, bvids_list: List[str]):
        """
        get specified videos info
        :return:

@@ -191,7 +202,7 @@ class BilibiliCrawler(AbstractCrawler):
        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
        task_list = [
            self.get_video_info_task(aid=0, bvid=video_id, semaphore=semaphore) for video_id in
-           config.BILI_SPECIFIED_ID_LIST
+           bvids_list
        ]
        video_details = await asyncio.gather(*task_list)
        video_aids_list = []

@@ -271,7 +282,7 @@ class BilibiliCrawler(AbstractCrawler):
            # feat issue #14
            # we will save login state to avoid login every time
            user_data_dir = os.path.join(os.getcwd(), "browser_data",
-                                        config.USER_DATA_DIR % self.platform)  # type: ignore
+                                        config.USER_DATA_DIR % config.PLATFORM)  # type: ignore
            browser_context = await chromium.launch_persistent_context(
                user_data_dir=user_data_dir,
                accept_downloads=True,
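The creator flow added above is two-stage: get_creator_videos() pages through the client's /x/space/wbi/arc/search endpoint, collects every bvid it sees, and only then hands the full list to get_specified_videos(), which is why that method now takes a bvids_list argument instead of reading config.BILI_SPECIFIED_ID_LIST itself (the detail path now passes that config list in explicitly).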
@@ -13,6 +13,7 @@ from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
                       wait_fixed)

 from base.base_crawler import AbstractLogin
+import config
 from tools import utils


@@ -24,7 +25,7 @@ class BilibiliLogin(AbstractLogin):
                 login_phone: Optional[str] = "",
                 cookie_str: str = ""
                 ):
-       self.login_type = login_type
+       config.LOGIN_TYPE = login_type
        self.browser_context = browser_context
        self.context_page = context_page
        self.login_phone = login_phone

@@ -33,11 +34,11 @@ class BilibiliLogin(AbstractLogin):
    async def begin(self):
        """Start login bilibili"""
        utils.logger.info("[BilibiliLogin.begin] Begin login Bilibili ...")
-       if self.login_type == "qrcode":
+       if config.LOGIN_TYPE == "qrcode":
            await self.login_by_qrcode()
-       elif self.login_type == "phone":
+       elif config.LOGIN_TYPE == "phone":
            await self.login_by_mobile()
-       elif self.login_type == "cookie":
+       elif config.LOGIN_TYPE == "cookie":
            await self.login_by_cookies()
        else:
            raise ValueError(
@@ -21,27 +21,14 @@ from .login import DouYinLogin


 class DouYinCrawler(AbstractCrawler):
-    platform: str
-    login_type: str
-    crawler_type: str
     context_page: Page
     dy_client: DOUYINClient
     browser_context: BrowserContext
-    start_page: int
-    keyword: str

     def __init__(self) -> None:
-        self.start_page = None
         self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"  # fixed
         self.index_url = "https://www.douyin.com"

-    def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str) -> None:
-        self.platform = platform
-        self.login_type = login_type
-        self.crawler_type = crawler_type
-        self.start_page = start_page
-        self.keyword = keyword
-
     async def start(self) -> None:
         playwright_proxy_format, httpx_proxy_format = None, None
         if config.ENABLE_IP_PROXY:

@@ -66,7 +53,7 @@ class DouYinCrawler(AbstractCrawler):
            self.dy_client = await self.create_douyin_client(httpx_proxy_format)
            if not await self.dy_client.pong(browser_context=self.browser_context):
                login_obj = DouYinLogin(
-                   login_type=self.login_type,
+                   login_type=config.LOGIN_TYPE,
                    login_phone="",  # you phone number
                    browser_context=self.browser_context,
                    context_page=self.context_page,

@@ -74,14 +61,14 @@ class DouYinCrawler(AbstractCrawler):
                )
                await login_obj.begin()
                await self.dy_client.update_cookies(browser_context=self.browser_context)
-           crawler_type_var.set(self.crawler_type)
-           if self.crawler_type == "search":
+           crawler_type_var.set(config.CRAWLER_TYPE)
+           if config.CRAWLER_TYPE == "search":
                # Search for notes and retrieve their comment information.
                await self.search()
-           elif self.crawler_type == "detail":
+           elif config.CRAWLER_TYPE == "detail":
                # Get the information and comments of the specified post
                await self.get_specified_awemes()
-           elif self.crawler_type == "creator":
+           elif config.CRAWLER_TYPE == "creator":
                # Get the information and comments of the specified creator
                await self.get_creators_and_videos()

@@ -92,8 +79,8 @@ class DouYinCrawler(AbstractCrawler):
        dy_limit_count = 10  # douyin limit page fixed value
        if config.CRAWLER_MAX_NOTES_COUNT < dy_limit_count:
            config.CRAWLER_MAX_NOTES_COUNT = dy_limit_count
-       start_page = self.start_page  # start page number
-       for keyword in self.keyword.split(","):
+       start_page = config.START_PAGE  # start page number
+       for keyword in config.KEYWORDS.split(","):
            utils.logger.info(f"[DouYinCrawler.search] Current keyword: {keyword}")
            aweme_list: List[str] = []
            page = 0

@@ -259,7 +246,7 @@ class DouYinCrawler(AbstractCrawler):
        """Launch browser and create browser context"""
        if config.SAVE_LOGIN_STATE:
            user_data_dir = os.path.join(os.getcwd(), "browser_data",
-                                        config.USER_DATA_DIR % self.platform)  # type: ignore
+                                        config.USER_DATA_DIR % config.PLATFORM)  # type: ignore
            browser_context = await chromium.launch_persistent_context(
                user_data_dir=user_data_dir,
                accept_downloads=True,
@@ -23,7 +23,7 @@ class DouYinLogin(AbstractLogin):
                 login_phone: Optional[str] = "",
                 cookie_str: Optional[str] = ""
                 ):
-       self.login_type = login_type
+       config.LOGIN_TYPE = login_type
        self.browser_context = browser_context
        self.context_page = context_page
        self.login_phone = login_phone

@@ -40,11 +40,11 @@ class DouYinLogin(AbstractLogin):
        await self.popup_login_dialog()

        # select login type
-       if self.login_type == "qrcode":
+       if config.LOGIN_TYPE == "qrcode":
            await self.login_by_qrcode()
-       elif self.login_type == "phone":
+       elif config.LOGIN_TYPE == "phone":
            await self.login_by_mobile()
-       elif self.login_type == "cookie":
+       elif config.LOGIN_TYPE == "cookie":
            await self.login_by_cookies()
        else:
            raise ValueError("[DouYinLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")
@@ -21,9 +21,6 @@ from .login import KuaishouLogin


 class KuaishouCrawler(AbstractCrawler):
-    platform: str
-    login_type: str
-    crawler_type: str
     context_page: Page
     ks_client: KuaiShouClient
     browser_context: BrowserContext

@@ -32,13 +29,6 @@ class KuaishouCrawler(AbstractCrawler):
        self.index_url = "https://www.kuaishou.com"
        self.user_agent = utils.get_user_agent()

-   def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
-       self.platform = platform
-       self.login_type = login_type
-       self.crawler_type = crawler_type
-       self.start_page = start_page
-       self.keyword = keyword
-
    async def start(self):
        playwright_proxy_format, httpx_proxy_format = None, None
        if config.ENABLE_IP_PROXY:

@@ -64,7 +54,7 @@ class KuaishouCrawler(AbstractCrawler):
            self.ks_client = await self.create_ks_client(httpx_proxy_format)
            if not await self.ks_client.pong():
                login_obj = KuaishouLogin(
-                   login_type=self.login_type,
+                   login_type=config.LOGIN_TYPE,
                    login_phone=httpx_proxy_format,
                    browser_context=self.browser_context,
                    context_page=self.context_page,

@@ -73,11 +63,11 @@ class KuaishouCrawler(AbstractCrawler):
                await login_obj.begin()
                await self.ks_client.update_cookies(browser_context=self.browser_context)

-           crawler_type_var.set(self.crawler_type)
-           if self.crawler_type == "search":
+           crawler_type_var.set(config.CRAWLER_TYPE)
+           if config.CRAWLER_TYPE == "search":
                # Search for notes and retrieve their comment information.
                await self.search()
-           elif self.crawler_type == "detail":
+           elif config.CRAWLER_TYPE == "detail":
                # Get the information and comments of the specified post
                await self.get_specified_videos()
            else:

@@ -90,8 +80,8 @@ class KuaishouCrawler(AbstractCrawler):
        ks_limit_count = 20  # kuaishou limit page fixed value
        if config.CRAWLER_MAX_NOTES_COUNT < ks_limit_count:
            config.CRAWLER_MAX_NOTES_COUNT = ks_limit_count
-       start_page = self.start_page
-       for keyword in self.keyword.split(","):
+       start_page = config.START_PAGE
+       for keyword in config.KEYWORDS.split(","):
            utils.logger.info(f"[KuaishouCrawler.search] Current search keyword: {keyword}")
            page = 1
            while (page - start_page + 1) * ks_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:

@@ -238,7 +228,7 @@ class KuaishouCrawler(AbstractCrawler):
        utils.logger.info("[KuaishouCrawler.launch_browser] Begin create browser context ...")
        if config.SAVE_LOGIN_STATE:
            user_data_dir = os.path.join(os.getcwd(), "browser_data",
-                                        config.USER_DATA_DIR % self.platform)  # type: ignore
+                                        config.USER_DATA_DIR % config.PLATFORM)  # type: ignore
            browser_context = await chromium.launch_persistent_context(
                user_data_dir=user_data_dir,
                accept_downloads=True,

@@ -19,7 +19,7 @@ class KuaishouLogin(AbstractLogin):
                 login_phone: Optional[str] = "",
                 cookie_str: str = ""
                 ):
-       self.login_type = login_type
+       config.LOGIN_TYPE = login_type
        self.browser_context = browser_context
        self.context_page = context_page
        self.login_phone = login_phone

@@ -28,11 +28,11 @@ class KuaishouLogin(AbstractLogin):
    async def begin(self):
        """Start login xiaohongshu"""
        utils.logger.info("[KuaishouLogin.begin] Begin login kuaishou ...")
-       if self.login_type == "qrcode":
+       if config.LOGIN_TYPE == "qrcode":
            await self.login_by_qrcode()
-       elif self.login_type == "phone":
+       elif config.LOGIN_TYPE == "phone":
            await self.login_by_mobile()
-       elif self.login_type == "cookie":
+       elif config.LOGIN_TYPE == "cookie":
            await self.login_by_cookies()
        else:
            raise ValueError("[KuaishouLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")
@@ -28,9 +28,6 @@ from .login import WeiboLogin


 class WeiboCrawler(AbstractCrawler):
-    platform: str
-    login_type: str
-    crawler_type: str
     context_page: Page
     wb_client: WeiboClient
     browser_context: BrowserContext

@@ -41,13 +38,6 @@ class WeiboCrawler(AbstractCrawler):
        self.user_agent = utils.get_user_agent()
        self.mobile_user_agent = utils.get_mobile_user_agent()

-   def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
-       self.platform = platform
-       self.login_type = login_type
-       self.crawler_type = crawler_type
-       self.start_page = start_page
-       self.keyword = keyword
-
    async def start(self):
        playwright_proxy_format, httpx_proxy_format = None, None
        if config.ENABLE_IP_PROXY:

@@ -73,7 +63,7 @@ class WeiboCrawler(AbstractCrawler):
            self.wb_client = await self.create_weibo_client(httpx_proxy_format)
            if not await self.wb_client.pong():
                login_obj = WeiboLogin(
-                   login_type=self.login_type,
+                   login_type=config.LOGIN_TYPE,
                    login_phone="",  # your phone number
                    browser_context=self.browser_context,
                    context_page=self.context_page,

@@ -89,11 +79,11 @@ class WeiboCrawler(AbstractCrawler):
                await asyncio.sleep(2)
                await self.wb_client.update_cookies(browser_context=self.browser_context)

-           crawler_type_var.set(self.crawler_type)
-           if self.crawler_type == "search":
+           crawler_type_var.set(config.CRAWLER_TYPE)
+           if config.CRAWLER_TYPE == "search":
                # Search for video and retrieve their comment information.
                await self.search()
-           elif self.crawler_type == "detail":
+           elif config.CRAWLER_TYPE == "detail":
                # Get the information and comments of the specified post
                await self.get_specified_notes()
            else:

@@ -109,8 +99,8 @@ class WeiboCrawler(AbstractCrawler):
        weibo_limit_count = 10  # weibo limit page fixed value
        if config.CRAWLER_MAX_NOTES_COUNT < weibo_limit_count:
            config.CRAWLER_MAX_NOTES_COUNT = weibo_limit_count
-       start_page = self.start_page
-       for keyword in self.keyword.split(","):
+       start_page = config.START_PAGE
+       for keyword in config.KEYWORDS.split(","):
            utils.logger.info(f"[WeiboCrawler.search] Current search keyword: {keyword}")
            page = 1
            while (page - start_page + 1) * weibo_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:

@@ -274,7 +264,7 @@ class WeiboCrawler(AbstractCrawler):
        utils.logger.info("[WeiboCrawler.launch_browser] Begin create browser context ...")
        if config.SAVE_LOGIN_STATE:
            user_data_dir = os.path.join(os.getcwd(), "browser_data",
-                                        config.USER_DATA_DIR % self.platform)  # type: ignore
+                                        config.USER_DATA_DIR % config.PLATFORM)  # type: ignore
            browser_context = await chromium.launch_persistent_context(
                user_data_dir=user_data_dir,
                accept_downloads=True,

@@ -24,7 +24,7 @@ class WeiboLogin(AbstractLogin):
                 login_phone: Optional[str] = "",
                 cookie_str: str = ""
                 ):
-       self.login_type = login_type
+       config.LOGIN_TYPE = login_type
        self.browser_context = browser_context
        self.context_page = context_page
        self.login_phone = login_phone

@@ -33,11 +33,11 @@ class WeiboLogin(AbstractLogin):
    async def begin(self):
        """Start login weibo"""
        utils.logger.info("[WeiboLogin.begin] Begin login weibo ...")
-       if self.login_type == "qrcode":
+       if config.LOGIN_TYPE == "qrcode":
            await self.login_by_qrcode()
-       elif self.login_type == "phone":
+       elif config.LOGIN_TYPE == "phone":
            await self.login_by_mobile()
-       elif self.login_type == "cookie":
+       elif config.LOGIN_TYPE == "cookie":
            await self.login_by_cookies()
        else:
            raise ValueError(
@@ -21,9 +21,6 @@ from .login import XiaoHongShuLogin


 class XiaoHongShuCrawler(AbstractCrawler):
-    platform: str
-    login_type: str
-    crawler_type: str
     context_page: Page
     xhs_client: XiaoHongShuClient
     browser_context: BrowserContext

@@ -32,13 +29,6 @@ class XiaoHongShuCrawler(AbstractCrawler):
        self.index_url = "https://www.xiaohongshu.com"
        self.user_agent = utils.get_user_agent()

-   def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str) -> None:
-       self.platform = platform
-       self.login_type = login_type
-       self.crawler_type = crawler_type
-       self.start_page = start_page
-       self.keyword = keyword
-
    async def start(self) -> None:
        playwright_proxy_format, httpx_proxy_format = None, None
        if config.ENABLE_IP_PROXY:

@@ -71,7 +61,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
            self.xhs_client = await self.create_xhs_client(httpx_proxy_format)
            if not await self.xhs_client.pong():
                login_obj = XiaoHongShuLogin(
-                   login_type=self.login_type,
+                   login_type=config.LOGIN_TYPE,
                    login_phone="",  # input your phone number
                    browser_context=self.browser_context,
                    context_page=self.context_page,

@@ -80,14 +70,14 @@ class XiaoHongShuCrawler(AbstractCrawler):
                await login_obj.begin()
                await self.xhs_client.update_cookies(browser_context=self.browser_context)

-           crawler_type_var.set(self.crawler_type)
-           if self.crawler_type == "search":
+           crawler_type_var.set(config.CRAWLER_TYPE)
+           if config.CRAWLER_TYPE == "search":
                # Search for notes and retrieve their comment information.
                await self.search()
-           elif self.crawler_type == "detail":
+           elif config.CRAWLER_TYPE == "detail":
                # Get the information and comments of the specified post
                await self.get_specified_notes()
-           elif self.crawler_type == "creator":
+           elif config.CRAWLER_TYPE == "creator":
                # Get creator's information and their notes and comments
                await self.get_creators_and_notes()
            else:

@@ -101,8 +91,8 @@ class XiaoHongShuCrawler(AbstractCrawler):
        xhs_limit_count = 20  # xhs limit page fixed value
        if config.CRAWLER_MAX_NOTES_COUNT < xhs_limit_count:
            config.CRAWLER_MAX_NOTES_COUNT = xhs_limit_count
-       start_page = self.start_page
-       for keyword in self.keyword.split(","):
+       start_page = config.START_PAGE
+       for keyword in config.KEYWORDS.split(","):
            utils.logger.info(f"[XiaoHongShuCrawler.search] Current search keyword: {keyword}")
            page = 1
            while (page - start_page + 1) * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:

@@ -264,7 +254,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
            # feat issue #14
            # we will save login state to avoid login every time
            user_data_dir = os.path.join(os.getcwd(), "browser_data",
-                                        config.USER_DATA_DIR % self.platform)  # type: ignore
+                                        config.USER_DATA_DIR % config.PLATFORM)  # type: ignore
            browser_context = await chromium.launch_persistent_context(
                user_data_dir=user_data_dir,
                accept_downloads=True,

@@ -22,7 +22,7 @@ class XiaoHongShuLogin(AbstractLogin):
                 login_phone: Optional[str] = "",
                 cookie_str: str = ""
                 ):
-       self.login_type = login_type
+       config.LOGIN_TYPE = login_type
        self.browser_context = browser_context
        self.context_page = context_page
        self.login_phone = login_phone

@@ -49,11 +49,11 @@ class XiaoHongShuLogin(AbstractLogin):
    async def begin(self):
        """Start login xiaohongshu"""
        utils.logger.info("[XiaoHongShuLogin.begin] Begin login xiaohongshu ...")
-       if self.login_type == "qrcode":
+       if config.LOGIN_TYPE == "qrcode":
            await self.login_by_qrcode()
-       elif self.login_type == "phone":
+       elif config.LOGIN_TYPE == "phone":
            await self.login_by_mobile()
-       elif self.login_type == "cookie":
+       elif config.LOGIN_TYPE == "cookie":
            await self.login_by_cookies()
        else:
            raise ValueError("[XiaoHongShuLogin.begin]I nvalid Login Type Currently only supported qrcode or phone or cookies ...")
@@ -311,4 +311,7 @@ ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
 ALTER TABLE `douyin_aweme_comment`
 ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';

+ALTER TABLE `bilibili_video_comment`
+ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
+
 SET FOREIGN_KEY_CHECKS = 1;
@@ -1,3 +1,4 @@
+import argparse
 import logging

 from .crawler_util import *

@@ -18,3 +19,13 @@ def init_loging_config():


 logger = init_loging_config()
+
+def str2bool(v):
+    if isinstance(v, bool):
+        return v
+    if v.lower() in ('yes', 'true', 't', 'y', '1'):
+        return True
+    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
+        return False
+    else:
+        raise argparse.ArgumentTypeError('Boolean value expected.')
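str2bool is what lets the new boolean flags in cmd_arg/arg.py accept human-friendly spellings while rejecting anything else with a proper argparse error. A small usage sketch (the flag name mirrors arg.py; the snippet itself is illustrative, not part of the commit):

    import argparse
    from tools.utils import str2bool

    parser = argparse.ArgumentParser()
    # accepts yes/true/t/y/1 and no/false/f/n/0, case-insensitively
    parser.add_argument('--get_comment', type=str2bool, default=False)

    print(parser.parse_args(['--get_comment', 'YES']).get_comment)  # True
    print(parser.parse_args(['--get_comment', 'n']).get_comment)    # False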