Merge remote-tracking branch 'origin/main'

2024-06-13 12:21:15 +08:00 · 2024-06-13 12:21:15 +08:00 · 78b0d317ff
parent a24f5279ba 131e68334d
commit 78b0d317ff
20 changed files with 180 additions and 151 deletions
--- a/.gitignore
+++ b/.gitignore
@ -167,3 +167,4 @@ cython_debug/
 /data/

 */.DS_Store
+.vscode
--- a/README.md
+++ b/README.md
@ -17,15 +17,17 @@
 ## 功能列表
 > 下面不支持的项目，相关的代码架构已经搭建好，只需要实现对应的方法即可，欢迎大家提交PR

+
 | 平台  | 关键词搜索 | 指定帖子ID爬取 | 二级评论 | 指定创作者主页 | 登录态缓存 | IP代理池 | 生成评论词云图 |
 |-----|-------|----------|-----|--------|-------|-------|-------|
 | 小红书 | ✅     | ✅        | ✅   | ✅      | ✅     | ✅     | ✅    |
 | 抖音  | ✅     | ✅        | ✅    | ✅       | ✅     | ✅     | ✅    |
 | 快手  | ✅     | ✅        | ❌   | ❌      | ✅     | ✅     | ✅    |
-| B 站 | ✅     | ✅        | ✅   | ❌      | ✅     | ✅     | ✅    |
+| B 站 | ✅     | ✅        | ✅   | ✅      | ✅     | ✅     | ✅    |
 | 微博  | ✅     | ✅        | ❌   | ❌      | ✅     | ✅     | ✅    |


+
 ## 使用方法

 ### 创建并激活 python 虚拟环境
--- a/base/base_crawler.py
+++ b/base/base_crawler.py
@ -5,10 +5,6 @@ from playwright.async_api import BrowserContext, BrowserType


 class AbstractCrawler(ABC):
-    @abstractmethod
-    def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
-        pass
-
    @abstractmethod
    async def start(self):
        pass
--- a/cmd_arg/__init__.py
+++ b/cmd_arg/__init__.py
@ -0,0 +1 @@
+from .arg import *
--- a/cmd_arg/arg.py
+++ b/cmd_arg/arg.py
@ -0,0 +1,39 @@
+import argparse
+import config
+from tools.utils import str2bool
+
+
+async def parse_cmd():
+    # 读取command arg
+    parser = argparse.ArgumentParser(description='Media crawler program.')
+    parser.add_argument('--platform', type=str, help='Media platform select (xhs | dy | ks | bili | wb)',
+                        choices=["xhs", "dy", "ks", "bili", "wb"], default=config.PLATFORM)
+    parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)',
+                        choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE)
+    parser.add_argument('--type', type=str, help='crawler type (search | detail | creator)',
+                        choices=["search", "detail", "creator"], default=config.CRAWLER_TYPE)
+    parser.add_argument('--start', type=int,
+                        help='number of start page', default=config.START_PAGE)
+    parser.add_argument('--keywords', type=str,
+                        help='please input keywords', default=config.KEYWORDS)
+    parser.add_argument('--get_comment', type=str2bool,
+                        help='''whether to crawl level one comment, supported values case insensitive ('yes', 'true', 't', 'y', '1', 'no', 'false', 'f', 'n', '0')''', default=config.ENABLE_GET_COMMENTS)
+    parser.add_argument('--get_sub_comment', type=str2bool,
+                        help='''whether to crawl level two comment, supported values case insensitive ('yes', 'true', 't', 'y', '1', 'no', 'false', 'f', 'n', '0')''', default=config.ENABLE_GET_SUB_COMMENTS)
+    parser.add_argument('--save_data_option', type=str,
+                        help='where to save the data (csv or db or json)', choices=['csv', 'db', 'json'], default=config.SAVE_DATA_OPTION)
+    parser.add_argument('--cookies', type=str,
+                        help='cookies used for cookie login type', default=config.COOKIES)
+
+    args = parser.parse_args()
+
+    # override config
+    config.PLATFORM = args.platform
+    config.LOGIN_TYPE = args.lt
+    config.CRAWLER_TYPE = args.type
+    config.START_PAGE = args.start
+    config.KEYWORDS = args.keywords
+    config.ENABLE_GET_COMMENTS = args.get_comment
+    config.ENABLE_GET_SUB_COMMENTS = args.get_sub_comment
+    config.SAVE_DATA_OPTION = args.save_data_option
+    config.COOKIES = args.cookies
--- a/config/base_config.py
+++ b/config/base_config.py
@ -3,7 +3,8 @@ PLATFORM = "xhs"
 KEYWORDS = "python,golang"
 LOGIN_TYPE = "qrcode"  # qrcode or phone or cookie
 COOKIES = ""
-SORT_TYPE = "popularity_descending"  # 具体值参见media_platform.xxx.field下的枚举值，展示只支持小红书
+# 具体值参见media_platform.xxx.field下的枚举值，暂时只支持小红书
+SORT_TYPE = "popularity_descending"
 CRAWLER_TYPE = "search"  # 爬取类型，search(关键词搜索) | detail(帖子详情)| creator(创作者主页数据)

 # 是否开启 IP 代理
@ -45,9 +46,9 @@ ENABLE_GET_IMAGES = False
 # 是否开启爬评论模式, 默认不开启爬评论
 ENABLE_GET_COMMENTS = False

-# 是否开启爬二级评论模式, 默认不开启爬二级评论, 目前仅支持 xhs
+# 是否开启爬二级评论模式, 默认不开启爬二级评论, 目前仅支持 xhs, bilibili
 # 老版本项目使用了 db, 则需参考 schema/tables.sql line 287 增加表字段
-ENABLE_GET_SUB_COMMENTS = True
+ENABLE_GET_SUB_COMMENTS = False

 # 指定小红书需要爬虫的笔记ID列表
 XHS_SPECIFIED_ID_LIST = [
@ -96,6 +97,12 @@ DY_CREATOR_ID_LIST = [
    # ........................
 ]

+# 指定bili创作者ID列表(mid)
+BILI_CREATOR_ID_LIST = [
+    "20813884",
+    # ........................
+]
+
 #词云相关
 #是否开启生成评论词云图
 ENABLE_GET_WORDCLOUD = False
@ -111,3 +118,5 @@ STOP_WORDS_FILE = "./docs/hit_stopwords.txt"

 #中文字体文件路径
 FONT_PATH= "./docs/STZHONGS.TTF"
+
+
--- a/main.py
+++ b/main.py
@ -1,7 +1,7 @@
-import argparse
 import asyncio
 import sys

+import cmd_arg
 import config
 import db
 from base.base_crawler import AbstractCrawler
@ -28,34 +28,15 @@ class CrawlerFactory:
            raise ValueError("Invalid Media Platform Currently only supported xhs or dy or ks or bili ...")
        return crawler_class()

-
 async def main():
-    # define command line params ...
-    parser = argparse.ArgumentParser(description='Media crawler program.')
-    parser.add_argument('--platform', type=str, help='Media platform select (xhs | dy | ks | bili | wb)',
-                        choices=["xhs", "dy", "ks", "bili", "wb"], default=config.PLATFORM)
-    parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)',
-                        choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE)
-    parser.add_argument('--type', type=str, help='crawler type (search | detail | creator)',
-                        choices=["search", "detail", "creator"], default=config.CRAWLER_TYPE)
-    parser.add_argument('--start', type=int, help='crawler type (number of start page)',
-                         default=config.START_PAGE)
-    parser.add_argument('--keywords', type=str, help='crawler type (please input keywords)',
-                         default=config.KEYWORDS)
+    # parse cmd
+    await cmd_arg.parse_cmd()

    # init db
    if config.SAVE_DATA_OPTION == "db":
        await db.init_db()

-    args = parser.parse_args()
-    crawler = CrawlerFactory.create_crawler(platform=args.platform)
-    crawler.init_config(
-        platform=args.platform,
-        login_type=args.lt,
-        crawler_type=args.type,
-        start_page=args.start,
-        keyword=args.keywords
-    )
+    crawler = CrawlerFactory.create_crawler(platform=config.PLATFORM)
    await crawler.start()
    
    if config.SAVE_DATA_OPTION == "db":
--- a/media_platform/bilibili/client.py
+++ b/media_platform/bilibili/client.py
@ -208,7 +208,6 @@ class BilibiliClient(AbstractApiClient):
            if not is_fetch_sub_comments:
                result.extend(comment_list)
                continue
-            # todo handle get sub comments
        return result

    async def get_video_all_level_two_comments(self,
@ -230,15 +229,15 @@ class BilibiliClient(AbstractApiClient):
        :return:
        """

-        pn = 0
+        pn = 1
        while True:
            result = await self.get_video_level_two_comments(
-                video_id, level_one_comment_id, 0, ps, order_mode)
+                video_id, level_one_comment_id, pn, ps, order_mode)
            comment_list: List[Dict] = result.get("replies", [])
            if callback:  # 如果有回调函数，就执行回调函数
                await callback(video_id, comment_list)
            await asyncio.sleep(crawl_interval)
-            if (int(result["page"]["count"]) <= (pn+1) * ps):
+            if (int(result["page"]["count"]) <= pn * ps):
                break

            pn += 1
@ -268,3 +267,21 @@ class BilibiliClient(AbstractApiClient):
        }
        result = await self.get(uri, post_data)
        return result
+
+    async def get_creator_videos(self, creator_id: str, pn: int, ps: int = 30, order_mode: SearchOrderType = SearchOrderType.LAST_PUBLISH) -> Dict:
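+        """Fetch one page of a creator's videos.
+        :param creator_id: creator ID, sent as the "mid" query parameter
+        :param pn: page number to fetch
+        :param ps: number of videos per page
+        :param order_mode: sort order, sent as the "order" query parameter
+
+        :return: the result of self.get() for "/x/space/wbi/arc/search"
+        """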
+        uri = "/x/space/wbi/arc/search"
+        post_data = {
+            "mid": creator_id,
+            "pn": pn,
+            "ps": ps,
+            "order": order_mode,
+        }
+        return await self.get(uri, post_data)
--- a/media_platform/bilibili/core.py
+++ b/media_platform/bilibili/core.py
@ -26,9 +26,6 @@ from .login import BilibiliLogin


 class BilibiliCrawler(AbstractCrawler):
-    platform: str
-    login_type: str
-    crawler_type: str
    context_page: Page
    bili_client: BilibiliClient
    browser_context: BrowserContext
@ -37,13 +34,6 @@ class BilibiliCrawler(AbstractCrawler):
        self.index_url = "https://www.bilibili.com"
        self.user_agent = utils.get_user_agent()

-    def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
-        self.platform = platform
-        self.login_type = login_type
-        self.crawler_type = crawler_type
-        self.start_page = start_page
-        self.keyword = keyword
-
    async def start(self):
        playwright_proxy_format, httpx_proxy_format = None, None
        if config.ENABLE_IP_PROXY:
@ -70,7 +60,7 @@ class BilibiliCrawler(AbstractCrawler):
            self.bili_client = await self.create_bilibili_client(httpx_proxy_format)
            if not await self.bili_client.pong():
                login_obj = BilibiliLogin(
-                    login_type=self.login_type,
+                    login_type=config.LOGIN_TYPE,
                    login_phone="",  # your phone number
                    browser_context=self.browser_context,
                    context_page=self.context_page,
@ -79,13 +69,16 @@ class BilibiliCrawler(AbstractCrawler):
                await login_obj.begin()
                await self.bili_client.update_cookies(browser_context=self.browser_context)

-            crawler_type_var.set(self.crawler_type)
-            if self.crawler_type == "search":
+            crawler_type_var.set(config.CRAWLER_TYPE)
+            if config.CRAWLER_TYPE == "search":
                # Search for video and retrieve their comment information.
                await self.search()
-            elif self.crawler_type == "detail":
+            elif config.CRAWLER_TYPE == "detail":
                # Get the information and comments of the specified post
-                await self.get_specified_videos()
+                await self.get_specified_videos(config.BILI_SPECIFIED_ID_LIST)
+            elif config.CRAWLER_TYPE == "creator":
+                for creator_id in config.BILI_CREATOR_ID_LIST:
+                    await self.get_creator_videos(int(creator_id))
            else:
                pass
            utils.logger.info(
@ -101,8 +94,8 @@ class BilibiliCrawler(AbstractCrawler):
        bili_limit_count = 20  # bilibili limit page fixed value
        if config.CRAWLER_MAX_NOTES_COUNT < bili_limit_count:
            config.CRAWLER_MAX_NOTES_COUNT = bili_limit_count
-        start_page = self.start_page  # start page number
-        for keyword in self.keyword.split(","):
+        start_page = config.START_PAGE  # start page number
+        for keyword in config.KEYWORDS.split(","):
            utils.logger.info(
                f"[BilibiliCrawler.search] Current search keyword: {keyword}")
            page = 1
@ -183,7 +176,25 @@ class BilibiliCrawler(AbstractCrawler):
                utils.logger.error(
                    f"[BilibiliCrawler.get_comments] may be been blocked, err:{e}")

-    async def get_specified_videos(self):
+    async def get_creator_videos(self, creator_id: int):
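+        """
+        Page through a creator's video list, collect every bvid, then crawl them via get_specified_videos
+        :param creator_id: creator ID passed to bili_client.get_creator_videos
+        """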
+        ps = 30
+        pn = 1
+        video_bvids_list = []
+        while True:
+            result = await self.bili_client.get_creator_videos(creator_id, pn, ps)
+            for video in result["list"]["vlist"]:
+                video_bvids_list.append(video["bvid"])
+            if (int(result["page"]["count"]) <= pn * ps):
+                break
+            await asyncio.sleep(random.random())
+            pn += 1
+        await self.get_specified_videos(video_bvids_list)
+
+    async def get_specified_videos(self, bvids_list: List[str]):
        """
        get specified videos info
        :return:
@ -191,7 +202,7 @@ class BilibiliCrawler(AbstractCrawler):
        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
        task_list = [
            self.get_video_info_task(aid=0, bvid=video_id, semaphore=semaphore) for video_id in
-            config.BILI_SPECIFIED_ID_LIST
+            bvids_list
        ]
        video_details = await asyncio.gather(*task_list)
        video_aids_list = []
@ -271,7 +282,7 @@ class BilibiliCrawler(AbstractCrawler):
            # feat issue #14
            # we will save login state to avoid login every time
            user_data_dir = os.path.join(os.getcwd(), "browser_data",
-                                         config.USER_DATA_DIR % self.platform)  # type: ignore
+                                         config.USER_DATA_DIR % config.PLATFORM)  # type: ignore
            browser_context = await chromium.launch_persistent_context(
                user_data_dir=user_data_dir,
                accept_downloads=True,
--- a/media_platform/bilibili/login.py
+++ b/media_platform/bilibili/login.py
@ -13,6 +13,7 @@ from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
                      wait_fixed)

 from base.base_crawler import AbstractLogin
+import config
 from tools import utils


@ -24,7 +25,7 @@ class BilibiliLogin(AbstractLogin):
                 login_phone: Optional[str] = "",
                 cookie_str: str = ""
                 ):
-        self.login_type = login_type
+        config.LOGIN_TYPE = login_type
        self.browser_context = browser_context
        self.context_page = context_page
        self.login_phone = login_phone
@ -33,11 +34,11 @@ class BilibiliLogin(AbstractLogin):
    async def begin(self):
        """Start login bilibili"""
        utils.logger.info("[BilibiliLogin.begin] Begin login Bilibili ...")
-        if self.login_type == "qrcode":
+        if config.LOGIN_TYPE == "qrcode":
            await self.login_by_qrcode()
-        elif self.login_type == "phone":
+        elif config.LOGIN_TYPE == "phone":
            await self.login_by_mobile()
-        elif self.login_type == "cookie":
+        elif config.LOGIN_TYPE == "cookie":
            await self.login_by_cookies()
        else:
            raise ValueError(
--- a/media_platform/douyin/core.py
+++ b/media_platform/douyin/core.py
@ -21,27 +21,14 @@ from .login import DouYinLogin


 class DouYinCrawler(AbstractCrawler):
-    platform: str
-    login_type: str
-    crawler_type: str
    context_page: Page
    dy_client: DOUYINClient
    browser_context: BrowserContext
-    start_page: int
-    keyword: str

    def __init__(self) -> None:
-        self.start_page = None
        self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"  # fixed
        self.index_url = "https://www.douyin.com"

-    def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str) -> None:
-        self.platform = platform
-        self.login_type = login_type
-        self.crawler_type = crawler_type
-        self.start_page = start_page
-        self.keyword = keyword
-
    async def start(self) -> None:
        playwright_proxy_format, httpx_proxy_format = None, None
        if config.ENABLE_IP_PROXY:
@ -66,7 +53,7 @@ class DouYinCrawler(AbstractCrawler):
            self.dy_client = await self.create_douyin_client(httpx_proxy_format)
            if not await self.dy_client.pong(browser_context=self.browser_context):
                login_obj = DouYinLogin(
-                    login_type=self.login_type,
+                    login_type=config.LOGIN_TYPE,
                    login_phone="",  # you phone number
                    browser_context=self.browser_context,
                    context_page=self.context_page,
@ -74,14 +61,14 @@ class DouYinCrawler(AbstractCrawler):
                )
                await login_obj.begin()
                await self.dy_client.update_cookies(browser_context=self.browser_context)
-            crawler_type_var.set(self.crawler_type)
-            if self.crawler_type == "search":
+            crawler_type_var.set(config.CRAWLER_TYPE)
+            if config.CRAWLER_TYPE == "search":
                # Search for notes and retrieve their comment information.
                await self.search()
-            elif self.crawler_type == "detail":
+            elif config.CRAWLER_TYPE == "detail":
                # Get the information and comments of the specified post
                await self.get_specified_awemes()
-            elif self.crawler_type == "creator":
+            elif config.CRAWLER_TYPE == "creator":
                # Get the information and comments of the specified creator
                await self.get_creators_and_videos()

@ -92,8 +79,8 @@ class DouYinCrawler(AbstractCrawler):
        dy_limit_count = 10  # douyin limit page fixed value
        if config.CRAWLER_MAX_NOTES_COUNT < dy_limit_count:
            config.CRAWLER_MAX_NOTES_COUNT = dy_limit_count
-        start_page = self.start_page  # start page number
-        for keyword in self.keyword.split(","):
+        start_page = config.START_PAGE  # start page number
+        for keyword in config.KEYWORDS.split(","):
            utils.logger.info(f"[DouYinCrawler.search] Current keyword: {keyword}")
            aweme_list: List[str] = []
            page = 0
@ -259,7 +246,7 @@ class DouYinCrawler(AbstractCrawler):
        """Launch browser and create browser context"""
        if config.SAVE_LOGIN_STATE:
            user_data_dir = os.path.join(os.getcwd(), "browser_data",
-                                         config.USER_DATA_DIR % self.platform)  # type: ignore
+                                         config.USER_DATA_DIR % config.PLATFORM)  # type: ignore
            browser_context = await chromium.launch_persistent_context(
                user_data_dir=user_data_dir,
                accept_downloads=True,
--- a/media_platform/douyin/login.py
+++ b/media_platform/douyin/login.py
@ -23,7 +23,7 @@ class DouYinLogin(AbstractLogin):
                 login_phone: Optional[str] = "",
                 cookie_str: Optional[str] = ""
                 ):
-        self.login_type = login_type
+        config.LOGIN_TYPE = login_type
        self.browser_context = browser_context
        self.context_page = context_page
        self.login_phone = login_phone
@ -40,11 +40,11 @@ class DouYinLogin(AbstractLogin):
        await self.popup_login_dialog()

        # select login type
-        if self.login_type == "qrcode":
+        if config.LOGIN_TYPE == "qrcode":
            await self.login_by_qrcode()
-        elif self.login_type == "phone":
+        elif config.LOGIN_TYPE == "phone":
            await self.login_by_mobile()
-        elif self.login_type == "cookie":
+        elif config.LOGIN_TYPE == "cookie":
            await self.login_by_cookies()
        else:
            raise ValueError("[DouYinLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")
--- a/media_platform/kuaishou/core.py
+++ b/media_platform/kuaishou/core.py
@ -21,9 +21,6 @@ from .login import KuaishouLogin


 class KuaishouCrawler(AbstractCrawler):
-    platform: str
-    login_type: str
-    crawler_type: str
    context_page: Page
    ks_client: KuaiShouClient
    browser_context: BrowserContext
@ -32,13 +29,6 @@ class KuaishouCrawler(AbstractCrawler):
        self.index_url = "https://www.kuaishou.com"
        self.user_agent = utils.get_user_agent()

-    def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
-        self.platform = platform
-        self.login_type = login_type
-        self.crawler_type = crawler_type
-        self.start_page = start_page
-        self.keyword = keyword
-
    async def start(self):
        playwright_proxy_format, httpx_proxy_format = None, None
        if config.ENABLE_IP_PROXY:
@ -64,7 +54,7 @@ class KuaishouCrawler(AbstractCrawler):
            self.ks_client = await self.create_ks_client(httpx_proxy_format)
            if not await self.ks_client.pong():
                login_obj = KuaishouLogin(
-                    login_type=self.login_type,
+                    login_type=config.LOGIN_TYPE,
                    login_phone=httpx_proxy_format,
                    browser_context=self.browser_context,
                    context_page=self.context_page,
@ -73,11 +63,11 @@ class KuaishouCrawler(AbstractCrawler):
                await login_obj.begin()
                await self.ks_client.update_cookies(browser_context=self.browser_context)

-            crawler_type_var.set(self.crawler_type)
-            if self.crawler_type == "search":
+            crawler_type_var.set(config.CRAWLER_TYPE)
+            if config.CRAWLER_TYPE == "search":
                # Search for notes and retrieve their comment information.
                await self.search()
-            elif self.crawler_type == "detail":
+            elif config.CRAWLER_TYPE == "detail":
                # Get the information and comments of the specified post
                await self.get_specified_videos()
            else:
@ -90,8 +80,8 @@ class KuaishouCrawler(AbstractCrawler):
        ks_limit_count = 20  # kuaishou limit page fixed value
        if config.CRAWLER_MAX_NOTES_COUNT < ks_limit_count:
            config.CRAWLER_MAX_NOTES_COUNT = ks_limit_count
-        start_page = self.start_page
-        for keyword in self.keyword.split(","):
+        start_page = config.START_PAGE
+        for keyword in config.KEYWORDS.split(","):
            utils.logger.info(f"[KuaishouCrawler.search] Current search keyword: {keyword}")
            page = 1
            while (page - start_page + 1) * ks_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
@ -238,7 +228,7 @@ class KuaishouCrawler(AbstractCrawler):
        utils.logger.info("[KuaishouCrawler.launch_browser] Begin create browser context ...")
        if config.SAVE_LOGIN_STATE:
            user_data_dir = os.path.join(os.getcwd(), "browser_data",
-                                         config.USER_DATA_DIR % self.platform)  # type: ignore
+                                         config.USER_DATA_DIR % config.PLATFORM)  # type: ignore
            browser_context = await chromium.launch_persistent_context(
                user_data_dir=user_data_dir,
                accept_downloads=True,
--- a/media_platform/kuaishou/login.py
+++ b/media_platform/kuaishou/login.py
@ -19,7 +19,7 @@ class KuaishouLogin(AbstractLogin):
                 login_phone: Optional[str] = "",
                 cookie_str: str = ""
                 ):
-        self.login_type = login_type
+        config.LOGIN_TYPE = login_type
        self.browser_context = browser_context
        self.context_page = context_page
        self.login_phone = login_phone
@ -28,11 +28,11 @@ class KuaishouLogin(AbstractLogin):
    async def begin(self):
        """Start login xiaohongshu"""
        utils.logger.info("[KuaishouLogin.begin] Begin login kuaishou ...")
-        if self.login_type == "qrcode":
+        if config.LOGIN_TYPE == "qrcode":
            await self.login_by_qrcode()
-        elif self.login_type == "phone":
+        elif config.LOGIN_TYPE == "phone":
            await self.login_by_mobile()
-        elif self.login_type == "cookie":
+        elif config.LOGIN_TYPE == "cookie":
            await self.login_by_cookies()
        else:
            raise ValueError("[KuaishouLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")
--- a/media_platform/weibo/core.py
+++ b/media_platform/weibo/core.py
@ -28,9 +28,6 @@ from .login import WeiboLogin


 class WeiboCrawler(AbstractCrawler):
-    platform: str
-    login_type: str
-    crawler_type: str
    context_page: Page
    wb_client: WeiboClient
    browser_context: BrowserContext
@ -41,13 +38,6 @@ class WeiboCrawler(AbstractCrawler):
        self.user_agent = utils.get_user_agent()
        self.mobile_user_agent = utils.get_mobile_user_agent()

-    def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
-        self.platform = platform
-        self.login_type = login_type
-        self.crawler_type = crawler_type
-        self.start_page = start_page
-        self.keyword = keyword
-
    async def start(self):
        playwright_proxy_format, httpx_proxy_format = None, None
        if config.ENABLE_IP_PROXY:
@ -73,7 +63,7 @@ class WeiboCrawler(AbstractCrawler):
            self.wb_client = await self.create_weibo_client(httpx_proxy_format)
            if not await self.wb_client.pong():
                login_obj = WeiboLogin(
-                    login_type=self.login_type,
+                    login_type=config.LOGIN_TYPE,
                    login_phone="",  # your phone number
                    browser_context=self.browser_context,
                    context_page=self.context_page,
@ -89,11 +79,11 @@ class WeiboCrawler(AbstractCrawler):
                await asyncio.sleep(2)
                await self.wb_client.update_cookies(browser_context=self.browser_context)

-            crawler_type_var.set(self.crawler_type)
-            if self.crawler_type == "search":
+            crawler_type_var.set(config.CRAWLER_TYPE)
+            if config.CRAWLER_TYPE == "search":
                # Search for video and retrieve their comment information.
                await self.search()
-            elif self.crawler_type == "detail":
+            elif config.CRAWLER_TYPE == "detail":
                # Get the information and comments of the specified post
                await self.get_specified_notes()
            else:
@ -109,8 +99,8 @@ class WeiboCrawler(AbstractCrawler):
        weibo_limit_count = 10  # weibo limit page fixed value
        if config.CRAWLER_MAX_NOTES_COUNT < weibo_limit_count:
            config.CRAWLER_MAX_NOTES_COUNT = weibo_limit_count
-        start_page = self.start_page
-        for keyword in self.keyword.split(","):
+        start_page = config.START_PAGE
+        for keyword in config.KEYWORDS.split(","):
            utils.logger.info(f"[WeiboCrawler.search] Current search keyword: {keyword}")
            page = 1
            while (page - start_page + 1) * weibo_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
@ -274,7 +264,7 @@ class WeiboCrawler(AbstractCrawler):
        utils.logger.info("[WeiboCrawler.launch_browser] Begin create browser context ...")
        if config.SAVE_LOGIN_STATE:
            user_data_dir = os.path.join(os.getcwd(), "browser_data",
-                                         config.USER_DATA_DIR % self.platform)  # type: ignore
+                                         config.USER_DATA_DIR % config.PLATFORM)  # type: ignore
            browser_context = await chromium.launch_persistent_context(
                user_data_dir=user_data_dir,
                accept_downloads=True,
--- a/media_platform/weibo/login.py
+++ b/media_platform/weibo/login.py
@ -24,7 +24,7 @@ class WeiboLogin(AbstractLogin):
                 login_phone: Optional[str] = "",
                 cookie_str: str = ""
                 ):
-        self.login_type = login_type
+        config.LOGIN_TYPE = login_type
        self.browser_context = browser_context
        self.context_page = context_page
        self.login_phone = login_phone
@ -33,11 +33,11 @@ class WeiboLogin(AbstractLogin):
    async def begin(self):
        """Start login weibo"""
        utils.logger.info("[WeiboLogin.begin] Begin login weibo ...")
-        if self.login_type == "qrcode":
+        if config.LOGIN_TYPE == "qrcode":
            await self.login_by_qrcode()
-        elif self.login_type == "phone":
+        elif config.LOGIN_TYPE == "phone":
            await self.login_by_mobile()
-        elif self.login_type == "cookie":
+        elif config.LOGIN_TYPE == "cookie":
            await self.login_by_cookies()
        else:
            raise ValueError(
--- a/media_platform/xhs/core.py
+++ b/media_platform/xhs/core.py
@ -21,9 +21,6 @@ from .login import XiaoHongShuLogin


 class XiaoHongShuCrawler(AbstractCrawler):
-    platform: str
-    login_type: str
-    crawler_type: str
    context_page: Page
    xhs_client: XiaoHongShuClient
    browser_context: BrowserContext
@ -32,13 +29,6 @@ class XiaoHongShuCrawler(AbstractCrawler):
        self.index_url = "https://www.xiaohongshu.com"
        self.user_agent = utils.get_user_agent()

-    def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str) -> None:
-        self.platform = platform
-        self.login_type = login_type
-        self.crawler_type = crawler_type
-        self.start_page = start_page
-        self.keyword = keyword
-
    async def start(self) -> None:
        playwright_proxy_format, httpx_proxy_format = None, None
        if config.ENABLE_IP_PROXY:
@ -71,7 +61,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
            self.xhs_client = await self.create_xhs_client(httpx_proxy_format)
            if not await self.xhs_client.pong():
                login_obj = XiaoHongShuLogin(
-                    login_type=self.login_type,
+                    login_type=config.LOGIN_TYPE,
                    login_phone="",  # input your phone number
                    browser_context=self.browser_context,
                    context_page=self.context_page,
@ -80,14 +70,14 @@ class XiaoHongShuCrawler(AbstractCrawler):
                await login_obj.begin()
                await self.xhs_client.update_cookies(browser_context=self.browser_context)

-            crawler_type_var.set(self.crawler_type)
-            if self.crawler_type == "search":
+            crawler_type_var.set(config.CRAWLER_TYPE)
+            if config.CRAWLER_TYPE == "search":
                # Search for notes and retrieve their comment information.
                await self.search()
-            elif self.crawler_type == "detail":
+            elif config.CRAWLER_TYPE == "detail":
                # Get the information and comments of the specified post
                await self.get_specified_notes()
-            elif self.crawler_type == "creator":
+            elif config.CRAWLER_TYPE == "creator":
                # Get creator's information and their notes and comments
                await self.get_creators_and_notes()
            else:
@ -101,8 +91,8 @@ class XiaoHongShuCrawler(AbstractCrawler):
        xhs_limit_count = 20  # xhs limit page fixed value
        if config.CRAWLER_MAX_NOTES_COUNT < xhs_limit_count:
            config.CRAWLER_MAX_NOTES_COUNT = xhs_limit_count
-        start_page = self.start_page
-        for keyword in self.keyword.split(","):
+        start_page = config.START_PAGE
+        for keyword in config.KEYWORDS.split(","):
            utils.logger.info(f"[XiaoHongShuCrawler.search] Current search keyword: {keyword}")
            page = 1
            while (page - start_page + 1) * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
@ -264,7 +254,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
            # feat issue #14
            # we will save login state to avoid login every time
            user_data_dir = os.path.join(os.getcwd(), "browser_data",
-                                         config.USER_DATA_DIR % self.platform)  # type: ignore
+                                         config.USER_DATA_DIR % config.PLATFORM)  # type: ignore
            browser_context = await chromium.launch_persistent_context(
                user_data_dir=user_data_dir,
                accept_downloads=True,
--- a/media_platform/xhs/login.py
+++ b/media_platform/xhs/login.py
@ -22,7 +22,7 @@ class XiaoHongShuLogin(AbstractLogin):
                 login_phone: Optional[str] = "",
                 cookie_str: str = ""
                 ):
-        self.login_type = login_type
+        config.LOGIN_TYPE = login_type
        self.browser_context = browser_context
        self.context_page = context_page
        self.login_phone = login_phone
@ -49,11 +49,11 @@ class XiaoHongShuLogin(AbstractLogin):
    async def begin(self):
        """Start login xiaohongshu"""
        utils.logger.info("[XiaoHongShuLogin.begin] Begin login xiaohongshu ...")
-        if self.login_type == "qrcode":
+        if config.LOGIN_TYPE == "qrcode":
            await self.login_by_qrcode()
-        elif self.login_type == "phone":
+        elif config.LOGIN_TYPE == "phone":
            await self.login_by_mobile()
-        elif self.login_type == "cookie":
+        elif config.LOGIN_TYPE == "cookie":
            await self.login_by_cookies()
        else:
            raise ValueError("[XiaoHongShuLogin.begin]I nvalid Login Type Currently only supported qrcode or phone or cookies ...")
--- a/schema/tables.sql
+++ b/schema/tables.sql
@ -311,4 +311,7 @@ ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
 ALTER TABLE `douyin_aweme_comment`
 ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';

+ALTER TABLE `bilibili_video_comment`
+ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
+
 SET FOREIGN_KEY_CHECKS = 1;
--- a/tools/utils.py
+++ b/tools/utils.py
@ -1,3 +1,4 @@
+import argparse
 import logging

 from .crawler_util import *
@ -18,3 +19,13 @@ def init_loging_config():


 logger = init_loging_config()
+
+def str2bool(v):
+    if isinstance(v, bool):
+        return v
+    if v.lower() in ('yes', 'true', 't', 'y', '1'):
+        return True
+    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
+        return False
+    else:
+        raise argparse.ArgumentTypeError('Boolean value expected.')