Add a few command-line arguments

2024-06-12 10:17:37 +08:00 · 2024-06-12 10:17:37 +08:00 · 985ea93caf
parent 645ec729f6
commit 985ea93caf
3 changed files with 28 additions and 3 deletions
--- a/cmd_arg/arg.py
+++ b/cmd_arg/arg.py
@ -1,5 +1,6 @@
 import argparse
 import config
+from tools.utils import str2bool


 async def parse_cmd():
@ -15,6 +16,14 @@ async def parse_cmd():
                        help='number of start page', default=config.START_PAGE)
    parser.add_argument('--keywords', type=str,
                        help='please input keywords', default=config.KEYWORDS)
+    parser.add_argument('--get_comment', type=str2bool,
+                        help='''whether to crawl level one comment, supported values case insensitive ('yes', 'true', 't', 'y', '1', 'no', 'false', 'f', 'n', '0')''', default=config.ENABLE_GET_COMMENTS)
+    parser.add_argument('--get_sub_comment', type=str2bool,
+                        help=''''whether to crawl level two comment, supported values case insensitive ('yes', 'true', 't', 'y', '1', 'no', 'false', 'f', 'n', '0')''', default=config.ENABLE_GET_SUB_COMMENTS)
+    parser.add_argument('--save_data_option', type=str,
+                        help='where to save the data (csv or db or json)', choices=['csv', 'db', 'json'], default=config.SAVE_DATA_OPTION)
+    parser.add_argument('--cookies', type=str,
+                        help='cookies used for cookie login type', default=config.COOKIES)

    args = parser.parse_args()

@ -24,3 +33,7 @@ async def parse_cmd():
    config.CRAWLER_TYPE = args.type
    config.START_PAGE = args.start
    config.KEYWORDS = args.keywords
+    config.ENABLE_GET_COMMENTS = args.get_comment
+    config.ENABLE_GET_SUB_COMMENTS = args.get_sub_comment
+    config.SAVE_DATA_OPTION = args.save_data_option
+    config.COOKIES = args.cookies
--- a/config/base_config.py
+++ b/config/base_config.py
@ -3,7 +3,8 @@ PLATFORM = "xhs"
 KEYWORDS = "python,golang"
 LOGIN_TYPE = "qrcode"  # qrcode or phone or cookie
 COOKIES = ""
-SORT_TYPE = "popularity_descending"  # 具体值参见media_platform.xxx.field下的枚举值，展示只支持小红书
+# 具体值参见media_platform.xxx.field下的枚举值，暂时只支持小红书
+SORT_TYPE = "popularity_descending"
 CRAWLER_TYPE = "search"  # 爬取类型，search(关键词搜索) | detail(帖子详情)| creator(创作者主页数据)

 # 是否开启 IP 代理
@ -45,9 +46,9 @@ ENABLE_GET_IMAGES = False
 # 是否开启爬评论模式, 默认不开启爬评论
 ENABLE_GET_COMMENTS = False

-# 是否开启爬二级评论模式, 默认不开启爬二级评论, 目前仅支持 xhs
+# 是否开启爬二级评论模式, 默认不开启爬二级评论, 目前仅支持 xhs, bilibili
 # 老版本项目使用了 db, 则需参考 schema/tables.sql line 287 增加表字段
-ENABLE_GET_SUB_COMMENTS = True
+ENABLE_GET_SUB_COMMENTS = False

 # 指定小红书需要爬虫的笔记ID列表
 XHS_SPECIFIED_ID_LIST = [
--- a/tools/utils.py
+++ b/tools/utils.py
@ -1,3 +1,4 @@
+import argparse
 import logging

 from .crawler_util import *
@ -18,3 +19,13 @@ def init_loging_config():


 logger = init_loging_config()
+
+def str2bool(v):
+    """Convert a command-line style truth value to a ``bool``.
+
+    Intended for use as an ``argparse`` ``type=`` callable. Matching is
+    case-insensitive and ignores surrounding whitespace.
+
+    Args:
+        v: A ``bool`` (returned unchanged) or a value whose string form is
+            one of 'yes', 'true', 't', 'y', '1' (truthy) or
+            'no', 'false', 'f', 'n', '0' (falsy).
+
+    Returns:
+        The parsed boolean.
+
+    Raises:
+        argparse.ArgumentTypeError: If ``v`` is not a recognised value.
+    """
+    if isinstance(v, bool):
+        return v
+    # Normalise once; str() keeps non-string input (e.g. None) on the
+    # ArgumentTypeError path instead of failing with AttributeError.
+    text = str(v).strip().lower()
+    if text in ('yes', 'true', 't', 'y', '1'):
+        return True
+    if text in ('no', 'false', 'f', 'n', '0'):
+        return False
+    raise argparse.ArgumentTypeError(f'Boolean value expected, got {v!r}.')