diff --git a/cmd_arg/arg.py b/cmd_arg/arg.py index f977d9d..2d07675 100644 --- a/cmd_arg/arg.py +++ b/cmd_arg/arg.py @@ -1,5 +1,6 @@ import argparse import config +from tools.utils import str2bool async def parse_cmd(): @@ -15,6 +16,14 @@ async def parse_cmd(): help='number of start page', default=config.START_PAGE) parser.add_argument('--keywords', type=str, help='please input keywords', default=config.KEYWORDS) + parser.add_argument('--get_comment', type=str2bool, + help='''whether to crawl level one comment, supported values case insensitive ('yes', 'true', 't', 'y', '1', 'no', 'false', 'f', 'n', '0')''', default=config.ENABLE_GET_COMMENTS) + parser.add_argument('--get_sub_comment', type=str2bool, + help='''whether to crawl level two comment, supported values case insensitive ('yes', 'true', 't', 'y', '1', 'no', 'false', 'f', 'n', '0')''', default=config.ENABLE_GET_SUB_COMMENTS) + parser.add_argument('--save_data_option', type=str, + help='where to save the data (csv or db or json)', choices=['csv', 'db', 'json'], default=config.SAVE_DATA_OPTION) + parser.add_argument('--cookies', type=str, + help='cookies used for cookie login type', default=config.COOKIES) args = parser.parse_args() @@ -24,3 +33,7 @@ async def parse_cmd(): config.CRAWLER_TYPE = args.type config.START_PAGE = args.start config.KEYWORDS = args.keywords + config.ENABLE_GET_COMMENTS = args.get_comment + config.ENABLE_GET_SUB_COMMENTS = args.get_sub_comment + config.SAVE_DATA_OPTION = args.save_data_option + config.COOKIES = args.cookies diff --git a/config/base_config.py b/config/base_config.py index ee55a87..b1a592c 100644 --- a/config/base_config.py +++ b/config/base_config.py @@ -3,7 +3,8 @@ PLATFORM = "xhs" KEYWORDS = "python,golang" LOGIN_TYPE = "qrcode" # qrcode or phone or cookie COOKIES = "" -SORT_TYPE = "popularity_descending" # 具体值参见media_platform.xxx.field下的枚举值,展示只支持小红书 +# 具体值参见media_platform.xxx.field下的枚举值,暂时只支持小红书 +SORT_TYPE = "popularity_descending" CRAWLER_TYPE = "search" # 
爬取类型,search(关键词搜索) | detail(帖子详情)| creator(创作者主页数据) # 是否开启 IP 代理 @@ -45,9 +46,9 @@ ENABLE_GET_IMAGES = False # 是否开启爬评论模式, 默认不开启爬评论 ENABLE_GET_COMMENTS = False -# 是否开启爬二级评论模式, 默认不开启爬二级评论, 目前仅支持 xhs +# 是否开启爬二级评论模式, 默认不开启爬二级评论, 目前仅支持 xhs, bilibili # 老版本项目使用了 db, 则需参考 schema/tables.sql line 287 增加表字段 -ENABLE_GET_SUB_COMMENTS = True +ENABLE_GET_SUB_COMMENTS = False # 指定小红书需要爬虫的笔记ID列表 XHS_SPECIFIED_ID_LIST = [ diff --git a/tools/utils.py b/tools/utils.py index 7386fd8..572764c 100644 --- a/tools/utils.py +++ b/tools/utils.py @@ -1,3 +1,4 @@ +import argparse import logging from .crawler_util import * @@ -18,3 +19,13 @@ def init_loging_config(): logger = init_loging_config() + +def str2bool(v): + if isinstance(v, bool): + return v + if v.lower() in ('yes', 'true', 't', 'y', '1'): + return True + elif v.lower() in ('no', 'false', 'f', 'n', '0'): + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected.')