Improve how command-line arguments are read and applied to the base config

This commit is contained in:
nelzomal 2024-06-09 09:35:52 +08:00
parent 3c7c678d7a
commit eace7d1750
15 changed files with 91 additions and 139 deletions

1
.gitignore vendored
View File

@ -167,3 +167,4 @@ cython_debug/
/data/
*/.DS_Store
.vscode

View File

@ -5,10 +5,6 @@ from playwright.async_api import BrowserContext, BrowserType
class AbstractCrawler(ABC):
@abstractmethod
def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
pass
@abstractmethod
async def start(self):
pass

1
cmd_arg/__init__.py Normal file
View File

@ -0,0 +1 @@
from .arg import *

25
cmd_arg/arg.py Normal file
View File

@ -0,0 +1,25 @@
import argparse
import config
async def parse_cmd():
    """Parse command-line arguments and store them in the ``config`` module.

    Each option defaults to the value currently held in ``config``, so an
    option that is omitted on the command line leaves the corresponding
    setting unchanged instead of overwriting it with ``None``.

    The function awaits nothing; it remains a coroutine only so that callers
    which ``await`` it keep working.

    Raises:
        SystemExit: raised by ``argparse`` on invalid arguments or ``--help``.
    """
    # Read the command-line arguments.
    parser = argparse.ArgumentParser(description='Media crawler program.')
    parser.add_argument('--platform', type=str,
                        help='Media platform select (xhs | dy | ks | bili | wb)',
                        choices=["xhs", "dy", "ks", "bili", "wb"],
                        default=config.PLATFORM)
    parser.add_argument('--lt', type=str,
                        help='Login type (qrcode | phone | cookie)',
                        choices=["qrcode", "phone", "cookie"],
                        default=config.LOGIN_TYPE)
    parser.add_argument('--type', type=str,
                        help='crawler type (search | detail | creator)',
                        choices=["search", "detail", "creator"],
                        default=config.CRAWLER_TYPE)
    parser.add_argument('--start', type=int,
                        help='number of start page',
                        default=config.START_PAGE)
    parser.add_argument('--keywords', type=str,
                        help='please input keywords',
                        default=config.KEYWORDS)

    args = parser.parse_args()

    # Override config. Because of the defaults above, an option that was not
    # supplied writes back the value config already had.
    config.PLATFORM = args.platform
    config.LOGIN_TYPE = args.lt
    config.CRAWLER_TYPE = args.type
    config.START_PAGE = args.start
    config.KEYWORDS = args.keywords

27
main.py
View File

@ -1,7 +1,7 @@
import argparse
import asyncio
import sys
import cmd_arg
import config
import db
from base.base_crawler import AbstractCrawler
@ -28,34 +28,15 @@ class CrawlerFactory:
raise ValueError("Invalid Media Platform Currently only supported xhs or dy or ks or bili ...")
return crawler_class()
async def main():
# define command line params ...
parser = argparse.ArgumentParser(description='Media crawler program.')
parser.add_argument('--platform', type=str, help='Media platform select (xhs | dy | ks | bili | wb)',
choices=["xhs", "dy", "ks", "bili", "wb"], default=config.PLATFORM)
parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)',
choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE)
parser.add_argument('--type', type=str, help='crawler type (search | detail | creator)',
choices=["search", "detail", "creator"], default=config.CRAWLER_TYPE)
parser.add_argument('--start', type=int, help='crawler type (number of start page)',
default=config.START_PAGE)
parser.add_argument('--keywords', type=str, help='crawler type (please input keywords)',
default=config.KEYWORDS)
# parse cmd
await cmd_arg.parse_cmd()
# init db
if config.SAVE_DATA_OPTION == "db":
await db.init_db()
args = parser.parse_args()
crawler = CrawlerFactory.create_crawler(platform=args.platform)
crawler.init_config(
platform=args.platform,
login_type=args.lt,
crawler_type=args.type,
start_page=args.start,
keyword=args.keywords
)
crawler = CrawlerFactory.create_crawler(platform=config.PLATFORM)
await crawler.start()
if config.SAVE_DATA_OPTION == "db":

View File

@ -26,9 +26,6 @@ from .login import BilibiliLogin
class BilibiliCrawler(AbstractCrawler):
platform: str
login_type: str
crawler_type: str
context_page: Page
bili_client: BilibiliClient
browser_context: BrowserContext
@ -37,13 +34,6 @@ class BilibiliCrawler(AbstractCrawler):
self.index_url = "https://www.bilibili.com"
self.user_agent = utils.get_user_agent()
def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
self.platform = platform
self.login_type = login_type
self.crawler_type = crawler_type
self.start_page = start_page
self.keyword = keyword
async def start(self):
playwright_proxy_format, httpx_proxy_format = None, None
if config.ENABLE_IP_PROXY:
@ -70,7 +60,7 @@ class BilibiliCrawler(AbstractCrawler):
self.bili_client = await self.create_bilibili_client(httpx_proxy_format)
if not await self.bili_client.pong():
login_obj = BilibiliLogin(
login_type=self.login_type,
login_type=config.LOGIN_TYPE,
login_phone="", # your phone number
browser_context=self.browser_context,
context_page=self.context_page,
@ -79,11 +69,11 @@ class BilibiliCrawler(AbstractCrawler):
await login_obj.begin()
await self.bili_client.update_cookies(browser_context=self.browser_context)
crawler_type_var.set(self.crawler_type)
if self.crawler_type == "search":
crawler_type_var.set(config.CRAWLER_TYPE)
if config.CRAWLER_TYPE == "search":
# Search for video and retrieve their comment information.
await self.search()
elif self.crawler_type == "detail":
elif config.CRAWLER_TYPE == "detail":
# Get the information and comments of the specified post
await self.get_specified_videos()
else:
@ -101,8 +91,8 @@ class BilibiliCrawler(AbstractCrawler):
bili_limit_count = 20 # bilibili limit page fixed value
if config.CRAWLER_MAX_NOTES_COUNT < bili_limit_count:
config.CRAWLER_MAX_NOTES_COUNT = bili_limit_count
start_page = self.start_page # start page number
for keyword in self.keyword.split(","):
start_page = config.START_PAGE # start page number
for keyword in config.KEYWORDS.split(","):
utils.logger.info(
f"[BilibiliCrawler.search] Current search keyword: {keyword}")
page = 1
@ -271,7 +261,7 @@ class BilibiliCrawler(AbstractCrawler):
# feat issue #14
# we will save login state to avoid login every time
user_data_dir = os.path.join(os.getcwd(), "browser_data",
config.USER_DATA_DIR % self.platform) # type: ignore
config.USER_DATA_DIR % config.PLATFORM) # type: ignore
browser_context = await chromium.launch_persistent_context(
user_data_dir=user_data_dir,
accept_downloads=True,

View File

@ -13,6 +13,7 @@ from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
wait_fixed)
from base.base_crawler import AbstractLogin
import config
from tools import utils
@ -24,7 +25,7 @@ class BilibiliLogin(AbstractLogin):
login_phone: Optional[str] = "",
cookie_str: str = ""
):
self.login_type = login_type
config.LOGIN_TYPE = login_type
self.browser_context = browser_context
self.context_page = context_page
self.login_phone = login_phone
@ -33,11 +34,11 @@ class BilibiliLogin(AbstractLogin):
async def begin(self):
"""Start login bilibili"""
utils.logger.info("[BilibiliLogin.begin] Begin login Bilibili ...")
if self.login_type == "qrcode":
if config.LOGIN_TYPE == "qrcode":
await self.login_by_qrcode()
elif self.login_type == "phone":
elif config.LOGIN_TYPE == "phone":
await self.login_by_mobile()
elif self.login_type == "cookie":
elif config.LOGIN_TYPE == "cookie":
await self.login_by_cookies()
else:
raise ValueError(

View File

@ -21,27 +21,14 @@ from .login import DouYinLogin
class DouYinCrawler(AbstractCrawler):
platform: str
login_type: str
crawler_type: str
context_page: Page
dy_client: DOUYINClient
browser_context: BrowserContext
start_page: int
keyword: str
def __init__(self) -> None:
self.start_page = None
self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36" # fixed
self.index_url = "https://www.douyin.com"
def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str) -> None:
self.platform = platform
self.login_type = login_type
self.crawler_type = crawler_type
self.start_page = start_page
self.keyword = keyword
async def start(self) -> None:
playwright_proxy_format, httpx_proxy_format = None, None
if config.ENABLE_IP_PROXY:
@ -66,7 +53,7 @@ class DouYinCrawler(AbstractCrawler):
self.dy_client = await self.create_douyin_client(httpx_proxy_format)
if not await self.dy_client.pong(browser_context=self.browser_context):
login_obj = DouYinLogin(
login_type=self.login_type,
login_type=config.LOGIN_TYPE,
login_phone="", # you phone number
browser_context=self.browser_context,
context_page=self.context_page,
@ -74,14 +61,14 @@ class DouYinCrawler(AbstractCrawler):
)
await login_obj.begin()
await self.dy_client.update_cookies(browser_context=self.browser_context)
crawler_type_var.set(self.crawler_type)
if self.crawler_type == "search":
crawler_type_var.set(config.CRAWLER_TYPE)
if config.CRAWLER_TYPE == "search":
# Search for notes and retrieve their comment information.
await self.search()
elif self.crawler_type == "detail":
elif config.CRAWLER_TYPE == "detail":
# Get the information and comments of the specified post
await self.get_specified_awemes()
elif self.crawler_type == "creator":
elif config.CRAWLER_TYPE == "creator":
# Get the information and comments of the specified creator
await self.get_creators_and_videos()
@ -92,8 +79,8 @@ class DouYinCrawler(AbstractCrawler):
dy_limit_count = 10 # douyin limit page fixed value
if config.CRAWLER_MAX_NOTES_COUNT < dy_limit_count:
config.CRAWLER_MAX_NOTES_COUNT = dy_limit_count
start_page = self.start_page # start page number
for keyword in self.keyword.split(","):
start_page = config.START_PAGE # start page number
for keyword in config.KEYWORDS.split(","):
utils.logger.info(f"[DouYinCrawler.search] Current keyword: {keyword}")
aweme_list: List[str] = []
page = 0
@ -259,7 +246,7 @@ class DouYinCrawler(AbstractCrawler):
"""Launch browser and create browser context"""
if config.SAVE_LOGIN_STATE:
user_data_dir = os.path.join(os.getcwd(), "browser_data",
config.USER_DATA_DIR % self.platform) # type: ignore
config.USER_DATA_DIR % config.PLATFORM) # type: ignore
browser_context = await chromium.launch_persistent_context(
user_data_dir=user_data_dir,
accept_downloads=True,

View File

@ -23,7 +23,7 @@ class DouYinLogin(AbstractLogin):
login_phone: Optional[str] = "",
cookie_str: Optional[str] = ""
):
self.login_type = login_type
config.LOGIN_TYPE = login_type
self.browser_context = browser_context
self.context_page = context_page
self.login_phone = login_phone
@ -40,11 +40,11 @@ class DouYinLogin(AbstractLogin):
await self.popup_login_dialog()
# select login type
if self.login_type == "qrcode":
if config.LOGIN_TYPE == "qrcode":
await self.login_by_qrcode()
elif self.login_type == "phone":
elif config.LOGIN_TYPE == "phone":
await self.login_by_mobile()
elif self.login_type == "cookie":
elif config.LOGIN_TYPE == "cookie":
await self.login_by_cookies()
else:
raise ValueError("[DouYinLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")

View File

@ -21,9 +21,6 @@ from .login import KuaishouLogin
class KuaishouCrawler(AbstractCrawler):
platform: str
login_type: str
crawler_type: str
context_page: Page
ks_client: KuaiShouClient
browser_context: BrowserContext
@ -32,13 +29,6 @@ class KuaishouCrawler(AbstractCrawler):
self.index_url = "https://www.kuaishou.com"
self.user_agent = utils.get_user_agent()
def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
self.platform = platform
self.login_type = login_type
self.crawler_type = crawler_type
self.start_page = start_page
self.keyword = keyword
async def start(self):
playwright_proxy_format, httpx_proxy_format = None, None
if config.ENABLE_IP_PROXY:
@ -64,7 +54,7 @@ class KuaishouCrawler(AbstractCrawler):
self.ks_client = await self.create_ks_client(httpx_proxy_format)
if not await self.ks_client.pong():
login_obj = KuaishouLogin(
login_type=self.login_type,
login_type=config.LOGIN_TYPE,
login_phone=httpx_proxy_format,
browser_context=self.browser_context,
context_page=self.context_page,
@ -73,11 +63,11 @@ class KuaishouCrawler(AbstractCrawler):
await login_obj.begin()
await self.ks_client.update_cookies(browser_context=self.browser_context)
crawler_type_var.set(self.crawler_type)
if self.crawler_type == "search":
crawler_type_var.set(config.CRAWLER_TYPE)
if config.CRAWLER_TYPE == "search":
# Search for notes and retrieve their comment information.
await self.search()
elif self.crawler_type == "detail":
elif config.CRAWLER_TYPE == "detail":
# Get the information and comments of the specified post
await self.get_specified_videos()
else:
@ -90,8 +80,8 @@ class KuaishouCrawler(AbstractCrawler):
ks_limit_count = 20 # kuaishou limit page fixed value
if config.CRAWLER_MAX_NOTES_COUNT < ks_limit_count:
config.CRAWLER_MAX_NOTES_COUNT = ks_limit_count
start_page = self.start_page
for keyword in self.keyword.split(","):
start_page = config.START_PAGE
for keyword in config.KEYWORDS.split(","):
utils.logger.info(f"[KuaishouCrawler.search] Current search keyword: {keyword}")
page = 1
while (page - start_page + 1) * ks_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
@ -238,7 +228,7 @@ class KuaishouCrawler(AbstractCrawler):
utils.logger.info("[KuaishouCrawler.launch_browser] Begin create browser context ...")
if config.SAVE_LOGIN_STATE:
user_data_dir = os.path.join(os.getcwd(), "browser_data",
config.USER_DATA_DIR % self.platform) # type: ignore
config.USER_DATA_DIR % config.PLATFORM) # type: ignore
browser_context = await chromium.launch_persistent_context(
user_data_dir=user_data_dir,
accept_downloads=True,

View File

@ -19,7 +19,7 @@ class KuaishouLogin(AbstractLogin):
login_phone: Optional[str] = "",
cookie_str: str = ""
):
self.login_type = login_type
config.LOGIN_TYPE = login_type
self.browser_context = browser_context
self.context_page = context_page
self.login_phone = login_phone
@ -28,11 +28,11 @@ class KuaishouLogin(AbstractLogin):
async def begin(self):
"""Start login xiaohongshu"""
utils.logger.info("[KuaishouLogin.begin] Begin login kuaishou ...")
if self.login_type == "qrcode":
if config.LOGIN_TYPE == "qrcode":
await self.login_by_qrcode()
elif self.login_type == "phone":
elif config.LOGIN_TYPE == "phone":
await self.login_by_mobile()
elif self.login_type == "cookie":
elif config.LOGIN_TYPE == "cookie":
await self.login_by_cookies()
else:
raise ValueError("[KuaishouLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")

View File

@ -28,9 +28,6 @@ from .login import WeiboLogin
class WeiboCrawler(AbstractCrawler):
platform: str
login_type: str
crawler_type: str
context_page: Page
wb_client: WeiboClient
browser_context: BrowserContext
@ -41,13 +38,6 @@ class WeiboCrawler(AbstractCrawler):
self.user_agent = utils.get_user_agent()
self.mobile_user_agent = utils.get_mobile_user_agent()
def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
self.platform = platform
self.login_type = login_type
self.crawler_type = crawler_type
self.start_page = start_page
self.keyword = keyword
async def start(self):
playwright_proxy_format, httpx_proxy_format = None, None
if config.ENABLE_IP_PROXY:
@ -73,7 +63,7 @@ class WeiboCrawler(AbstractCrawler):
self.wb_client = await self.create_weibo_client(httpx_proxy_format)
if not await self.wb_client.pong():
login_obj = WeiboLogin(
login_type=self.login_type,
login_type=config.LOGIN_TYPE,
login_phone="", # your phone number
browser_context=self.browser_context,
context_page=self.context_page,
@ -89,11 +79,11 @@ class WeiboCrawler(AbstractCrawler):
await asyncio.sleep(2)
await self.wb_client.update_cookies(browser_context=self.browser_context)
crawler_type_var.set(self.crawler_type)
if self.crawler_type == "search":
crawler_type_var.set(config.CRAWLER_TYPE)
if config.CRAWLER_TYPE == "search":
# Search for video and retrieve their comment information.
await self.search()
elif self.crawler_type == "detail":
elif config.CRAWLER_TYPE == "detail":
# Get the information and comments of the specified post
await self.get_specified_notes()
else:
@ -109,8 +99,8 @@ class WeiboCrawler(AbstractCrawler):
weibo_limit_count = 10 # weibo limit page fixed value
if config.CRAWLER_MAX_NOTES_COUNT < weibo_limit_count:
config.CRAWLER_MAX_NOTES_COUNT = weibo_limit_count
start_page = self.start_page
for keyword in self.keyword.split(","):
start_page = config.START_PAGE
for keyword in config.KEYWORDS.split(","):
utils.logger.info(f"[WeiboCrawler.search] Current search keyword: {keyword}")
page = 1
while (page - start_page + 1) * weibo_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
@ -274,7 +264,7 @@ class WeiboCrawler(AbstractCrawler):
utils.logger.info("[WeiboCrawler.launch_browser] Begin create browser context ...")
if config.SAVE_LOGIN_STATE:
user_data_dir = os.path.join(os.getcwd(), "browser_data",
config.USER_DATA_DIR % self.platform) # type: ignore
config.USER_DATA_DIR % config.PLATFORM) # type: ignore
browser_context = await chromium.launch_persistent_context(
user_data_dir=user_data_dir,
accept_downloads=True,

View File

@ -24,7 +24,7 @@ class WeiboLogin(AbstractLogin):
login_phone: Optional[str] = "",
cookie_str: str = ""
):
self.login_type = login_type
config.LOGIN_TYPE = login_type
self.browser_context = browser_context
self.context_page = context_page
self.login_phone = login_phone
@ -33,11 +33,11 @@ class WeiboLogin(AbstractLogin):
async def begin(self):
"""Start login weibo"""
utils.logger.info("[WeiboLogin.begin] Begin login weibo ...")
if self.login_type == "qrcode":
if config.LOGIN_TYPE == "qrcode":
await self.login_by_qrcode()
elif self.login_type == "phone":
elif config.LOGIN_TYPE == "phone":
await self.login_by_mobile()
elif self.login_type == "cookie":
elif config.LOGIN_TYPE == "cookie":
await self.login_by_cookies()
else:
raise ValueError(

View File

@ -21,9 +21,6 @@ from .login import XiaoHongShuLogin
class XiaoHongShuCrawler(AbstractCrawler):
platform: str
login_type: str
crawler_type: str
context_page: Page
xhs_client: XiaoHongShuClient
browser_context: BrowserContext
@ -32,13 +29,6 @@ class XiaoHongShuCrawler(AbstractCrawler):
self.index_url = "https://www.xiaohongshu.com"
self.user_agent = utils.get_user_agent()
def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str) -> None:
self.platform = platform
self.login_type = login_type
self.crawler_type = crawler_type
self.start_page = start_page
self.keyword = keyword
async def start(self) -> None:
playwright_proxy_format, httpx_proxy_format = None, None
if config.ENABLE_IP_PROXY:
@ -71,7 +61,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
self.xhs_client = await self.create_xhs_client(httpx_proxy_format)
if not await self.xhs_client.pong():
login_obj = XiaoHongShuLogin(
login_type=self.login_type,
login_type=config.LOGIN_TYPE,
login_phone="", # input your phone number
browser_context=self.browser_context,
context_page=self.context_page,
@ -80,14 +70,14 @@ class XiaoHongShuCrawler(AbstractCrawler):
await login_obj.begin()
await self.xhs_client.update_cookies(browser_context=self.browser_context)
crawler_type_var.set(self.crawler_type)
if self.crawler_type == "search":
crawler_type_var.set(config.CRAWLER_TYPE)
if config.CRAWLER_TYPE == "search":
# Search for notes and retrieve their comment information.
await self.search()
elif self.crawler_type == "detail":
elif config.CRAWLER_TYPE == "detail":
# Get the information and comments of the specified post
await self.get_specified_notes()
elif self.crawler_type == "creator":
elif config.CRAWLER_TYPE == "creator":
# Get creator's information and their notes and comments
await self.get_creators_and_notes()
else:
@ -101,8 +91,8 @@ class XiaoHongShuCrawler(AbstractCrawler):
xhs_limit_count = 20 # xhs limit page fixed value
if config.CRAWLER_MAX_NOTES_COUNT < xhs_limit_count:
config.CRAWLER_MAX_NOTES_COUNT = xhs_limit_count
start_page = self.start_page
for keyword in self.keyword.split(","):
start_page = config.START_PAGE
for keyword in config.KEYWORDS.split(","):
utils.logger.info(f"[XiaoHongShuCrawler.search] Current search keyword: {keyword}")
page = 1
while (page - start_page + 1) * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
@ -264,7 +254,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
# feat issue #14
# we will save login state to avoid login every time
user_data_dir = os.path.join(os.getcwd(), "browser_data",
config.USER_DATA_DIR % self.platform) # type: ignore
config.USER_DATA_DIR % config.PLATFORM) # type: ignore
browser_context = await chromium.launch_persistent_context(
user_data_dir=user_data_dir,
accept_downloads=True,

View File

@ -22,7 +22,7 @@ class XiaoHongShuLogin(AbstractLogin):
login_phone: Optional[str] = "",
cookie_str: str = ""
):
self.login_type = login_type
config.LOGIN_TYPE = login_type
self.browser_context = browser_context
self.context_page = context_page
self.login_phone = login_phone
@ -49,11 +49,11 @@ class XiaoHongShuLogin(AbstractLogin):
async def begin(self):
"""Start login xiaohongshu"""
utils.logger.info("[XiaoHongShuLogin.begin] Begin login xiaohongshu ...")
if self.login_type == "qrcode":
if config.LOGIN_TYPE == "qrcode":
await self.login_by_qrcode()
elif self.login_type == "phone":
elif config.LOGIN_TYPE == "phone":
await self.login_by_mobile()
elif self.login_type == "cookie":
elif config.LOGIN_TYPE == "cookie":
await self.login_by_cookies()
else:
raise ValueError("[XiaoHongShuLogin.begin]I nvalid Login Type Currently only supported qrcode or phone or cookies ...")