Improve how the base config reads command-line arguments

This commit is contained in:
nelzomal 2024-06-09 09:35:52 +08:00
parent 3c7c678d7a
commit eace7d1750
15 changed files with 91 additions and 139 deletions

1
.gitignore vendored
View File

@ -167,3 +167,4 @@ cython_debug/
/data/ /data/
*/.DS_Store */.DS_Store
.vscode

View File

@ -5,10 +5,6 @@ from playwright.async_api import BrowserContext, BrowserType
class AbstractCrawler(ABC): class AbstractCrawler(ABC):
@abstractmethod
def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
pass
@abstractmethod @abstractmethod
async def start(self): async def start(self):
pass pass

1
cmd_arg/__init__.py Normal file
View File

@ -0,0 +1 @@
from .arg import *

25
cmd_arg/arg.py Normal file
View File

@ -0,0 +1,25 @@
import argparse
import config
async def parse_cmd():
    """Parse command-line arguments and write them into the ``config`` module.

    Each option defaults to the value currently held in ``config``, so an
    option that is omitted on the command line leaves the configured value
    untouched instead of overwriting it with ``None``.

    The function is declared ``async`` only because its caller awaits it;
    it performs no asynchronous work itself.

    Side effects:
        Reassigns ``config.PLATFORM``, ``config.LOGIN_TYPE``,
        ``config.CRAWLER_TYPE``, ``config.START_PAGE`` and
        ``config.KEYWORDS``. ``argparse`` exits the process on invalid
        arguments or ``--help``.
    """
    # Read the command-line arguments.
    parser = argparse.ArgumentParser(description='Media crawler program.')
    parser.add_argument('--platform', type=str, help='Media platform select (xhs | dy | ks | bili | wb)',
                        choices=["xhs", "dy", "ks", "bili", "wb"], default=config.PLATFORM)
    parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)',
                        choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE)
    parser.add_argument('--type', type=str, help='crawler type (search | detail | creator)',
                        choices=["search", "detail", "creator"], default=config.CRAWLER_TYPE)
    parser.add_argument('--start', type=int,
                        help='number of start page', default=config.START_PAGE)
    parser.add_argument('--keywords', type=str,
                        help='please input keywords', default=config.KEYWORDS)

    args = parser.parse_args()

    # Override config. Thanks to the defaults above, options that were not
    # supplied simply write the existing config value back.
    config.PLATFORM = args.platform
    config.LOGIN_TYPE = args.lt
    config.CRAWLER_TYPE = args.type
    config.START_PAGE = args.start
    config.KEYWORDS = args.keywords

27
main.py
View File

@ -1,7 +1,7 @@
import argparse
import asyncio import asyncio
import sys import sys
import cmd_arg
import config import config
import db import db
from base.base_crawler import AbstractCrawler from base.base_crawler import AbstractCrawler
@ -28,34 +28,15 @@ class CrawlerFactory:
raise ValueError("Invalid Media Platform Currently only supported xhs or dy or ks or bili ...") raise ValueError("Invalid Media Platform Currently only supported xhs or dy or ks or bili ...")
return crawler_class() return crawler_class()
async def main(): async def main():
# define command line params ... # parse cmd
parser = argparse.ArgumentParser(description='Media crawler program.') await cmd_arg.parse_cmd()
parser.add_argument('--platform', type=str, help='Media platform select (xhs | dy | ks | bili | wb)',
choices=["xhs", "dy", "ks", "bili", "wb"], default=config.PLATFORM)
parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)',
choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE)
parser.add_argument('--type', type=str, help='crawler type (search | detail | creator)',
choices=["search", "detail", "creator"], default=config.CRAWLER_TYPE)
parser.add_argument('--start', type=int, help='crawler type (number of start page)',
default=config.START_PAGE)
parser.add_argument('--keywords', type=str, help='crawler type (please input keywords)',
default=config.KEYWORDS)
# init db # init db
if config.SAVE_DATA_OPTION == "db": if config.SAVE_DATA_OPTION == "db":
await db.init_db() await db.init_db()
args = parser.parse_args() crawler = CrawlerFactory.create_crawler(platform=config.PLATFORM)
crawler = CrawlerFactory.create_crawler(platform=args.platform)
crawler.init_config(
platform=args.platform,
login_type=args.lt,
crawler_type=args.type,
start_page=args.start,
keyword=args.keywords
)
await crawler.start() await crawler.start()
if config.SAVE_DATA_OPTION == "db": if config.SAVE_DATA_OPTION == "db":

View File

@ -26,9 +26,6 @@ from .login import BilibiliLogin
class BilibiliCrawler(AbstractCrawler): class BilibiliCrawler(AbstractCrawler):
platform: str
login_type: str
crawler_type: str
context_page: Page context_page: Page
bili_client: BilibiliClient bili_client: BilibiliClient
browser_context: BrowserContext browser_context: BrowserContext
@ -37,13 +34,6 @@ class BilibiliCrawler(AbstractCrawler):
self.index_url = "https://www.bilibili.com" self.index_url = "https://www.bilibili.com"
self.user_agent = utils.get_user_agent() self.user_agent = utils.get_user_agent()
def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
self.platform = platform
self.login_type = login_type
self.crawler_type = crawler_type
self.start_page = start_page
self.keyword = keyword
async def start(self): async def start(self):
playwright_proxy_format, httpx_proxy_format = None, None playwright_proxy_format, httpx_proxy_format = None, None
if config.ENABLE_IP_PROXY: if config.ENABLE_IP_PROXY:
@ -70,7 +60,7 @@ class BilibiliCrawler(AbstractCrawler):
self.bili_client = await self.create_bilibili_client(httpx_proxy_format) self.bili_client = await self.create_bilibili_client(httpx_proxy_format)
if not await self.bili_client.pong(): if not await self.bili_client.pong():
login_obj = BilibiliLogin( login_obj = BilibiliLogin(
login_type=self.login_type, login_type=config.LOGIN_TYPE,
login_phone="", # your phone number login_phone="", # your phone number
browser_context=self.browser_context, browser_context=self.browser_context,
context_page=self.context_page, context_page=self.context_page,
@ -79,11 +69,11 @@ class BilibiliCrawler(AbstractCrawler):
await login_obj.begin() await login_obj.begin()
await self.bili_client.update_cookies(browser_context=self.browser_context) await self.bili_client.update_cookies(browser_context=self.browser_context)
crawler_type_var.set(self.crawler_type) crawler_type_var.set(config.CRAWLER_TYPE)
if self.crawler_type == "search": if config.CRAWLER_TYPE == "search":
# Search for video and retrieve their comment information. # Search for video and retrieve their comment information.
await self.search() await self.search()
elif self.crawler_type == "detail": elif config.CRAWLER_TYPE == "detail":
# Get the information and comments of the specified post # Get the information and comments of the specified post
await self.get_specified_videos() await self.get_specified_videos()
else: else:
@ -101,8 +91,8 @@ class BilibiliCrawler(AbstractCrawler):
bili_limit_count = 20 # bilibili limit page fixed value bili_limit_count = 20 # bilibili limit page fixed value
if config.CRAWLER_MAX_NOTES_COUNT < bili_limit_count: if config.CRAWLER_MAX_NOTES_COUNT < bili_limit_count:
config.CRAWLER_MAX_NOTES_COUNT = bili_limit_count config.CRAWLER_MAX_NOTES_COUNT = bili_limit_count
start_page = self.start_page # start page number start_page = config.START_PAGE # start page number
for keyword in self.keyword.split(","): for keyword in config.KEYWORDS.split(","):
utils.logger.info( utils.logger.info(
f"[BilibiliCrawler.search] Current search keyword: {keyword}") f"[BilibiliCrawler.search] Current search keyword: {keyword}")
page = 1 page = 1
@ -271,7 +261,7 @@ class BilibiliCrawler(AbstractCrawler):
# feat issue #14 # feat issue #14
# we will save login state to avoid login every time # we will save login state to avoid login every time
user_data_dir = os.path.join(os.getcwd(), "browser_data", user_data_dir = os.path.join(os.getcwd(), "browser_data",
config.USER_DATA_DIR % self.platform) # type: ignore config.USER_DATA_DIR % config.PLATFORM) # type: ignore
browser_context = await chromium.launch_persistent_context( browser_context = await chromium.launch_persistent_context(
user_data_dir=user_data_dir, user_data_dir=user_data_dir,
accept_downloads=True, accept_downloads=True,

View File

@ -13,6 +13,7 @@ from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
wait_fixed) wait_fixed)
from base.base_crawler import AbstractLogin from base.base_crawler import AbstractLogin
import config
from tools import utils from tools import utils
@ -24,7 +25,7 @@ class BilibiliLogin(AbstractLogin):
login_phone: Optional[str] = "", login_phone: Optional[str] = "",
cookie_str: str = "" cookie_str: str = ""
): ):
self.login_type = login_type config.LOGIN_TYPE = login_type
self.browser_context = browser_context self.browser_context = browser_context
self.context_page = context_page self.context_page = context_page
self.login_phone = login_phone self.login_phone = login_phone
@ -33,11 +34,11 @@ class BilibiliLogin(AbstractLogin):
async def begin(self): async def begin(self):
"""Start login bilibili""" """Start login bilibili"""
utils.logger.info("[BilibiliLogin.begin] Begin login Bilibili ...") utils.logger.info("[BilibiliLogin.begin] Begin login Bilibili ...")
if self.login_type == "qrcode": if config.LOGIN_TYPE == "qrcode":
await self.login_by_qrcode() await self.login_by_qrcode()
elif self.login_type == "phone": elif config.LOGIN_TYPE == "phone":
await self.login_by_mobile() await self.login_by_mobile()
elif self.login_type == "cookie": elif config.LOGIN_TYPE == "cookie":
await self.login_by_cookies() await self.login_by_cookies()
else: else:
raise ValueError( raise ValueError(

View File

@ -21,27 +21,14 @@ from .login import DouYinLogin
class DouYinCrawler(AbstractCrawler): class DouYinCrawler(AbstractCrawler):
platform: str
login_type: str
crawler_type: str
context_page: Page context_page: Page
dy_client: DOUYINClient dy_client: DOUYINClient
browser_context: BrowserContext browser_context: BrowserContext
start_page: int
keyword: str
def __init__(self) -> None: def __init__(self) -> None:
self.start_page = None
self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36" # fixed self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36" # fixed
self.index_url = "https://www.douyin.com" self.index_url = "https://www.douyin.com"
def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str) -> None:
self.platform = platform
self.login_type = login_type
self.crawler_type = crawler_type
self.start_page = start_page
self.keyword = keyword
async def start(self) -> None: async def start(self) -> None:
playwright_proxy_format, httpx_proxy_format = None, None playwright_proxy_format, httpx_proxy_format = None, None
if config.ENABLE_IP_PROXY: if config.ENABLE_IP_PROXY:
@ -66,7 +53,7 @@ class DouYinCrawler(AbstractCrawler):
self.dy_client = await self.create_douyin_client(httpx_proxy_format) self.dy_client = await self.create_douyin_client(httpx_proxy_format)
if not await self.dy_client.pong(browser_context=self.browser_context): if not await self.dy_client.pong(browser_context=self.browser_context):
login_obj = DouYinLogin( login_obj = DouYinLogin(
login_type=self.login_type, login_type=config.LOGIN_TYPE,
login_phone="", # you phone number login_phone="", # you phone number
browser_context=self.browser_context, browser_context=self.browser_context,
context_page=self.context_page, context_page=self.context_page,
@ -74,14 +61,14 @@ class DouYinCrawler(AbstractCrawler):
) )
await login_obj.begin() await login_obj.begin()
await self.dy_client.update_cookies(browser_context=self.browser_context) await self.dy_client.update_cookies(browser_context=self.browser_context)
crawler_type_var.set(self.crawler_type) crawler_type_var.set(config.CRAWLER_TYPE)
if self.crawler_type == "search": if config.CRAWLER_TYPE == "search":
# Search for notes and retrieve their comment information. # Search for notes and retrieve their comment information.
await self.search() await self.search()
elif self.crawler_type == "detail": elif config.CRAWLER_TYPE == "detail":
# Get the information and comments of the specified post # Get the information and comments of the specified post
await self.get_specified_awemes() await self.get_specified_awemes()
elif self.crawler_type == "creator": elif config.CRAWLER_TYPE == "creator":
# Get the information and comments of the specified creator # Get the information and comments of the specified creator
await self.get_creators_and_videos() await self.get_creators_and_videos()
@ -92,8 +79,8 @@ class DouYinCrawler(AbstractCrawler):
dy_limit_count = 10 # douyin limit page fixed value dy_limit_count = 10 # douyin limit page fixed value
if config.CRAWLER_MAX_NOTES_COUNT < dy_limit_count: if config.CRAWLER_MAX_NOTES_COUNT < dy_limit_count:
config.CRAWLER_MAX_NOTES_COUNT = dy_limit_count config.CRAWLER_MAX_NOTES_COUNT = dy_limit_count
start_page = self.start_page # start page number start_page = config.START_PAGE # start page number
for keyword in self.keyword.split(","): for keyword in config.KEYWORDS.split(","):
utils.logger.info(f"[DouYinCrawler.search] Current keyword: {keyword}") utils.logger.info(f"[DouYinCrawler.search] Current keyword: {keyword}")
aweme_list: List[str] = [] aweme_list: List[str] = []
page = 0 page = 0
@ -259,7 +246,7 @@ class DouYinCrawler(AbstractCrawler):
"""Launch browser and create browser context""" """Launch browser and create browser context"""
if config.SAVE_LOGIN_STATE: if config.SAVE_LOGIN_STATE:
user_data_dir = os.path.join(os.getcwd(), "browser_data", user_data_dir = os.path.join(os.getcwd(), "browser_data",
config.USER_DATA_DIR % self.platform) # type: ignore config.USER_DATA_DIR % config.PLATFORM) # type: ignore
browser_context = await chromium.launch_persistent_context( browser_context = await chromium.launch_persistent_context(
user_data_dir=user_data_dir, user_data_dir=user_data_dir,
accept_downloads=True, accept_downloads=True,

View File

@ -23,7 +23,7 @@ class DouYinLogin(AbstractLogin):
login_phone: Optional[str] = "", login_phone: Optional[str] = "",
cookie_str: Optional[str] = "" cookie_str: Optional[str] = ""
): ):
self.login_type = login_type config.LOGIN_TYPE = login_type
self.browser_context = browser_context self.browser_context = browser_context
self.context_page = context_page self.context_page = context_page
self.login_phone = login_phone self.login_phone = login_phone
@ -40,11 +40,11 @@ class DouYinLogin(AbstractLogin):
await self.popup_login_dialog() await self.popup_login_dialog()
# select login type # select login type
if self.login_type == "qrcode": if config.LOGIN_TYPE == "qrcode":
await self.login_by_qrcode() await self.login_by_qrcode()
elif self.login_type == "phone": elif config.LOGIN_TYPE == "phone":
await self.login_by_mobile() await self.login_by_mobile()
elif self.login_type == "cookie": elif config.LOGIN_TYPE == "cookie":
await self.login_by_cookies() await self.login_by_cookies()
else: else:
raise ValueError("[DouYinLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...") raise ValueError("[DouYinLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")

View File

@ -21,9 +21,6 @@ from .login import KuaishouLogin
class KuaishouCrawler(AbstractCrawler): class KuaishouCrawler(AbstractCrawler):
platform: str
login_type: str
crawler_type: str
context_page: Page context_page: Page
ks_client: KuaiShouClient ks_client: KuaiShouClient
browser_context: BrowserContext browser_context: BrowserContext
@ -32,13 +29,6 @@ class KuaishouCrawler(AbstractCrawler):
self.index_url = "https://www.kuaishou.com" self.index_url = "https://www.kuaishou.com"
self.user_agent = utils.get_user_agent() self.user_agent = utils.get_user_agent()
def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
self.platform = platform
self.login_type = login_type
self.crawler_type = crawler_type
self.start_page = start_page
self.keyword = keyword
async def start(self): async def start(self):
playwright_proxy_format, httpx_proxy_format = None, None playwright_proxy_format, httpx_proxy_format = None, None
if config.ENABLE_IP_PROXY: if config.ENABLE_IP_PROXY:
@ -64,7 +54,7 @@ class KuaishouCrawler(AbstractCrawler):
self.ks_client = await self.create_ks_client(httpx_proxy_format) self.ks_client = await self.create_ks_client(httpx_proxy_format)
if not await self.ks_client.pong(): if not await self.ks_client.pong():
login_obj = KuaishouLogin( login_obj = KuaishouLogin(
login_type=self.login_type, login_type=config.LOGIN_TYPE,
login_phone=httpx_proxy_format, login_phone=httpx_proxy_format,
browser_context=self.browser_context, browser_context=self.browser_context,
context_page=self.context_page, context_page=self.context_page,
@ -73,11 +63,11 @@ class KuaishouCrawler(AbstractCrawler):
await login_obj.begin() await login_obj.begin()
await self.ks_client.update_cookies(browser_context=self.browser_context) await self.ks_client.update_cookies(browser_context=self.browser_context)
crawler_type_var.set(self.crawler_type) crawler_type_var.set(config.CRAWLER_TYPE)
if self.crawler_type == "search": if config.CRAWLER_TYPE == "search":
# Search for notes and retrieve their comment information. # Search for notes and retrieve their comment information.
await self.search() await self.search()
elif self.crawler_type == "detail": elif config.CRAWLER_TYPE == "detail":
# Get the information and comments of the specified post # Get the information and comments of the specified post
await self.get_specified_videos() await self.get_specified_videos()
else: else:
@ -90,8 +80,8 @@ class KuaishouCrawler(AbstractCrawler):
ks_limit_count = 20 # kuaishou limit page fixed value ks_limit_count = 20 # kuaishou limit page fixed value
if config.CRAWLER_MAX_NOTES_COUNT < ks_limit_count: if config.CRAWLER_MAX_NOTES_COUNT < ks_limit_count:
config.CRAWLER_MAX_NOTES_COUNT = ks_limit_count config.CRAWLER_MAX_NOTES_COUNT = ks_limit_count
start_page = self.start_page start_page = config.START_PAGE
for keyword in self.keyword.split(","): for keyword in config.KEYWORDS.split(","):
utils.logger.info(f"[KuaishouCrawler.search] Current search keyword: {keyword}") utils.logger.info(f"[KuaishouCrawler.search] Current search keyword: {keyword}")
page = 1 page = 1
while (page - start_page + 1) * ks_limit_count <= config.CRAWLER_MAX_NOTES_COUNT: while (page - start_page + 1) * ks_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
@ -238,7 +228,7 @@ class KuaishouCrawler(AbstractCrawler):
utils.logger.info("[KuaishouCrawler.launch_browser] Begin create browser context ...") utils.logger.info("[KuaishouCrawler.launch_browser] Begin create browser context ...")
if config.SAVE_LOGIN_STATE: if config.SAVE_LOGIN_STATE:
user_data_dir = os.path.join(os.getcwd(), "browser_data", user_data_dir = os.path.join(os.getcwd(), "browser_data",
config.USER_DATA_DIR % self.platform) # type: ignore config.USER_DATA_DIR % config.PLATFORM) # type: ignore
browser_context = await chromium.launch_persistent_context( browser_context = await chromium.launch_persistent_context(
user_data_dir=user_data_dir, user_data_dir=user_data_dir,
accept_downloads=True, accept_downloads=True,

View File

@ -19,7 +19,7 @@ class KuaishouLogin(AbstractLogin):
login_phone: Optional[str] = "", login_phone: Optional[str] = "",
cookie_str: str = "" cookie_str: str = ""
): ):
self.login_type = login_type config.LOGIN_TYPE = login_type
self.browser_context = browser_context self.browser_context = browser_context
self.context_page = context_page self.context_page = context_page
self.login_phone = login_phone self.login_phone = login_phone
@ -28,11 +28,11 @@ class KuaishouLogin(AbstractLogin):
async def begin(self): async def begin(self):
"""Start login xiaohongshu""" """Start login xiaohongshu"""
utils.logger.info("[KuaishouLogin.begin] Begin login kuaishou ...") utils.logger.info("[KuaishouLogin.begin] Begin login kuaishou ...")
if self.login_type == "qrcode": if config.LOGIN_TYPE == "qrcode":
await self.login_by_qrcode() await self.login_by_qrcode()
elif self.login_type == "phone": elif config.LOGIN_TYPE == "phone":
await self.login_by_mobile() await self.login_by_mobile()
elif self.login_type == "cookie": elif config.LOGIN_TYPE == "cookie":
await self.login_by_cookies() await self.login_by_cookies()
else: else:
raise ValueError("[KuaishouLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...") raise ValueError("[KuaishouLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")

View File

@ -28,9 +28,6 @@ from .login import WeiboLogin
class WeiboCrawler(AbstractCrawler): class WeiboCrawler(AbstractCrawler):
platform: str
login_type: str
crawler_type: str
context_page: Page context_page: Page
wb_client: WeiboClient wb_client: WeiboClient
browser_context: BrowserContext browser_context: BrowserContext
@ -41,13 +38,6 @@ class WeiboCrawler(AbstractCrawler):
self.user_agent = utils.get_user_agent() self.user_agent = utils.get_user_agent()
self.mobile_user_agent = utils.get_mobile_user_agent() self.mobile_user_agent = utils.get_mobile_user_agent()
def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
self.platform = platform
self.login_type = login_type
self.crawler_type = crawler_type
self.start_page = start_page
self.keyword = keyword
async def start(self): async def start(self):
playwright_proxy_format, httpx_proxy_format = None, None playwright_proxy_format, httpx_proxy_format = None, None
if config.ENABLE_IP_PROXY: if config.ENABLE_IP_PROXY:
@ -73,7 +63,7 @@ class WeiboCrawler(AbstractCrawler):
self.wb_client = await self.create_weibo_client(httpx_proxy_format) self.wb_client = await self.create_weibo_client(httpx_proxy_format)
if not await self.wb_client.pong(): if not await self.wb_client.pong():
login_obj = WeiboLogin( login_obj = WeiboLogin(
login_type=self.login_type, login_type=config.LOGIN_TYPE,
login_phone="", # your phone number login_phone="", # your phone number
browser_context=self.browser_context, browser_context=self.browser_context,
context_page=self.context_page, context_page=self.context_page,
@ -89,11 +79,11 @@ class WeiboCrawler(AbstractCrawler):
await asyncio.sleep(2) await asyncio.sleep(2)
await self.wb_client.update_cookies(browser_context=self.browser_context) await self.wb_client.update_cookies(browser_context=self.browser_context)
crawler_type_var.set(self.crawler_type) crawler_type_var.set(config.CRAWLER_TYPE)
if self.crawler_type == "search": if config.CRAWLER_TYPE == "search":
# Search for video and retrieve their comment information. # Search for video and retrieve their comment information.
await self.search() await self.search()
elif self.crawler_type == "detail": elif config.CRAWLER_TYPE == "detail":
# Get the information and comments of the specified post # Get the information and comments of the specified post
await self.get_specified_notes() await self.get_specified_notes()
else: else:
@ -109,8 +99,8 @@ class WeiboCrawler(AbstractCrawler):
weibo_limit_count = 10 # weibo limit page fixed value weibo_limit_count = 10 # weibo limit page fixed value
if config.CRAWLER_MAX_NOTES_COUNT < weibo_limit_count: if config.CRAWLER_MAX_NOTES_COUNT < weibo_limit_count:
config.CRAWLER_MAX_NOTES_COUNT = weibo_limit_count config.CRAWLER_MAX_NOTES_COUNT = weibo_limit_count
start_page = self.start_page start_page = config.START_PAGE
for keyword in self.keyword.split(","): for keyword in config.KEYWORDS.split(","):
utils.logger.info(f"[WeiboCrawler.search] Current search keyword: {keyword}") utils.logger.info(f"[WeiboCrawler.search] Current search keyword: {keyword}")
page = 1 page = 1
while (page - start_page + 1) * weibo_limit_count <= config.CRAWLER_MAX_NOTES_COUNT: while (page - start_page + 1) * weibo_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
@ -274,7 +264,7 @@ class WeiboCrawler(AbstractCrawler):
utils.logger.info("[WeiboCrawler.launch_browser] Begin create browser context ...") utils.logger.info("[WeiboCrawler.launch_browser] Begin create browser context ...")
if config.SAVE_LOGIN_STATE: if config.SAVE_LOGIN_STATE:
user_data_dir = os.path.join(os.getcwd(), "browser_data", user_data_dir = os.path.join(os.getcwd(), "browser_data",
config.USER_DATA_DIR % self.platform) # type: ignore config.USER_DATA_DIR % config.PLATFORM) # type: ignore
browser_context = await chromium.launch_persistent_context( browser_context = await chromium.launch_persistent_context(
user_data_dir=user_data_dir, user_data_dir=user_data_dir,
accept_downloads=True, accept_downloads=True,

View File

@ -24,7 +24,7 @@ class WeiboLogin(AbstractLogin):
login_phone: Optional[str] = "", login_phone: Optional[str] = "",
cookie_str: str = "" cookie_str: str = ""
): ):
self.login_type = login_type config.LOGIN_TYPE = login_type
self.browser_context = browser_context self.browser_context = browser_context
self.context_page = context_page self.context_page = context_page
self.login_phone = login_phone self.login_phone = login_phone
@ -33,11 +33,11 @@ class WeiboLogin(AbstractLogin):
async def begin(self): async def begin(self):
"""Start login weibo""" """Start login weibo"""
utils.logger.info("[WeiboLogin.begin] Begin login weibo ...") utils.logger.info("[WeiboLogin.begin] Begin login weibo ...")
if self.login_type == "qrcode": if config.LOGIN_TYPE == "qrcode":
await self.login_by_qrcode() await self.login_by_qrcode()
elif self.login_type == "phone": elif config.LOGIN_TYPE == "phone":
await self.login_by_mobile() await self.login_by_mobile()
elif self.login_type == "cookie": elif config.LOGIN_TYPE == "cookie":
await self.login_by_cookies() await self.login_by_cookies()
else: else:
raise ValueError( raise ValueError(

View File

@ -21,9 +21,6 @@ from .login import XiaoHongShuLogin
class XiaoHongShuCrawler(AbstractCrawler): class XiaoHongShuCrawler(AbstractCrawler):
platform: str
login_type: str
crawler_type: str
context_page: Page context_page: Page
xhs_client: XiaoHongShuClient xhs_client: XiaoHongShuClient
browser_context: BrowserContext browser_context: BrowserContext
@ -32,13 +29,6 @@ class XiaoHongShuCrawler(AbstractCrawler):
self.index_url = "https://www.xiaohongshu.com" self.index_url = "https://www.xiaohongshu.com"
self.user_agent = utils.get_user_agent() self.user_agent = utils.get_user_agent()
def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str) -> None:
self.platform = platform
self.login_type = login_type
self.crawler_type = crawler_type
self.start_page = start_page
self.keyword = keyword
async def start(self) -> None: async def start(self) -> None:
playwright_proxy_format, httpx_proxy_format = None, None playwright_proxy_format, httpx_proxy_format = None, None
if config.ENABLE_IP_PROXY: if config.ENABLE_IP_PROXY:
@ -71,7 +61,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
self.xhs_client = await self.create_xhs_client(httpx_proxy_format) self.xhs_client = await self.create_xhs_client(httpx_proxy_format)
if not await self.xhs_client.pong(): if not await self.xhs_client.pong():
login_obj = XiaoHongShuLogin( login_obj = XiaoHongShuLogin(
login_type=self.login_type, login_type=config.LOGIN_TYPE,
login_phone="", # input your phone number login_phone="", # input your phone number
browser_context=self.browser_context, browser_context=self.browser_context,
context_page=self.context_page, context_page=self.context_page,
@ -80,14 +70,14 @@ class XiaoHongShuCrawler(AbstractCrawler):
await login_obj.begin() await login_obj.begin()
await self.xhs_client.update_cookies(browser_context=self.browser_context) await self.xhs_client.update_cookies(browser_context=self.browser_context)
crawler_type_var.set(self.crawler_type) crawler_type_var.set(config.CRAWLER_TYPE)
if self.crawler_type == "search": if config.CRAWLER_TYPE == "search":
# Search for notes and retrieve their comment information. # Search for notes and retrieve their comment information.
await self.search() await self.search()
elif self.crawler_type == "detail": elif config.CRAWLER_TYPE == "detail":
# Get the information and comments of the specified post # Get the information and comments of the specified post
await self.get_specified_notes() await self.get_specified_notes()
elif self.crawler_type == "creator": elif config.CRAWLER_TYPE == "creator":
# Get creator's information and their notes and comments # Get creator's information and their notes and comments
await self.get_creators_and_notes() await self.get_creators_and_notes()
else: else:
@ -101,8 +91,8 @@ class XiaoHongShuCrawler(AbstractCrawler):
xhs_limit_count = 20 # xhs limit page fixed value xhs_limit_count = 20 # xhs limit page fixed value
if config.CRAWLER_MAX_NOTES_COUNT < xhs_limit_count: if config.CRAWLER_MAX_NOTES_COUNT < xhs_limit_count:
config.CRAWLER_MAX_NOTES_COUNT = xhs_limit_count config.CRAWLER_MAX_NOTES_COUNT = xhs_limit_count
start_page = self.start_page start_page = config.START_PAGE
for keyword in self.keyword.split(","): for keyword in config.KEYWORDS.split(","):
utils.logger.info(f"[XiaoHongShuCrawler.search] Current search keyword: {keyword}") utils.logger.info(f"[XiaoHongShuCrawler.search] Current search keyword: {keyword}")
page = 1 page = 1
while (page - start_page + 1) * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT: while (page - start_page + 1) * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
@ -264,7 +254,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
# feat issue #14 # feat issue #14
# we will save login state to avoid login every time # we will save login state to avoid login every time
user_data_dir = os.path.join(os.getcwd(), "browser_data", user_data_dir = os.path.join(os.getcwd(), "browser_data",
config.USER_DATA_DIR % self.platform) # type: ignore config.USER_DATA_DIR % config.PLATFORM) # type: ignore
browser_context = await chromium.launch_persistent_context( browser_context = await chromium.launch_persistent_context(
user_data_dir=user_data_dir, user_data_dir=user_data_dir,
accept_downloads=True, accept_downloads=True,

View File

@ -22,7 +22,7 @@ class XiaoHongShuLogin(AbstractLogin):
login_phone: Optional[str] = "", login_phone: Optional[str] = "",
cookie_str: str = "" cookie_str: str = ""
): ):
self.login_type = login_type config.LOGIN_TYPE = login_type
self.browser_context = browser_context self.browser_context = browser_context
self.context_page = context_page self.context_page = context_page
self.login_phone = login_phone self.login_phone = login_phone
@ -49,11 +49,11 @@ class XiaoHongShuLogin(AbstractLogin):
async def begin(self): async def begin(self):
"""Start login xiaohongshu""" """Start login xiaohongshu"""
utils.logger.info("[XiaoHongShuLogin.begin] Begin login xiaohongshu ...") utils.logger.info("[XiaoHongShuLogin.begin] Begin login xiaohongshu ...")
if self.login_type == "qrcode": if config.LOGIN_TYPE == "qrcode":
await self.login_by_qrcode() await self.login_by_qrcode()
elif self.login_type == "phone": elif config.LOGIN_TYPE == "phone":
await self.login_by_mobile() await self.login_by_mobile()
elif self.login_type == "cookie": elif config.LOGIN_TYPE == "cookie":
await self.login_by_cookies() await self.login_by_cookies()
else: else:
raise ValueError("[XiaoHongShuLogin.begin]I nvalid Login Type Currently only supported qrcode or phone or cookies ...") raise ValueError("[XiaoHongShuLogin.begin]I nvalid Login Type Currently only supported qrcode or phone or cookies ...")