diff --git a/.gitignore b/.gitignore
index 24210d2..1824b20 100644
--- a/.gitignore
+++ b/.gitignore
@@ -166,4 +166,5 @@ cython_debug/
 /browser_data/
 /data/
-*/.DS_Store
\ No newline at end of file
+*/.DS_Store
+.vscode
\ No newline at end of file
diff --git a/README.md b/README.md
index 3931c32..9ad0002 100644
--- a/README.md
+++ b/README.md
@@ -17,15 +17,17 @@
 
 ## 功能列表
 
 > 下面不支持的项目,相关的代码架构已经搭建好,只需要实现对应的方法即可,欢迎大家提交PR
+
 | 平台 | 关键词搜索 | 指定帖子ID爬取 | 二级评论 | 指定创作者主页 | 登录态缓存 | IP代理池 | 生成评论词云图 |
 |-----|-------|----------|-----|--------|-------|-------|-------|
 | 小红书 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
 | 抖音 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
 | 快手 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ |
-| B 站 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ |
+| B 站 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
 | 微博 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ |
+
 
 ## 使用方法
 
 ### 创建并激活 python 虚拟环境
diff --git a/base/base_crawler.py b/base/base_crawler.py
index fa59055..2b48643 100644
--- a/base/base_crawler.py
+++ b/base/base_crawler.py
@@ -5,10 +5,6 @@ from playwright.async_api import BrowserContext, BrowserType
 
 
 class AbstractCrawler(ABC):
-    @abstractmethod
-    def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
-        pass
-
     @abstractmethod
     async def start(self):
         pass
diff --git a/cmd_arg/__init__.py b/cmd_arg/__init__.py
new file mode 100644
index 0000000..c5f2180
--- /dev/null
+++ b/cmd_arg/__init__.py
@@ -0,0 +1 @@
+from .arg import *
diff --git a/cmd_arg/arg.py b/cmd_arg/arg.py
new file mode 100644
index 0000000..2d07675
--- /dev/null
+++ b/cmd_arg/arg.py
@@ -0,0 +1,39 @@
+import argparse
+import config
+from tools.utils import str2bool
+
+
+async def parse_cmd():
+    # 读取command arg
+    parser = argparse.ArgumentParser(description='Media crawler program.')
+    parser.add_argument('--platform', type=str, help='Media platform select (xhs | dy | ks | bili | wb)',
+                        choices=["xhs", "dy", "ks", "bili", "wb"], default=config.PLATFORM)
+    parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)',
+                        choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE)
+    parser.add_argument('--type', type=str, help='crawler type (search | detail | creator)',
+                        choices=["search", "detail", "creator"], default=config.CRAWLER_TYPE)
+    parser.add_argument('--start', type=int,
+                        help='number of start page', default=config.START_PAGE)
+    parser.add_argument('--keywords', type=str,
+                        help='please input keywords', default=config.KEYWORDS)
+    parser.add_argument('--get_comment', type=str2bool,
+                        help='''whether to crawl level one comment, supported values case insensitive ('yes', 'true', 't', 'y', '1', 'no', 'false', 'f', 'n', '0')''', default=config.ENABLE_GET_COMMENTS)
+    parser.add_argument('--get_sub_comment', type=str2bool,
+                        help='''whether to crawl level two comment, supported values case insensitive ('yes', 'true', 't', 'y', '1', 'no', 'false', 'f', 'n', '0')''', default=config.ENABLE_GET_SUB_COMMENTS)
+    parser.add_argument('--save_data_option', type=str,
+                        help='where to save the data (csv or db or json)', choices=['csv', 'db', 'json'], default=config.SAVE_DATA_OPTION)
+    parser.add_argument('--cookies', type=str,
+                        help='cookies used for cookie login type', default=config.COOKIES)
+
+    args = parser.parse_args()
+
+    # override config
+    config.PLATFORM = args.platform
+    config.LOGIN_TYPE = args.lt
+    config.CRAWLER_TYPE = args.type
+    config.START_PAGE = args.start
+    config.KEYWORDS = args.keywords
+    config.ENABLE_GET_COMMENTS = args.get_comment
+    config.ENABLE_GET_SUB_COMMENTS = args.get_sub_comment
+    config.SAVE_DATA_OPTION = args.save_data_option
+    config.COOKIES = args.cookies
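Note on the pattern `cmd_arg.parse_cmd` relies on: every flag defaults to the value already in `config`, so an omitted flag keeps the file-based setting and a supplied flag overrides it process-wide. A minimal self-contained sketch of the same shape (the `--verbose` flag and `DEFAULT_VERBOSE` constant are illustrative, not part of this change):

```python
import argparse

DEFAULT_VERBOSE = False  # stands in for a value from the config module


def str2bool(v):
    # same contract as the str2bool added to tools.utils below
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    if v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')


parser = argparse.ArgumentParser()
# defaulting to the config value means "flag omitted" == "keep config as-is"
parser.add_argument('--verbose', type=str2bool, default=DEFAULT_VERBOSE)

assert parser.parse_args([]).verbose is False                   # config preserved
assert parser.parse_args(['--verbose', 'yes']).verbose is True  # overridden
```

With this shape, `python main.py --platform bili --lt qrcode --get_comment yes` should override `config.PLATFORM`, `config.LOGIN_TYPE`, and `config.ENABLE_GET_COMMENTS` in one pass while leaving every other setting at its configured value.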
diff --git a/config/base_config.py b/config/base_config.py
index 4cd9844..9b52e52 100644
--- a/config/base_config.py
+++ b/config/base_config.py
@@ -3,7 +3,8 @@ PLATFORM = "xhs"
 KEYWORDS = "python,golang"
 LOGIN_TYPE = "qrcode"  # qrcode or phone or cookie
 COOKIES = ""
-SORT_TYPE = "popularity_descending"  # 具体值参见media_platform.xxx.field下的枚举值,展示只支持小红书
+# 具体值参见media_platform.xxx.field下的枚举值,展示只支持小红书
+SORT_TYPE = "popularity_descending"
 CRAWLER_TYPE = "search"  # 爬取类型,search(关键词搜索) | detail(帖子详情)| creator(创作者主页数据)
 
 # 是否开启 IP 代理
@@ -45,9 +46,9 @@ ENABLE_GET_IMAGES = False
 # 是否开启爬评论模式, 默认不开启爬评论
 ENABLE_GET_COMMENTS = False
 
-# 是否开启爬二级评论模式, 默认不开启爬二级评论, 目前仅支持 xhs
+# 是否开启爬二级评论模式, 默认不开启爬二级评论, 目前仅支持 xhs, bilibili
 # 老版本项目使用了 db, 则需参考 schema/tables.sql line 287 增加表字段
-ENABLE_GET_SUB_COMMENTS = True
+ENABLE_GET_SUB_COMMENTS = False
 
 # 指定小红书需要爬虫的笔记ID列表
 XHS_SPECIFIED_ID_LIST = [
@@ -96,6 +97,12 @@ DY_CREATOR_ID_LIST = [
     # ........................
 ]
 
+# 指定bili创作者ID列表(sec_id)
+BILI_CREATOR_ID_LIST = [
+    "20813884",
+    # ........................
+]
+
 #词云相关
 #是否开启生成评论词云图
 ENABLE_GET_WORDCLOUD = False
@@ -110,4 +117,6 @@ CUSTOM_WORDS = {
 
 STOP_WORDS_FILE = "./docs/hit_stopwords.txt"
 #中文字体文件路径
-FONT_PATH= "./docs/STZHONGS.TTF"
\ No newline at end of file
+FONT_PATH= "./docs/STZHONGS.TTF"
+
+
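The override-then-read flow between `cmd_arg`, `config`, and the crawlers works because Python caches imported modules in `sys.modules`: every `import config` returns the same module object, so an attribute assigned in `parse_cmd` is seen by every later reader. A small demonstration of that property (the `fake_config` module is fabricated for the example):

```python
import sys
import types

# build a throwaway module and register it, as if it were config.py
fake_config = types.ModuleType("fake_config")
fake_config.PLATFORM = "xhs"
sys.modules["fake_config"] = fake_config

import fake_config as seen_by_cmd_arg   # both imports resolve to the
import fake_config as seen_by_crawler   # same cached module object

seen_by_cmd_arg.PLATFORM = "bili"          # the parse_cmd-style override
assert seen_by_crawler.PLATFORM == "bili"  # visible to every importer
```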
diff --git a/main.py b/main.py
index 9a97494..27d84ad 100644
--- a/main.py
+++ b/main.py
@@ -1,7 +1,7 @@
-import argparse
 import asyncio
 import sys
 
+import cmd_arg
 import config
 import db
 from base.base_crawler import AbstractCrawler
@@ -28,34 +28,15 @@ class CrawlerFactory:
             raise ValueError("Invalid Media Platform Currently only supported xhs or dy or ks or bili ...")
         return crawler_class()
 
-
 async def main():
-    # define command line params ...
-    parser = argparse.ArgumentParser(description='Media crawler program.')
-    parser.add_argument('--platform', type=str, help='Media platform select (xhs | dy | ks | bili | wb)',
-                        choices=["xhs", "dy", "ks", "bili", "wb"], default=config.PLATFORM)
-    parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)',
-                        choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE)
-    parser.add_argument('--type', type=str, help='crawler type (search | detail | creator)',
-                        choices=["search", "detail", "creator"], default=config.CRAWLER_TYPE)
-    parser.add_argument('--start', type=int, help='crawler type (number of start page)',
-                        default=config.START_PAGE)
-    parser.add_argument('--keywords', type=str, help='crawler type (please input keywords)',
-                        default=config.KEYWORDS)
-
+    # parse cmd
+    await cmd_arg.parse_cmd()
+
     # init db
     if config.SAVE_DATA_OPTION == "db":
         await db.init_db()
 
-    args = parser.parse_args()
-    crawler = CrawlerFactory.create_crawler(platform=args.platform)
-    crawler.init_config(
-        platform=args.platform,
-        login_type=args.lt,
-        crawler_type=args.type,
-        start_page=args.start,
-        keyword=args.keywords
-    )
+    crawler = CrawlerFactory.create_crawler(platform=config.PLATFORM)
     await crawler.start()
 
     if config.SAVE_DATA_OPTION == "db":
diff --git a/media_platform/bilibili/client.py b/media_platform/bilibili/client.py
index c97f110..5c13e03 100644
--- a/media_platform/bilibili/client.py
+++ b/media_platform/bilibili/client.py
@@ -208,7 +208,6 @@ class BilibiliClient(AbstractApiClient):
             if not is_fetch_sub_comments:
                 result.extend(comment_list)
                 continue
-            # todo handle get sub comments
         return result
 
     async def get_video_all_level_two_comments(self,
@@ -230,15 +229,15 @@ class BilibiliClient(AbstractApiClient):
 
         :return:
         """
 
-        pn = 0
+        pn = 1
         while True:
             result = await self.get_video_level_two_comments(
-                video_id, level_one_comment_id, 0, ps, order_mode)
+                video_id, level_one_comment_id, pn, ps, order_mode)
             comment_list: List[Dict] = result.get("replies", [])
             if callback:  # 如果有回调函数,就执行回调函数
                 await callback(video_id, comment_list)
             await asyncio.sleep(crawl_interval)
-            if (int(result["page"]["count"]) <= (pn+1) * ps):
+            if (int(result["page"]["count"]) <= pn * ps):
                 break
             pn += 1
 
@@ -268,3 +267,21 @@ class BilibiliClient(AbstractApiClient):
         }
         result = await self.get(uri, post_data)
         return result
+
+    async def get_creator_videos(self, creator_id: str, pn: int, ps: int = 30, order_mode: SearchOrderType = SearchOrderType.LAST_PUBLISH) -> Dict:
+        """get all videos for a creator
+        :param creator_id: 创作者 ID
+        :param pn: 页数
+        :param ps: 一页视频数
+        :param order_mode: 排序方式
+
+        :return:
+        """
+        uri = "/x/space/wbi/arc/search"
+        post_data = {
+            "mid": creator_id,
+            "pn": pn,
+            "ps": ps,
+            "order": order_mode,
+        }
+        return await self.get(uri, post_data)
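The `get_video_all_level_two_comments` change above fixes two related bugs: the request always passed the literal `0` instead of `pn`, so every iteration refetched the first page of replies, and the 0-based counter made the `(pn+1) * ps` termination test awkward against Bilibili's 1-based paging. The corrected loop shape, extracted into a runnable sketch against a fake 25-item endpoint:

```python
import asyncio


async def drain_pages(fetch_page, ps):
    # mirrors the fixed loop: 1-based pn, request page pn (not a constant),
    # stop once pn * ps covers the reported total count
    pn, out = 1, []
    while True:
        items, count = await fetch_page(pn, ps)
        out.extend(items)
        if count <= pn * ps:
            break
        pn += 1
    return out


async def fake_page(pn, ps):
    data = list(range(25))  # pretend the endpoint holds 25 replies
    return data[(pn - 1) * ps: pn * ps], len(data)


assert asyncio.run(drain_pages(fake_page, ps=10)) == list(range(25))
```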
-        self.start_page = start_page
-        self.keyword = keyword
-
     async def start(self):
         playwright_proxy_format, httpx_proxy_format = None, None
         if config.ENABLE_IP_PROXY:
@@ -70,7 +60,7 @@ class BilibiliCrawler(AbstractCrawler):
             self.bili_client = await self.create_bilibili_client(httpx_proxy_format)
             if not await self.bili_client.pong():
                 login_obj = BilibiliLogin(
-                    login_type=self.login_type,
+                    login_type=config.LOGIN_TYPE,
                     login_phone="",  # your phone number
                     browser_context=self.browser_context,
                     context_page=self.context_page,
@@ -79,13 +69,16 @@ class BilibiliCrawler(AbstractCrawler):
                 await login_obj.begin()
                 await self.bili_client.update_cookies(browser_context=self.browser_context)
 
-            crawler_type_var.set(self.crawler_type)
-            if self.crawler_type == "search":
+            crawler_type_var.set(config.CRAWLER_TYPE)
+            if config.CRAWLER_TYPE == "search":
                 # Search for video and retrieve their comment information.
                 await self.search()
-            elif self.crawler_type == "detail":
+            elif config.CRAWLER_TYPE == "detail":
                 # Get the information and comments of the specified post
-                await self.get_specified_videos()
+                await self.get_specified_videos(config.BILI_SPECIFIED_ID_LIST)
+            elif config.CRAWLER_TYPE == "creator":
+                for creator_id in config.BILI_CREATOR_ID_LIST:
+                    await self.get_creator_videos(int(creator_id))
             else:
                 pass
             utils.logger.info(
@@ -101,8 +94,8 @@ class BilibiliCrawler(AbstractCrawler):
         bili_limit_count = 20  # bilibili limit page fixed value
         if config.CRAWLER_MAX_NOTES_COUNT < bili_limit_count:
             config.CRAWLER_MAX_NOTES_COUNT = bili_limit_count
-        start_page = self.start_page  # start page number
-        for keyword in self.keyword.split(","):
+        start_page = config.START_PAGE  # start page number
+        for keyword in config.KEYWORDS.split(","):
             utils.logger.info(
                 f"[BilibiliCrawler.search] Current search keyword: {keyword}")
             page = 1
@@ -183,7 +176,25 @@ class BilibiliCrawler(AbstractCrawler):
                 utils.logger.error(
                     f"[BilibiliCrawler.get_comments] may be been blocked, err:{e}")
 
-    async def get_specified_videos(self):
+    async def get_creator_videos(self, creator_id: int):
+        """
+        get videos for a creator
+        :return:
+        """
+        ps = 30
+        pn = 1
+        video_bvids_list = []
+        while True:
+            result = await self.bili_client.get_creator_videos(creator_id, pn, ps)
+            for video in result["list"]["vlist"]:
+                video_bvids_list.append(video["bvid"])
+            if (int(result["page"]["count"]) <= pn * ps):
+                break
+            await asyncio.sleep(random.random())
+            pn += 1
+        await self.get_specified_videos(video_bvids_list)
+
+    async def get_specified_videos(self, bvids_list: List[str]):
         """
         get specified videos info
         :return:
@@ -191,7 +202,7 @@ class BilibiliCrawler(AbstractCrawler):
         semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
         task_list = [
             self.get_video_info_task(aid=0, bvid=video_id, semaphore=semaphore) for video_id in
-            config.BILI_SPECIFIED_ID_LIST
+            bvids_list
         ]
         video_details = await asyncio.gather(*task_list)
         video_aids_list = []
@@ -271,7 +282,7 @@ class BilibiliCrawler(AbstractCrawler):
             # feat issue #14
             # we will save login state to avoid login every time
             user_data_dir = os.path.join(os.getcwd(), "browser_data",
-                                         config.USER_DATA_DIR % self.platform)  # type: ignore
+                                         config.USER_DATA_DIR % config.PLATFORM)  # type: ignore
             browser_context = await chromium.launch_persistent_context(
                 user_data_dir=user_data_dir,
                 accept_downloads=True,
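The new creator flow is two stages: page through `get_creator_videos` to collect bvids, then hand the whole list to the existing `get_specified_videos` pipeline, which fans detail requests out under a semaphore. That gather-with-semaphore shape in miniature (the sleep stands in for the real API call; the bound mirrors `config.MAX_CONCURRENCY_NUM`):

```python
import asyncio

MAX_CONCURRENCY_NUM = 4  # stand-in for config.MAX_CONCURRENCY_NUM


async def fetch_detail(bvid: str, semaphore: asyncio.Semaphore) -> dict:
    async with semaphore:          # at most 4 requests in flight at once
        await asyncio.sleep(0.01)  # placeholder for the real HTTP request
        return {"bvid": bvid}


async def main():
    semaphore = asyncio.Semaphore(MAX_CONCURRENCY_NUM)
    tasks = [fetch_detail(f"BV{i}", semaphore) for i in range(10)]
    return await asyncio.gather(*tasks)


print(len(asyncio.run(main())))  # 10 details, fetched 4 at a time
```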
diff --git a/media_platform/bilibili/login.py b/media_platform/bilibili/login.py
index c0f7398..33c929f 100644
--- a/media_platform/bilibili/login.py
+++ b/media_platform/bilibili/login.py
@@ -13,6 +13,7 @@
 from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
                       wait_fixed)
 
 from base.base_crawler import AbstractLogin
+import config
 from tools import utils
@@ -24,7 +25,7 @@ class BilibiliLogin(AbstractLogin):
                  login_phone: Optional[str] = "",
                  cookie_str: str = ""
                  ):
-        self.login_type = login_type
+        config.LOGIN_TYPE = login_type
         self.browser_context = browser_context
         self.context_page = context_page
         self.login_phone = login_phone
@@ -33,11 +34,11 @@ class BilibiliLogin(AbstractLogin):
     async def begin(self):
         """Start login bilibili"""
         utils.logger.info("[BilibiliLogin.begin] Begin login Bilibili ...")
-        if self.login_type == "qrcode":
+        if config.LOGIN_TYPE == "qrcode":
             await self.login_by_qrcode()
-        elif self.login_type == "phone":
+        elif config.LOGIN_TYPE == "phone":
             await self.login_by_mobile()
-        elif self.login_type == "cookie":
+        elif config.LOGIN_TYPE == "cookie":
             await self.login_by_cookies()
         else:
             raise ValueError(
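A design note that applies to every `*Login` class touched in this patch: the constructors now write the passed `login_type` through to `config.LOGIN_TYPE` instead of keeping an instance attribute, so `begin()` and the rest of the process read one source of truth, at the cost of making object construction a global side effect. The pattern in miniature (a sketch, not the project's classes):

```python
import types

config = types.SimpleNamespace(LOGIN_TYPE="qrcode")  # stand-in for the config module


class Login:
    def __init__(self, login_type: str):
        # write-through: constructing a Login mutates global state
        config.LOGIN_TYPE = login_type

    def begin(self) -> str:
        # dispatch reads the global, not the instance
        return {"qrcode": "scan", "phone": "sms", "cookie": "reuse"}[config.LOGIN_TYPE]


assert Login("cookie").begin() == "reuse"
assert config.LOGIN_TYPE == "cookie"  # the side effect outlives the object
```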
diff --git a/media_platform/douyin/core.py b/media_platform/douyin/core.py
index 9823baa..dde5d5b 100644
--- a/media_platform/douyin/core.py
+++ b/media_platform/douyin/core.py
@@ -21,27 +21,14 @@ from .login import DouYinLogin
 
 
 class DouYinCrawler(AbstractCrawler):
-    platform: str
-    login_type: str
-    crawler_type: str
     context_page: Page
     dy_client: DOUYINClient
    browser_context: BrowserContext
-    start_page: int
-    keyword: str
 
     def __init__(self) -> None:
-        self.start_page = None
         self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"  # fixed
         self.index_url = "https://www.douyin.com"
 
-    def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str) -> None:
-        self.platform = platform
-        self.login_type = login_type
-        self.crawler_type = crawler_type
-        self.start_page = start_page
-        self.keyword = keyword
-
     async def start(self) -> None:
         playwright_proxy_format, httpx_proxy_format = None, None
         if config.ENABLE_IP_PROXY:
@@ -66,7 +53,7 @@ class DouYinCrawler(AbstractCrawler):
             self.dy_client = await self.create_douyin_client(httpx_proxy_format)
             if not await self.dy_client.pong(browser_context=self.browser_context):
                 login_obj = DouYinLogin(
-                    login_type=self.login_type,
+                    login_type=config.LOGIN_TYPE,
                     login_phone="",  # you phone number
                     browser_context=self.browser_context,
                     context_page=self.context_page,
@@ -74,14 +61,14 @@ class DouYinCrawler(AbstractCrawler):
                 )
                 await login_obj.begin()
                 await self.dy_client.update_cookies(browser_context=self.browser_context)
-            crawler_type_var.set(self.crawler_type)
-            if self.crawler_type == "search":
+            crawler_type_var.set(config.CRAWLER_TYPE)
+            if config.CRAWLER_TYPE == "search":
                 # Search for notes and retrieve their comment information.
                 await self.search()
-            elif self.crawler_type == "detail":
+            elif config.CRAWLER_TYPE == "detail":
                 # Get the information and comments of the specified post
                 await self.get_specified_awemes()
-            elif self.crawler_type == "creator":
+            elif config.CRAWLER_TYPE == "creator":
                 # Get the information and comments of the specified creator
                 await self.get_creators_and_videos()
 
@@ -92,8 +79,8 @@ class DouYinCrawler(AbstractCrawler):
         dy_limit_count = 10  # douyin limit page fixed value
         if config.CRAWLER_MAX_NOTES_COUNT < dy_limit_count:
             config.CRAWLER_MAX_NOTES_COUNT = dy_limit_count
-        start_page = self.start_page  # start page number
-        for keyword in self.keyword.split(","):
+        start_page = config.START_PAGE  # start page number
+        for keyword in config.KEYWORDS.split(","):
             utils.logger.info(f"[DouYinCrawler.search] Current keyword: {keyword}")
             aweme_list: List[str] = []
             page = 0
@@ -259,7 +246,7 @@ class DouYinCrawler(AbstractCrawler):
         """Launch browser and create browser context"""
         if config.SAVE_LOGIN_STATE:
             user_data_dir = os.path.join(os.getcwd(), "browser_data",
-                                         config.USER_DATA_DIR % self.platform)  # type: ignore
+                                         config.USER_DATA_DIR % config.PLATFORM)  # type: ignore
             browser_context = await chromium.launch_persistent_context(
                 user_data_dir=user_data_dir,
                 accept_downloads=True,
diff --git a/media_platform/douyin/login.py b/media_platform/douyin/login.py
index f0b3c23..b46cf04 100644
--- a/media_platform/douyin/login.py
+++ b/media_platform/douyin/login.py
@@ -23,7 +23,7 @@ class DouYinLogin(AbstractLogin):
                  login_phone: Optional[str] = "",
                  cookie_str: Optional[str] = ""
                  ):
-        self.login_type = login_type
+        config.LOGIN_TYPE = login_type
         self.browser_context = browser_context
         self.context_page = context_page
         self.login_phone = login_phone
@@ -40,11 +40,11 @@ class DouYinLogin(AbstractLogin):
         await self.popup_login_dialog()
 
         # select login type
-        if self.login_type == "qrcode":
+        if config.LOGIN_TYPE == "qrcode":
             await self.login_by_qrcode()
-        elif self.login_type == "phone":
+        elif config.LOGIN_TYPE == "phone":
             await self.login_by_mobile()
-        elif self.login_type == "cookie":
+        elif config.LOGIN_TYPE == "cookie":
             await self.login_by_cookies()
         else:
             raise ValueError("[DouYinLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")
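All of the `search()` methods share the paging budget test `(page - start_page + 1) * limit <= config.CRAWLER_MAX_NOTES_COUNT`, i.e. the note budget is counted from `START_PAGE`, not from page 1. A worked sketch, assuming pages below `start_page` are skipped without being fetched (which is how the surrounding loops read):

```python
def pages_to_fetch(start_page: int, limit: int, max_notes: int) -> list:
    # collect page numbers until the per-run note budget is exhausted
    page, fetched = start_page, []
    while (page - start_page + 1) * limit <= max_notes:
        fetched.append(page)
        page += 1
    return fetched


# with a 20-per-page platform and CRAWLER_MAX_NOTES_COUNT = 60,
# starting at page 3 still yields three pages: 3, 4 and 5
assert pages_to_fetch(start_page=3, limit=20, max_notes=60) == [3, 4, 5]
```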
diff --git a/media_platform/kuaishou/core.py b/media_platform/kuaishou/core.py
index df39374..d318a9c 100644
--- a/media_platform/kuaishou/core.py
+++ b/media_platform/kuaishou/core.py
@@ -21,9 +21,6 @@ from .login import KuaishouLogin
 
 
 class KuaishouCrawler(AbstractCrawler):
-    platform: str
-    login_type: str
-    crawler_type: str
     context_page: Page
     ks_client: KuaiShouClient
     browser_context: BrowserContext
@@ -32,13 +29,6 @@ class KuaishouCrawler(AbstractCrawler):
         self.index_url = "https://www.kuaishou.com"
         self.user_agent = utils.get_user_agent()
 
-    def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
-        self.platform = platform
-        self.login_type = login_type
-        self.crawler_type = crawler_type
-        self.start_page = start_page
-        self.keyword = keyword
-
     async def start(self):
         playwright_proxy_format, httpx_proxy_format = None, None
         if config.ENABLE_IP_PROXY:
@@ -64,7 +54,7 @@ class KuaishouCrawler(AbstractCrawler):
             self.ks_client = await self.create_ks_client(httpx_proxy_format)
             if not await self.ks_client.pong():
                 login_obj = KuaishouLogin(
-                    login_type=self.login_type,
+                    login_type=config.LOGIN_TYPE,
                     login_phone=httpx_proxy_format,
                     browser_context=self.browser_context,
                     context_page=self.context_page,
@@ -73,11 +63,11 @@ class KuaishouCrawler(AbstractCrawler):
                 await login_obj.begin()
                 await self.ks_client.update_cookies(browser_context=self.browser_context)
 
-            crawler_type_var.set(self.crawler_type)
-            if self.crawler_type == "search":
+            crawler_type_var.set(config.CRAWLER_TYPE)
+            if config.CRAWLER_TYPE == "search":
                 # Search for notes and retrieve their comment information.
                 await self.search()
-            elif self.crawler_type == "detail":
+            elif config.CRAWLER_TYPE == "detail":
                 # Get the information and comments of the specified post
                 await self.get_specified_videos()
             else:
@@ -90,8 +80,8 @@ class KuaishouCrawler(AbstractCrawler):
         ks_limit_count = 20  # kuaishou limit page fixed value
         if config.CRAWLER_MAX_NOTES_COUNT < ks_limit_count:
             config.CRAWLER_MAX_NOTES_COUNT = ks_limit_count
-        start_page = self.start_page
-        for keyword in self.keyword.split(","):
+        start_page = config.START_PAGE
+        for keyword in config.KEYWORDS.split(","):
             utils.logger.info(f"[KuaishouCrawler.search] Current search keyword: {keyword}")
             page = 1
             while (page - start_page + 1) * ks_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
@@ -238,7 +228,7 @@ class KuaishouCrawler(AbstractCrawler):
         utils.logger.info("[KuaishouCrawler.launch_browser] Begin create browser context ...")
         if config.SAVE_LOGIN_STATE:
             user_data_dir = os.path.join(os.getcwd(), "browser_data",
-                                         config.USER_DATA_DIR % self.platform)  # type: ignore
+                                         config.USER_DATA_DIR % config.PLATFORM)  # type: ignore
             browser_context = await chromium.launch_persistent_context(
                 user_data_dir=user_data_dir,
                 accept_downloads=True,
diff --git a/media_platform/kuaishou/login.py b/media_platform/kuaishou/login.py
index 54a9e38..cbd578b 100644
--- a/media_platform/kuaishou/login.py
+++ b/media_platform/kuaishou/login.py
@@ -19,7 +19,7 @@ class KuaishouLogin(AbstractLogin):
                  login_phone: Optional[str] = "",
                  cookie_str: str = ""
                  ):
-        self.login_type = login_type
+        config.LOGIN_TYPE = login_type
         self.browser_context = browser_context
         self.context_page = context_page
         self.login_phone = login_phone
@@ -28,11 +28,11 @@ class KuaishouLogin(AbstractLogin):
     async def begin(self):
         """Start login kuaishou"""
         utils.logger.info("[KuaishouLogin.begin] Begin login kuaishou ...")
-        if self.login_type == "qrcode":
+        if config.LOGIN_TYPE == "qrcode":
             await self.login_by_qrcode()
-        elif self.login_type == "phone":
+        elif config.LOGIN_TYPE == "phone":
             await self.login_by_mobile()
-        elif self.login_type == "cookie":
+        elif config.LOGIN_TYPE == "cookie":
             await self.login_by_cookies()
         else:
             raise ValueError("[KuaishouLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")
diff --git a/media_platform/weibo/core.py b/media_platform/weibo/core.py
index 50b6b6c..481287e 100644
--- a/media_platform/weibo/core.py
+++ b/media_platform/weibo/core.py
@@ -28,9 +28,6 @@ from .login import WeiboLogin
 
 
 class WeiboCrawler(AbstractCrawler):
-    platform: str
-    login_type: str
-    crawler_type: str
     context_page: Page
     wb_client: WeiboClient
     browser_context: BrowserContext
@@ -41,13 +38,6 @@ class WeiboCrawler(AbstractCrawler):
         self.user_agent = utils.get_user_agent()
         self.mobile_user_agent = utils.get_mobile_user_agent()
 
-    def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
-        self.platform = platform
-        self.login_type = login_type
-        self.crawler_type = crawler_type
-        self.start_page = start_page
-        self.keyword = keyword
-
     async def start(self):
         playwright_proxy_format, httpx_proxy_format = None, None
         if config.ENABLE_IP_PROXY:
@@ -73,7 +63,7 @@ class WeiboCrawler(AbstractCrawler):
             self.wb_client = await self.create_weibo_client(httpx_proxy_format)
             if not await self.wb_client.pong():
                 login_obj = WeiboLogin(
-                    login_type=self.login_type,
+                    login_type=config.LOGIN_TYPE,
                     login_phone="",  # your phone number
                     browser_context=self.browser_context,
                     context_page=self.context_page,
@@ -89,11 +79,11 @@ class WeiboCrawler(AbstractCrawler):
                 await asyncio.sleep(2)
                 await self.wb_client.update_cookies(browser_context=self.browser_context)
 
-            crawler_type_var.set(self.crawler_type)
-            if self.crawler_type == "search":
+            crawler_type_var.set(config.CRAWLER_TYPE)
+            if config.CRAWLER_TYPE == "search":
                 # Search for video and retrieve their comment information.
                 await self.search()
-            elif self.crawler_type == "detail":
+            elif config.CRAWLER_TYPE == "detail":
                 # Get the information and comments of the specified post
                 await self.get_specified_notes()
             else:
@@ -109,8 +99,8 @@ class WeiboCrawler(AbstractCrawler):
         weibo_limit_count = 10  # weibo limit page fixed value
         if config.CRAWLER_MAX_NOTES_COUNT < weibo_limit_count:
             config.CRAWLER_MAX_NOTES_COUNT = weibo_limit_count
-        start_page = self.start_page
-        for keyword in self.keyword.split(","):
+        start_page = config.START_PAGE
+        for keyword in config.KEYWORDS.split(","):
             utils.logger.info(f"[WeiboCrawler.search] Current search keyword: {keyword}")
             page = 1
             while (page - start_page + 1) * weibo_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
@@ -274,7 +264,7 @@ class WeiboCrawler(AbstractCrawler):
         utils.logger.info("[WeiboCrawler.launch_browser] Begin create browser context ...")
         if config.SAVE_LOGIN_STATE:
             user_data_dir = os.path.join(os.getcwd(), "browser_data",
-                                         config.USER_DATA_DIR % self.platform)  # type: ignore
+                                         config.USER_DATA_DIR % config.PLATFORM)  # type: ignore
             browser_context = await chromium.launch_persistent_context(
                 user_data_dir=user_data_dir,
                 accept_downloads=True,
diff --git a/media_platform/weibo/login.py b/media_platform/weibo/login.py
index 929aff8..2c2cf38 100644
--- a/media_platform/weibo/login.py
+++ b/media_platform/weibo/login.py
@@ -24,7 +24,7 @@ class WeiboLogin(AbstractLogin):
                  login_phone: Optional[str] = "",
                  cookie_str: str = ""
                  ):
-        self.login_type = login_type
+        config.LOGIN_TYPE = login_type
         self.browser_context = browser_context
         self.context_page = context_page
         self.login_phone = login_phone
@@ -33,11 +33,11 @@ class WeiboLogin(AbstractLogin):
     async def begin(self):
         """Start login weibo"""
         utils.logger.info("[WeiboLogin.begin] Begin login weibo ...")
-        if self.login_type == "qrcode":
+        if config.LOGIN_TYPE == "qrcode":
             await self.login_by_qrcode()
-        elif self.login_type == "phone":
+        elif config.LOGIN_TYPE == "phone":
             await self.login_by_mobile()
-        elif self.login_type == "cookie":
+        elif config.LOGIN_TYPE == "cookie":
             await self.login_by_cookies()
         else:
             raise ValueError(
diff --git a/media_platform/xhs/core.py b/media_platform/xhs/core.py
index 7603f95..d720b68 100644
--- a/media_platform/xhs/core.py
+++ b/media_platform/xhs/core.py
@@ -21,9 +21,6 @@ from .login import XiaoHongShuLogin
 
 
 class XiaoHongShuCrawler(AbstractCrawler):
-    platform: str
-    login_type: str
-    crawler_type: str
     context_page: Page
     xhs_client: XiaoHongShuClient
     browser_context: BrowserContext
@@ -32,13 +29,6 @@ class XiaoHongShuCrawler(AbstractCrawler):
         self.index_url = "https://www.xiaohongshu.com"
         self.user_agent = utils.get_user_agent()
 
-    def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str) -> None:
-        self.platform = platform
-        self.login_type = login_type
-        self.crawler_type = crawler_type
-        self.start_page = start_page
-        self.keyword = keyword
-
     async def start(self) -> None:
         playwright_proxy_format, httpx_proxy_format = None, None
         if config.ENABLE_IP_PROXY:
@@ -71,7 +61,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
             self.xhs_client = await self.create_xhs_client(httpx_proxy_format)
             if not await self.xhs_client.pong():
                 login_obj = XiaoHongShuLogin(
-                    login_type=self.login_type,
+                    login_type=config.LOGIN_TYPE,
                     login_phone="",  # input your phone number
                     browser_context=self.browser_context,
                     context_page=self.context_page,
@@ -80,14 +70,14 @@ class XiaoHongShuCrawler(AbstractCrawler):
                 await login_obj.begin()
                 await self.xhs_client.update_cookies(browser_context=self.browser_context)
 
-            crawler_type_var.set(self.crawler_type)
-            if self.crawler_type == "search":
+            crawler_type_var.set(config.CRAWLER_TYPE)
+            if config.CRAWLER_TYPE == "search":
                 # Search for notes and retrieve their comment information.
                 await self.search()
-            elif self.crawler_type == "detail":
+            elif config.CRAWLER_TYPE == "detail":
                 # Get the information and comments of the specified post
                 await self.get_specified_notes()
-            elif self.crawler_type == "creator":
+            elif config.CRAWLER_TYPE == "creator":
                 # Get creator's information and their notes and comments
                 await self.get_creators_and_notes()
             else:
@@ -101,8 +91,8 @@ class XiaoHongShuCrawler(AbstractCrawler):
         xhs_limit_count = 20  # xhs limit page fixed value
         if config.CRAWLER_MAX_NOTES_COUNT < xhs_limit_count:
             config.CRAWLER_MAX_NOTES_COUNT = xhs_limit_count
-        start_page = self.start_page
-        for keyword in self.keyword.split(","):
+        start_page = config.START_PAGE
+        for keyword in config.KEYWORDS.split(","):
             utils.logger.info(f"[XiaoHongShuCrawler.search] Current search keyword: {keyword}")
             page = 1
             while (page - start_page + 1) * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
@@ -264,7 +254,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
             # feat issue #14
             # we will save login state to avoid login every time
             user_data_dir = os.path.join(os.getcwd(), "browser_data",
-                                         config.USER_DATA_DIR % self.platform)  # type: ignore
+                                         config.USER_DATA_DIR % config.PLATFORM)  # type: ignore
             browser_context = await chromium.launch_persistent_context(
                 user_data_dir=user_data_dir,
                 accept_downloads=True,
diff --git a/media_platform/xhs/login.py b/media_platform/xhs/login.py
index 07c0ba2..e624a9b 100644
--- a/media_platform/xhs/login.py
+++ b/media_platform/xhs/login.py
@@ -22,7 +22,7 @@ class XiaoHongShuLogin(AbstractLogin):
                  login_phone: Optional[str] = "",
                  cookie_str: str = ""
                  ):
-        self.login_type = login_type
+        config.LOGIN_TYPE = login_type
         self.browser_context = browser_context
         self.context_page = context_page
         self.login_phone = login_phone
@@ -49,11 +49,11 @@ class XiaoHongShuLogin(AbstractLogin):
     async def begin(self):
         """Start login xiaohongshu"""
         utils.logger.info("[XiaoHongShuLogin.begin] Begin login xiaohongshu ...")
-        if self.login_type == "qrcode":
+        if config.LOGIN_TYPE == "qrcode":
             await self.login_by_qrcode()
-        elif self.login_type == "phone":
+        elif config.LOGIN_TYPE == "phone":
             await self.login_by_mobile()
-        elif self.login_type == "cookie":
+        elif config.LOGIN_TYPE == "cookie":
             await self.login_by_cookies()
         else:
             raise ValueError("[XiaoHongShuLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookies ...")
diff --git a/schema/tables.sql b/schema/tables.sql
index fa475e1..2e2825b 100644
--- a/schema/tables.sql
+++ b/schema/tables.sql
@@ -311,4 +311,7 @@ ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
 ALTER TABLE `douyin_aweme_comment`
 ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
 
+ALTER TABLE `bilibili_video_comment`
+ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
+
 SET FOREIGN_KEY_CHECKS = 1;
diff --git a/tools/utils.py b/tools/utils.py
index 7386fd8..572764c 100644
--- a/tools/utils.py
+++ b/tools/utils.py
@@ -1,3 +1,4 @@
+import argparse
 import logging
 
 from .crawler_util import *
@@ -18,3 +19,13 @@ def init_loging_config():
 
 
 logger = init_loging_config()
+
+
+def str2bool(v):
+    if isinstance(v, bool):
+        return v
+    if v.lower() in ('yes', 'true', 't', 'y', '1'):
+        return True
+    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
+        return False
+    else:
+        raise argparse.ArgumentTypeError('Boolean value expected.')
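Finally, a quick sanity check of the new `str2bool` helper, matching the spellings listed in the `--get_comment` help text (run from the repo root so `tools` is importable):

```python
import argparse

from tools.utils import str2bool

for raw in ('yes', 'True', 'T', 'y', '1'):
    assert str2bool(raw) is True   # truthy spellings, case insensitive
for raw in ('no', 'False', 'F', 'n', '0'):
    assert str2bool(raw) is False  # falsy spellings

try:
    str2bool('maybe')
except argparse.ArgumentTypeError as err:
    print(err)  # -> Boolean value expected.
```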