Merge remote-tracking branch 'origin/main'

This commit is contained in:
Rosyrain 2024-06-13 12:21:15 +08:00
commit 78b0d317ff
20 changed files with 180 additions and 151 deletions

1
.gitignore vendored
View File

@ -167,3 +167,4 @@ cython_debug/
/data/ /data/
*/.DS_Store */.DS_Store
.vscode

View File

@ -17,15 +17,17 @@
## 功能列表 ## 功能列表
> 下面不支持的项目相关的代码架构已经搭建好只需要实现对应的方法即可欢迎大家提交PR > 下面不支持的项目相关的代码架构已经搭建好只需要实现对应的方法即可欢迎大家提交PR
| 平台 | 关键词搜索 | 指定帖子ID爬取 | 二级评论 | 指定创作者主页 | 登录态缓存 | IP代理池 | 生成评论词云图 | | 平台 | 关键词搜索 | 指定帖子ID爬取 | 二级评论 | 指定创作者主页 | 登录态缓存 | IP代理池 | 生成评论词云图 |
|-----|-------|----------|-----|--------|-------|-------|-------| |-----|-------|----------|-----|--------|-------|-------|-------|
| 小红书 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | 小红书 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| 抖音 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | 抖音 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| 快手 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | | 快手 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ |
| B 站 | ✅ | ✅ | ✅ | | ✅ | ✅ | ✅ | | B 站 | ✅ | ✅ | ✅ | | ✅ | ✅ | ✅ |
| 微博 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | | 微博 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ |
## 使用方法 ## 使用方法
### 创建并激活 python 虚拟环境 ### 创建并激活 python 虚拟环境

View File

@ -5,10 +5,6 @@ from playwright.async_api import BrowserContext, BrowserType
class AbstractCrawler(ABC): class AbstractCrawler(ABC):
@abstractmethod
def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
pass
@abstractmethod @abstractmethod
async def start(self): async def start(self):
pass pass

1
cmd_arg/__init__.py Normal file
View File

@ -0,0 +1 @@
from .arg import *

39
cmd_arg/arg.py Normal file
View File

@ -0,0 +1,39 @@
import argparse
import config
from tools.utils import str2bool
async def parse_cmd():
    """Parse command line arguments and override the matching values in `config`.

    Every option defaults to the current config value, so a flag only needs to
    be supplied when the caller wants to override the file-based configuration.
    """
    parser = argparse.ArgumentParser(description='Media crawler program.')
    parser.add_argument('--platform', type=str, help='Media platform select (xhs | dy | ks | bili | wb)',
                        choices=["xhs", "dy", "ks", "bili", "wb"], default=config.PLATFORM)
    parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)',
                        choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE)
    parser.add_argument('--type', type=str, help='crawler type (search | detail | creator)',
                        choices=["search", "detail", "creator"], default=config.CRAWLER_TYPE)
    parser.add_argument('--start', type=int,
                        help='number of start page', default=config.START_PAGE)
    parser.add_argument('--keywords', type=str,
                        help='please input keywords', default=config.KEYWORDS)
    parser.add_argument('--get_comment', type=str2bool,
                        help='''whether to crawl level one comment, supported values case insensitive ('yes', 'true', 't', 'y', '1', 'no', 'false', 'f', 'n', '0')''', default=config.ENABLE_GET_COMMENTS)
    # Fix: original help began with '''' (triple quote + stray apostrophe), which
    # made the rendered help text start with a spurious single quote.
    parser.add_argument('--get_sub_comment', type=str2bool,
                        help='''whether to crawl level two comment, supported values case insensitive ('yes', 'true', 't', 'y', '1', 'no', 'false', 'f', 'n', '0')''', default=config.ENABLE_GET_SUB_COMMENTS)
    parser.add_argument('--save_data_option', type=str,
                        help='where to save the data (csv or db or json)', choices=['csv', 'db', 'json'], default=config.SAVE_DATA_OPTION)
    parser.add_argument('--cookies', type=str,
                        help='cookies used for cookie login type', default=config.COOKIES)

    args = parser.parse_args()

    # Command line values take precedence over the config file defaults.
    config.PLATFORM = args.platform
    config.LOGIN_TYPE = args.lt
    config.CRAWLER_TYPE = args.type
    config.START_PAGE = args.start
    config.KEYWORDS = args.keywords
    config.ENABLE_GET_COMMENTS = args.get_comment
    config.ENABLE_GET_SUB_COMMENTS = args.get_sub_comment
    config.SAVE_DATA_OPTION = args.save_data_option
    config.COOKIES = args.cookies

View File

@ -3,7 +3,8 @@ PLATFORM = "xhs"
KEYWORDS = "python,golang" KEYWORDS = "python,golang"
LOGIN_TYPE = "qrcode" # qrcode or phone or cookie LOGIN_TYPE = "qrcode" # qrcode or phone or cookie
COOKIES = "" COOKIES = ""
SORT_TYPE = "popularity_descending" # 具体值参见media_platform.xxx.field下的枚举值展示只支持小红书 # 具体值参见media_platform.xxx.field下的枚举值展示只支持小红书
SORT_TYPE = "popularity_descending"
CRAWLER_TYPE = "search" # 爬取类型search(关键词搜索) | detail(帖子详情)| creator(创作者主页数据) CRAWLER_TYPE = "search" # 爬取类型search(关键词搜索) | detail(帖子详情)| creator(创作者主页数据)
# 是否开启 IP 代理 # 是否开启 IP 代理
@ -45,9 +46,9 @@ ENABLE_GET_IMAGES = False
# 是否开启爬评论模式, 默认不开启爬评论 # 是否开启爬评论模式, 默认不开启爬评论
ENABLE_GET_COMMENTS = False ENABLE_GET_COMMENTS = False
# 是否开启爬二级评论模式, 默认不开启爬二级评论, 目前仅支持 xhs # 是否开启爬二级评论模式, 默认不开启爬二级评论, 目前仅支持 xhs, bilibili
# 老版本项目使用了 db, 则需参考 schema/tables.sql line 287 增加表字段 # 老版本项目使用了 db, 则需参考 schema/tables.sql line 287 增加表字段
ENABLE_GET_SUB_COMMENTS = True ENABLE_GET_SUB_COMMENTS = False
# 指定小红书需要爬虫的笔记ID列表 # 指定小红书需要爬虫的笔记ID列表
XHS_SPECIFIED_ID_LIST = [ XHS_SPECIFIED_ID_LIST = [
@ -96,6 +97,12 @@ DY_CREATOR_ID_LIST = [
# ........................ # ........................
] ]
# 指定bili创作者ID列表(sec_id)
BILI_CREATOR_ID_LIST = [
"20813884",
# ........................
]
#词云相关 #词云相关
#是否开启生成评论词云图 #是否开启生成评论词云图
ENABLE_GET_WORDCLOUD = False ENABLE_GET_WORDCLOUD = False
@ -111,3 +118,5 @@ STOP_WORDS_FILE = "./docs/hit_stopwords.txt"
#中文字体文件路径 #中文字体文件路径
FONT_PATH= "./docs/STZHONGS.TTF" FONT_PATH= "./docs/STZHONGS.TTF"

27
main.py
View File

@ -1,7 +1,7 @@
import argparse
import asyncio import asyncio
import sys import sys
import cmd_arg
import config import config
import db import db
from base.base_crawler import AbstractCrawler from base.base_crawler import AbstractCrawler
@ -28,34 +28,15 @@ class CrawlerFactory:
raise ValueError("Invalid Media Platform Currently only supported xhs or dy or ks or bili ...") raise ValueError("Invalid Media Platform Currently only supported xhs or dy or ks or bili ...")
return crawler_class() return crawler_class()
async def main(): async def main():
# define command line params ... # parse cmd
parser = argparse.ArgumentParser(description='Media crawler program.') await cmd_arg.parse_cmd()
parser.add_argument('--platform', type=str, help='Media platform select (xhs | dy | ks | bili | wb)',
choices=["xhs", "dy", "ks", "bili", "wb"], default=config.PLATFORM)
parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)',
choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE)
parser.add_argument('--type', type=str, help='crawler type (search | detail | creator)',
choices=["search", "detail", "creator"], default=config.CRAWLER_TYPE)
parser.add_argument('--start', type=int, help='crawler type (number of start page)',
default=config.START_PAGE)
parser.add_argument('--keywords', type=str, help='crawler type (please input keywords)',
default=config.KEYWORDS)
# init db # init db
if config.SAVE_DATA_OPTION == "db": if config.SAVE_DATA_OPTION == "db":
await db.init_db() await db.init_db()
args = parser.parse_args() crawler = CrawlerFactory.create_crawler(platform=config.PLATFORM)
crawler = CrawlerFactory.create_crawler(platform=args.platform)
crawler.init_config(
platform=args.platform,
login_type=args.lt,
crawler_type=args.type,
start_page=args.start,
keyword=args.keywords
)
await crawler.start() await crawler.start()
if config.SAVE_DATA_OPTION == "db": if config.SAVE_DATA_OPTION == "db":

View File

@ -208,7 +208,6 @@ class BilibiliClient(AbstractApiClient):
if not is_fetch_sub_comments: if not is_fetch_sub_comments:
result.extend(comment_list) result.extend(comment_list)
continue continue
# todo handle get sub comments
return result return result
async def get_video_all_level_two_comments(self, async def get_video_all_level_two_comments(self,
@ -230,15 +229,15 @@ class BilibiliClient(AbstractApiClient):
:return: :return:
""" """
pn = 0 pn = 1
while True: while True:
result = await self.get_video_level_two_comments( result = await self.get_video_level_two_comments(
video_id, level_one_comment_id, 0, ps, order_mode) video_id, level_one_comment_id, pn, ps, order_mode)
comment_list: List[Dict] = result.get("replies", []) comment_list: List[Dict] = result.get("replies", [])
if callback: # 如果有回调函数,就执行回调函数 if callback: # 如果有回调函数,就执行回调函数
await callback(video_id, comment_list) await callback(video_id, comment_list)
await asyncio.sleep(crawl_interval) await asyncio.sleep(crawl_interval)
if (int(result["page"]["count"]) <= (pn+1) * ps): if (int(result["page"]["count"]) <= pn * ps):
break break
pn += 1 pn += 1
@ -268,3 +267,21 @@ class BilibiliClient(AbstractApiClient):
} }
result = await self.get(uri, post_data) result = await self.get(uri, post_data)
return result return result
async def get_creator_videos(self, creator_id: str, pn: int, ps: int = 30, order_mode: SearchOrderType = SearchOrderType.LAST_PUBLISH) -> Dict:
    """Fetch one page of a creator's video listing.

    :param creator_id: creator (space) id
    :param pn: page number to fetch
    :param ps: videos per page
    :param order_mode: sort order for the listing
    :return: raw response dict from the space search endpoint
    """
    # NOTE: key order kept as-is — the wbi endpoint request is built from
    # this mapping and reordering is not worth the risk.
    params = {
        "mid": creator_id,
        "pn": pn,
        "ps": ps,
        "order": order_mode,
    }
    return await self.get("/x/space/wbi/arc/search", params)

View File

@ -26,9 +26,6 @@ from .login import BilibiliLogin
class BilibiliCrawler(AbstractCrawler): class BilibiliCrawler(AbstractCrawler):
platform: str
login_type: str
crawler_type: str
context_page: Page context_page: Page
bili_client: BilibiliClient bili_client: BilibiliClient
browser_context: BrowserContext browser_context: BrowserContext
@ -37,13 +34,6 @@ class BilibiliCrawler(AbstractCrawler):
self.index_url = "https://www.bilibili.com" self.index_url = "https://www.bilibili.com"
self.user_agent = utils.get_user_agent() self.user_agent = utils.get_user_agent()
def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
self.platform = platform
self.login_type = login_type
self.crawler_type = crawler_type
self.start_page = start_page
self.keyword = keyword
async def start(self): async def start(self):
playwright_proxy_format, httpx_proxy_format = None, None playwright_proxy_format, httpx_proxy_format = None, None
if config.ENABLE_IP_PROXY: if config.ENABLE_IP_PROXY:
@ -70,7 +60,7 @@ class BilibiliCrawler(AbstractCrawler):
self.bili_client = await self.create_bilibili_client(httpx_proxy_format) self.bili_client = await self.create_bilibili_client(httpx_proxy_format)
if not await self.bili_client.pong(): if not await self.bili_client.pong():
login_obj = BilibiliLogin( login_obj = BilibiliLogin(
login_type=self.login_type, login_type=config.LOGIN_TYPE,
login_phone="", # your phone number login_phone="", # your phone number
browser_context=self.browser_context, browser_context=self.browser_context,
context_page=self.context_page, context_page=self.context_page,
@ -79,13 +69,16 @@ class BilibiliCrawler(AbstractCrawler):
await login_obj.begin() await login_obj.begin()
await self.bili_client.update_cookies(browser_context=self.browser_context) await self.bili_client.update_cookies(browser_context=self.browser_context)
crawler_type_var.set(self.crawler_type) crawler_type_var.set(config.CRAWLER_TYPE)
if self.crawler_type == "search": if config.CRAWLER_TYPE == "search":
# Search for video and retrieve their comment information. # Search for video and retrieve their comment information.
await self.search() await self.search()
elif self.crawler_type == "detail": elif config.CRAWLER_TYPE == "detail":
# Get the information and comments of the specified post # Get the information and comments of the specified post
await self.get_specified_videos() await self.get_specified_videos(config.BILI_SPECIFIED_ID_LIST)
elif config.CRAWLER_TYPE == "creator":
for creator_id in config.BILI_CREATOR_ID_LIST:
await self.get_creator_videos(int(creator_id))
else: else:
pass pass
utils.logger.info( utils.logger.info(
@ -101,8 +94,8 @@ class BilibiliCrawler(AbstractCrawler):
bili_limit_count = 20 # bilibili limit page fixed value bili_limit_count = 20 # bilibili limit page fixed value
if config.CRAWLER_MAX_NOTES_COUNT < bili_limit_count: if config.CRAWLER_MAX_NOTES_COUNT < bili_limit_count:
config.CRAWLER_MAX_NOTES_COUNT = bili_limit_count config.CRAWLER_MAX_NOTES_COUNT = bili_limit_count
start_page = self.start_page # start page number start_page = config.START_PAGE # start page number
for keyword in self.keyword.split(","): for keyword in config.KEYWORDS.split(","):
utils.logger.info( utils.logger.info(
f"[BilibiliCrawler.search] Current search keyword: {keyword}") f"[BilibiliCrawler.search] Current search keyword: {keyword}")
page = 1 page = 1
@ -183,7 +176,25 @@ class BilibiliCrawler(AbstractCrawler):
utils.logger.error( utils.logger.error(
f"[BilibiliCrawler.get_comments] may be been blocked, err:{e}") f"[BilibiliCrawler.get_comments] may be been blocked, err:{e}")
async def get_creator_videos(self, creator_id: int):
    """
    Collect every video bvid published by a creator, page by page,
    then crawl them via get_specified_videos.
    :return:
    """
    page_size = 30
    page_num = 1
    bvids: List[str] = []
    while True:
        result = await self.bili_client.get_creator_videos(creator_id, page_num, page_size)
        bvids.extend(video["bvid"] for video in result["list"]["vlist"])
        # Stop once the reported total fits within the pages fetched so far.
        if int(result["page"]["count"]) <= page_num * page_size:
            break
        # Small random pause between pages to avoid hammering the API.
        await asyncio.sleep(random.random())
        page_num += 1
    await self.get_specified_videos(bvids)
async def get_specified_videos(self, bvids_list: List[str]):
""" """
get specified videos info get specified videos info
:return: :return:
@ -191,7 +202,7 @@ class BilibiliCrawler(AbstractCrawler):
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list = [ task_list = [
self.get_video_info_task(aid=0, bvid=video_id, semaphore=semaphore) for video_id in self.get_video_info_task(aid=0, bvid=video_id, semaphore=semaphore) for video_id in
config.BILI_SPECIFIED_ID_LIST bvids_list
] ]
video_details = await asyncio.gather(*task_list) video_details = await asyncio.gather(*task_list)
video_aids_list = [] video_aids_list = []
@ -271,7 +282,7 @@ class BilibiliCrawler(AbstractCrawler):
# feat issue #14 # feat issue #14
# we will save login state to avoid login every time # we will save login state to avoid login every time
user_data_dir = os.path.join(os.getcwd(), "browser_data", user_data_dir = os.path.join(os.getcwd(), "browser_data",
config.USER_DATA_DIR % self.platform) # type: ignore config.USER_DATA_DIR % config.PLATFORM) # type: ignore
browser_context = await chromium.launch_persistent_context( browser_context = await chromium.launch_persistent_context(
user_data_dir=user_data_dir, user_data_dir=user_data_dir,
accept_downloads=True, accept_downloads=True,

View File

@ -13,6 +13,7 @@ from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
wait_fixed) wait_fixed)
from base.base_crawler import AbstractLogin from base.base_crawler import AbstractLogin
import config
from tools import utils from tools import utils
@ -24,7 +25,7 @@ class BilibiliLogin(AbstractLogin):
login_phone: Optional[str] = "", login_phone: Optional[str] = "",
cookie_str: str = "" cookie_str: str = ""
): ):
self.login_type = login_type config.LOGIN_TYPE = login_type
self.browser_context = browser_context self.browser_context = browser_context
self.context_page = context_page self.context_page = context_page
self.login_phone = login_phone self.login_phone = login_phone
@ -33,11 +34,11 @@ class BilibiliLogin(AbstractLogin):
async def begin(self): async def begin(self):
"""Start login bilibili""" """Start login bilibili"""
utils.logger.info("[BilibiliLogin.begin] Begin login Bilibili ...") utils.logger.info("[BilibiliLogin.begin] Begin login Bilibili ...")
if self.login_type == "qrcode": if config.LOGIN_TYPE == "qrcode":
await self.login_by_qrcode() await self.login_by_qrcode()
elif self.login_type == "phone": elif config.LOGIN_TYPE == "phone":
await self.login_by_mobile() await self.login_by_mobile()
elif self.login_type == "cookie": elif config.LOGIN_TYPE == "cookie":
await self.login_by_cookies() await self.login_by_cookies()
else: else:
raise ValueError( raise ValueError(

View File

@ -21,27 +21,14 @@ from .login import DouYinLogin
class DouYinCrawler(AbstractCrawler): class DouYinCrawler(AbstractCrawler):
platform: str
login_type: str
crawler_type: str
context_page: Page context_page: Page
dy_client: DOUYINClient dy_client: DOUYINClient
browser_context: BrowserContext browser_context: BrowserContext
start_page: int
keyword: str
def __init__(self) -> None: def __init__(self) -> None:
self.start_page = None
self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36" # fixed self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36" # fixed
self.index_url = "https://www.douyin.com" self.index_url = "https://www.douyin.com"
def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str) -> None:
self.platform = platform
self.login_type = login_type
self.crawler_type = crawler_type
self.start_page = start_page
self.keyword = keyword
async def start(self) -> None: async def start(self) -> None:
playwright_proxy_format, httpx_proxy_format = None, None playwright_proxy_format, httpx_proxy_format = None, None
if config.ENABLE_IP_PROXY: if config.ENABLE_IP_PROXY:
@ -66,7 +53,7 @@ class DouYinCrawler(AbstractCrawler):
self.dy_client = await self.create_douyin_client(httpx_proxy_format) self.dy_client = await self.create_douyin_client(httpx_proxy_format)
if not await self.dy_client.pong(browser_context=self.browser_context): if not await self.dy_client.pong(browser_context=self.browser_context):
login_obj = DouYinLogin( login_obj = DouYinLogin(
login_type=self.login_type, login_type=config.LOGIN_TYPE,
login_phone="", # you phone number login_phone="", # you phone number
browser_context=self.browser_context, browser_context=self.browser_context,
context_page=self.context_page, context_page=self.context_page,
@ -74,14 +61,14 @@ class DouYinCrawler(AbstractCrawler):
) )
await login_obj.begin() await login_obj.begin()
await self.dy_client.update_cookies(browser_context=self.browser_context) await self.dy_client.update_cookies(browser_context=self.browser_context)
crawler_type_var.set(self.crawler_type) crawler_type_var.set(config.CRAWLER_TYPE)
if self.crawler_type == "search": if config.CRAWLER_TYPE == "search":
# Search for notes and retrieve their comment information. # Search for notes and retrieve their comment information.
await self.search() await self.search()
elif self.crawler_type == "detail": elif config.CRAWLER_TYPE == "detail":
# Get the information and comments of the specified post # Get the information and comments of the specified post
await self.get_specified_awemes() await self.get_specified_awemes()
elif self.crawler_type == "creator": elif config.CRAWLER_TYPE == "creator":
# Get the information and comments of the specified creator # Get the information and comments of the specified creator
await self.get_creators_and_videos() await self.get_creators_and_videos()
@ -92,8 +79,8 @@ class DouYinCrawler(AbstractCrawler):
dy_limit_count = 10 # douyin limit page fixed value dy_limit_count = 10 # douyin limit page fixed value
if config.CRAWLER_MAX_NOTES_COUNT < dy_limit_count: if config.CRAWLER_MAX_NOTES_COUNT < dy_limit_count:
config.CRAWLER_MAX_NOTES_COUNT = dy_limit_count config.CRAWLER_MAX_NOTES_COUNT = dy_limit_count
start_page = self.start_page # start page number start_page = config.START_PAGE # start page number
for keyword in self.keyword.split(","): for keyword in config.KEYWORDS.split(","):
utils.logger.info(f"[DouYinCrawler.search] Current keyword: {keyword}") utils.logger.info(f"[DouYinCrawler.search] Current keyword: {keyword}")
aweme_list: List[str] = [] aweme_list: List[str] = []
page = 0 page = 0
@ -259,7 +246,7 @@ class DouYinCrawler(AbstractCrawler):
"""Launch browser and create browser context""" """Launch browser and create browser context"""
if config.SAVE_LOGIN_STATE: if config.SAVE_LOGIN_STATE:
user_data_dir = os.path.join(os.getcwd(), "browser_data", user_data_dir = os.path.join(os.getcwd(), "browser_data",
config.USER_DATA_DIR % self.platform) # type: ignore config.USER_DATA_DIR % config.PLATFORM) # type: ignore
browser_context = await chromium.launch_persistent_context( browser_context = await chromium.launch_persistent_context(
user_data_dir=user_data_dir, user_data_dir=user_data_dir,
accept_downloads=True, accept_downloads=True,

View File

@ -23,7 +23,7 @@ class DouYinLogin(AbstractLogin):
login_phone: Optional[str] = "", login_phone: Optional[str] = "",
cookie_str: Optional[str] = "" cookie_str: Optional[str] = ""
): ):
self.login_type = login_type config.LOGIN_TYPE = login_type
self.browser_context = browser_context self.browser_context = browser_context
self.context_page = context_page self.context_page = context_page
self.login_phone = login_phone self.login_phone = login_phone
@ -40,11 +40,11 @@ class DouYinLogin(AbstractLogin):
await self.popup_login_dialog() await self.popup_login_dialog()
# select login type # select login type
if self.login_type == "qrcode": if config.LOGIN_TYPE == "qrcode":
await self.login_by_qrcode() await self.login_by_qrcode()
elif self.login_type == "phone": elif config.LOGIN_TYPE == "phone":
await self.login_by_mobile() await self.login_by_mobile()
elif self.login_type == "cookie": elif config.LOGIN_TYPE == "cookie":
await self.login_by_cookies() await self.login_by_cookies()
else: else:
raise ValueError("[DouYinLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...") raise ValueError("[DouYinLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")

View File

@ -21,9 +21,6 @@ from .login import KuaishouLogin
class KuaishouCrawler(AbstractCrawler): class KuaishouCrawler(AbstractCrawler):
platform: str
login_type: str
crawler_type: str
context_page: Page context_page: Page
ks_client: KuaiShouClient ks_client: KuaiShouClient
browser_context: BrowserContext browser_context: BrowserContext
@ -32,13 +29,6 @@ class KuaishouCrawler(AbstractCrawler):
self.index_url = "https://www.kuaishou.com" self.index_url = "https://www.kuaishou.com"
self.user_agent = utils.get_user_agent() self.user_agent = utils.get_user_agent()
def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
self.platform = platform
self.login_type = login_type
self.crawler_type = crawler_type
self.start_page = start_page
self.keyword = keyword
async def start(self): async def start(self):
playwright_proxy_format, httpx_proxy_format = None, None playwright_proxy_format, httpx_proxy_format = None, None
if config.ENABLE_IP_PROXY: if config.ENABLE_IP_PROXY:
@ -64,7 +54,7 @@ class KuaishouCrawler(AbstractCrawler):
self.ks_client = await self.create_ks_client(httpx_proxy_format) self.ks_client = await self.create_ks_client(httpx_proxy_format)
if not await self.ks_client.pong(): if not await self.ks_client.pong():
login_obj = KuaishouLogin( login_obj = KuaishouLogin(
login_type=self.login_type, login_type=config.LOGIN_TYPE,
login_phone=httpx_proxy_format, login_phone=httpx_proxy_format,
browser_context=self.browser_context, browser_context=self.browser_context,
context_page=self.context_page, context_page=self.context_page,
@ -73,11 +63,11 @@ class KuaishouCrawler(AbstractCrawler):
await login_obj.begin() await login_obj.begin()
await self.ks_client.update_cookies(browser_context=self.browser_context) await self.ks_client.update_cookies(browser_context=self.browser_context)
crawler_type_var.set(self.crawler_type) crawler_type_var.set(config.CRAWLER_TYPE)
if self.crawler_type == "search": if config.CRAWLER_TYPE == "search":
# Search for notes and retrieve their comment information. # Search for notes and retrieve their comment information.
await self.search() await self.search()
elif self.crawler_type == "detail": elif config.CRAWLER_TYPE == "detail":
# Get the information and comments of the specified post # Get the information and comments of the specified post
await self.get_specified_videos() await self.get_specified_videos()
else: else:
@ -90,8 +80,8 @@ class KuaishouCrawler(AbstractCrawler):
ks_limit_count = 20 # kuaishou limit page fixed value ks_limit_count = 20 # kuaishou limit page fixed value
if config.CRAWLER_MAX_NOTES_COUNT < ks_limit_count: if config.CRAWLER_MAX_NOTES_COUNT < ks_limit_count:
config.CRAWLER_MAX_NOTES_COUNT = ks_limit_count config.CRAWLER_MAX_NOTES_COUNT = ks_limit_count
start_page = self.start_page start_page = config.START_PAGE
for keyword in self.keyword.split(","): for keyword in config.KEYWORDS.split(","):
utils.logger.info(f"[KuaishouCrawler.search] Current search keyword: {keyword}") utils.logger.info(f"[KuaishouCrawler.search] Current search keyword: {keyword}")
page = 1 page = 1
while (page - start_page + 1) * ks_limit_count <= config.CRAWLER_MAX_NOTES_COUNT: while (page - start_page + 1) * ks_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
@ -238,7 +228,7 @@ class KuaishouCrawler(AbstractCrawler):
utils.logger.info("[KuaishouCrawler.launch_browser] Begin create browser context ...") utils.logger.info("[KuaishouCrawler.launch_browser] Begin create browser context ...")
if config.SAVE_LOGIN_STATE: if config.SAVE_LOGIN_STATE:
user_data_dir = os.path.join(os.getcwd(), "browser_data", user_data_dir = os.path.join(os.getcwd(), "browser_data",
config.USER_DATA_DIR % self.platform) # type: ignore config.USER_DATA_DIR % config.PLATFORM) # type: ignore
browser_context = await chromium.launch_persistent_context( browser_context = await chromium.launch_persistent_context(
user_data_dir=user_data_dir, user_data_dir=user_data_dir,
accept_downloads=True, accept_downloads=True,

View File

@ -19,7 +19,7 @@ class KuaishouLogin(AbstractLogin):
login_phone: Optional[str] = "", login_phone: Optional[str] = "",
cookie_str: str = "" cookie_str: str = ""
): ):
self.login_type = login_type config.LOGIN_TYPE = login_type
self.browser_context = browser_context self.browser_context = browser_context
self.context_page = context_page self.context_page = context_page
self.login_phone = login_phone self.login_phone = login_phone
@ -28,11 +28,11 @@ class KuaishouLogin(AbstractLogin):
async def begin(self): async def begin(self):
"""Start login xiaohongshu""" """Start login xiaohongshu"""
utils.logger.info("[KuaishouLogin.begin] Begin login kuaishou ...") utils.logger.info("[KuaishouLogin.begin] Begin login kuaishou ...")
if self.login_type == "qrcode": if config.LOGIN_TYPE == "qrcode":
await self.login_by_qrcode() await self.login_by_qrcode()
elif self.login_type == "phone": elif config.LOGIN_TYPE == "phone":
await self.login_by_mobile() await self.login_by_mobile()
elif self.login_type == "cookie": elif config.LOGIN_TYPE == "cookie":
await self.login_by_cookies() await self.login_by_cookies()
else: else:
raise ValueError("[KuaishouLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...") raise ValueError("[KuaishouLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")

View File

@ -28,9 +28,6 @@ from .login import WeiboLogin
class WeiboCrawler(AbstractCrawler): class WeiboCrawler(AbstractCrawler):
platform: str
login_type: str
crawler_type: str
context_page: Page context_page: Page
wb_client: WeiboClient wb_client: WeiboClient
browser_context: BrowserContext browser_context: BrowserContext
@ -41,13 +38,6 @@ class WeiboCrawler(AbstractCrawler):
self.user_agent = utils.get_user_agent() self.user_agent = utils.get_user_agent()
self.mobile_user_agent = utils.get_mobile_user_agent() self.mobile_user_agent = utils.get_mobile_user_agent()
def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
self.platform = platform
self.login_type = login_type
self.crawler_type = crawler_type
self.start_page = start_page
self.keyword = keyword
async def start(self): async def start(self):
playwright_proxy_format, httpx_proxy_format = None, None playwright_proxy_format, httpx_proxy_format = None, None
if config.ENABLE_IP_PROXY: if config.ENABLE_IP_PROXY:
@ -73,7 +63,7 @@ class WeiboCrawler(AbstractCrawler):
self.wb_client = await self.create_weibo_client(httpx_proxy_format) self.wb_client = await self.create_weibo_client(httpx_proxy_format)
if not await self.wb_client.pong(): if not await self.wb_client.pong():
login_obj = WeiboLogin( login_obj = WeiboLogin(
login_type=self.login_type, login_type=config.LOGIN_TYPE,
login_phone="", # your phone number login_phone="", # your phone number
browser_context=self.browser_context, browser_context=self.browser_context,
context_page=self.context_page, context_page=self.context_page,
@ -89,11 +79,11 @@ class WeiboCrawler(AbstractCrawler):
await asyncio.sleep(2) await asyncio.sleep(2)
await self.wb_client.update_cookies(browser_context=self.browser_context) await self.wb_client.update_cookies(browser_context=self.browser_context)
crawler_type_var.set(self.crawler_type) crawler_type_var.set(config.CRAWLER_TYPE)
if self.crawler_type == "search": if config.CRAWLER_TYPE == "search":
# Search for video and retrieve their comment information. # Search for video and retrieve their comment information.
await self.search() await self.search()
elif self.crawler_type == "detail": elif config.CRAWLER_TYPE == "detail":
# Get the information and comments of the specified post # Get the information and comments of the specified post
await self.get_specified_notes() await self.get_specified_notes()
else: else:
@ -109,8 +99,8 @@ class WeiboCrawler(AbstractCrawler):
weibo_limit_count = 10 # weibo limit page fixed value weibo_limit_count = 10 # weibo limit page fixed value
if config.CRAWLER_MAX_NOTES_COUNT < weibo_limit_count: if config.CRAWLER_MAX_NOTES_COUNT < weibo_limit_count:
config.CRAWLER_MAX_NOTES_COUNT = weibo_limit_count config.CRAWLER_MAX_NOTES_COUNT = weibo_limit_count
start_page = self.start_page start_page = config.START_PAGE
for keyword in self.keyword.split(","): for keyword in config.KEYWORDS.split(","):
utils.logger.info(f"[WeiboCrawler.search] Current search keyword: {keyword}") utils.logger.info(f"[WeiboCrawler.search] Current search keyword: {keyword}")
page = 1 page = 1
while (page - start_page + 1) * weibo_limit_count <= config.CRAWLER_MAX_NOTES_COUNT: while (page - start_page + 1) * weibo_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
@ -274,7 +264,7 @@ class WeiboCrawler(AbstractCrawler):
utils.logger.info("[WeiboCrawler.launch_browser] Begin create browser context ...") utils.logger.info("[WeiboCrawler.launch_browser] Begin create browser context ...")
if config.SAVE_LOGIN_STATE: if config.SAVE_LOGIN_STATE:
user_data_dir = os.path.join(os.getcwd(), "browser_data", user_data_dir = os.path.join(os.getcwd(), "browser_data",
config.USER_DATA_DIR % self.platform) # type: ignore config.USER_DATA_DIR % config.PLATFORM) # type: ignore
browser_context = await chromium.launch_persistent_context( browser_context = await chromium.launch_persistent_context(
user_data_dir=user_data_dir, user_data_dir=user_data_dir,
accept_downloads=True, accept_downloads=True,

View File

@ -24,7 +24,7 @@ class WeiboLogin(AbstractLogin):
login_phone: Optional[str] = "", login_phone: Optional[str] = "",
cookie_str: str = "" cookie_str: str = ""
): ):
self.login_type = login_type config.LOGIN_TYPE = login_type
self.browser_context = browser_context self.browser_context = browser_context
self.context_page = context_page self.context_page = context_page
self.login_phone = login_phone self.login_phone = login_phone
@ -33,11 +33,11 @@ class WeiboLogin(AbstractLogin):
async def begin(self): async def begin(self):
"""Start login weibo""" """Start login weibo"""
utils.logger.info("[WeiboLogin.begin] Begin login weibo ...") utils.logger.info("[WeiboLogin.begin] Begin login weibo ...")
if self.login_type == "qrcode": if config.LOGIN_TYPE == "qrcode":
await self.login_by_qrcode() await self.login_by_qrcode()
elif self.login_type == "phone": elif config.LOGIN_TYPE == "phone":
await self.login_by_mobile() await self.login_by_mobile()
elif self.login_type == "cookie": elif config.LOGIN_TYPE == "cookie":
await self.login_by_cookies() await self.login_by_cookies()
else: else:
raise ValueError( raise ValueError(

View File

@ -21,9 +21,6 @@ from .login import XiaoHongShuLogin
class XiaoHongShuCrawler(AbstractCrawler): class XiaoHongShuCrawler(AbstractCrawler):
platform: str
login_type: str
crawler_type: str
context_page: Page context_page: Page
xhs_client: XiaoHongShuClient xhs_client: XiaoHongShuClient
browser_context: BrowserContext browser_context: BrowserContext
@ -32,13 +29,6 @@ class XiaoHongShuCrawler(AbstractCrawler):
self.index_url = "https://www.xiaohongshu.com" self.index_url = "https://www.xiaohongshu.com"
self.user_agent = utils.get_user_agent() self.user_agent = utils.get_user_agent()
def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str) -> None:
self.platform = platform
self.login_type = login_type
self.crawler_type = crawler_type
self.start_page = start_page
self.keyword = keyword
async def start(self) -> None: async def start(self) -> None:
playwright_proxy_format, httpx_proxy_format = None, None playwright_proxy_format, httpx_proxy_format = None, None
if config.ENABLE_IP_PROXY: if config.ENABLE_IP_PROXY:
@ -71,7 +61,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
self.xhs_client = await self.create_xhs_client(httpx_proxy_format) self.xhs_client = await self.create_xhs_client(httpx_proxy_format)
if not await self.xhs_client.pong(): if not await self.xhs_client.pong():
login_obj = XiaoHongShuLogin( login_obj = XiaoHongShuLogin(
login_type=self.login_type, login_type=config.LOGIN_TYPE,
login_phone="", # input your phone number login_phone="", # input your phone number
browser_context=self.browser_context, browser_context=self.browser_context,
context_page=self.context_page, context_page=self.context_page,
@ -80,14 +70,14 @@ class XiaoHongShuCrawler(AbstractCrawler):
await login_obj.begin() await login_obj.begin()
await self.xhs_client.update_cookies(browser_context=self.browser_context) await self.xhs_client.update_cookies(browser_context=self.browser_context)
crawler_type_var.set(self.crawler_type) crawler_type_var.set(config.CRAWLER_TYPE)
if self.crawler_type == "search": if config.CRAWLER_TYPE == "search":
# Search for notes and retrieve their comment information. # Search for notes and retrieve their comment information.
await self.search() await self.search()
elif self.crawler_type == "detail": elif config.CRAWLER_TYPE == "detail":
# Get the information and comments of the specified post # Get the information and comments of the specified post
await self.get_specified_notes() await self.get_specified_notes()
elif self.crawler_type == "creator": elif config.CRAWLER_TYPE == "creator":
# Get creator's information and their notes and comments # Get creator's information and their notes and comments
await self.get_creators_and_notes() await self.get_creators_and_notes()
else: else:
@ -101,8 +91,8 @@ class XiaoHongShuCrawler(AbstractCrawler):
xhs_limit_count = 20 # xhs limit page fixed value xhs_limit_count = 20 # xhs limit page fixed value
if config.CRAWLER_MAX_NOTES_COUNT < xhs_limit_count: if config.CRAWLER_MAX_NOTES_COUNT < xhs_limit_count:
config.CRAWLER_MAX_NOTES_COUNT = xhs_limit_count config.CRAWLER_MAX_NOTES_COUNT = xhs_limit_count
start_page = self.start_page start_page = config.START_PAGE
for keyword in self.keyword.split(","): for keyword in config.KEYWORDS.split(","):
utils.logger.info(f"[XiaoHongShuCrawler.search] Current search keyword: {keyword}") utils.logger.info(f"[XiaoHongShuCrawler.search] Current search keyword: {keyword}")
page = 1 page = 1
while (page - start_page + 1) * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT: while (page - start_page + 1) * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
@ -264,7 +254,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
# feat issue #14 # feat issue #14
# we will save login state to avoid login every time # we will save login state to avoid login every time
user_data_dir = os.path.join(os.getcwd(), "browser_data", user_data_dir = os.path.join(os.getcwd(), "browser_data",
config.USER_DATA_DIR % self.platform) # type: ignore config.USER_DATA_DIR % config.PLATFORM) # type: ignore
browser_context = await chromium.launch_persistent_context( browser_context = await chromium.launch_persistent_context(
user_data_dir=user_data_dir, user_data_dir=user_data_dir,
accept_downloads=True, accept_downloads=True,

View File

@ -22,7 +22,7 @@ class XiaoHongShuLogin(AbstractLogin):
login_phone: Optional[str] = "", login_phone: Optional[str] = "",
cookie_str: str = "" cookie_str: str = ""
): ):
self.login_type = login_type config.LOGIN_TYPE = login_type
self.browser_context = browser_context self.browser_context = browser_context
self.context_page = context_page self.context_page = context_page
self.login_phone = login_phone self.login_phone = login_phone
@ -49,11 +49,11 @@ class XiaoHongShuLogin(AbstractLogin):
async def begin(self): async def begin(self):
"""Start login xiaohongshu""" """Start login xiaohongshu"""
utils.logger.info("[XiaoHongShuLogin.begin] Begin login xiaohongshu ...") utils.logger.info("[XiaoHongShuLogin.begin] Begin login xiaohongshu ...")
if self.login_type == "qrcode": if config.LOGIN_TYPE == "qrcode":
await self.login_by_qrcode() await self.login_by_qrcode()
elif self.login_type == "phone": elif config.LOGIN_TYPE == "phone":
await self.login_by_mobile() await self.login_by_mobile()
elif self.login_type == "cookie": elif config.LOGIN_TYPE == "cookie":
await self.login_by_cookies() await self.login_by_cookies()
else: else:
raise ValueError("[XiaoHongShuLogin.begin]I nvalid Login Type Currently only supported qrcode or phone or cookies ...") raise ValueError("[XiaoHongShuLogin.begin]I nvalid Login Type Currently only supported qrcode or phone or cookies ...")

View File

@ -311,4 +311,7 @@ ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
ALTER TABLE `douyin_aweme_comment` ALTER TABLE `douyin_aweme_comment`
ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID'; ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
ALTER TABLE `bilibili_video_comment`
ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
SET FOREIGN_KEY_CHECKS = 1; SET FOREIGN_KEY_CHECKS = 1;

View File

@ -1,3 +1,4 @@
import argparse
import logging import logging
from .crawler_util import * from .crawler_util import *
@ -18,3 +19,13 @@ def init_loging_config():
logger = init_loging_config() logger = init_loging_config()
def str2bool(v):
    """Convert a command-line argument to a boolean.

    A real bool is returned unchanged; otherwise the string is matched
    (case-insensitively) against common truthy/falsy spellings.

    Args:
        v: the raw value passed on the command line (bool or str).

    Returns:
        True for 'yes'/'true'/'t'/'y'/'1', False for 'no'/'false'/'f'/'n'/'0'.

    Raises:
        argparse.ArgumentTypeError: if the value is not a recognized spelling.
    """
    if isinstance(v, bool):
        return v
    normalized = v.lower()
    if normalized in ('yes', 'true', 't', 'y', '1'):
        return True
    if normalized in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')