Merge remote-tracking branch 'origin/main'
This commit is contained in:
commit
78b0d317ff
|
@ -167,3 +167,4 @@ cython_debug/
|
||||||
/data/
|
/data/
|
||||||
|
|
||||||
*/.DS_Store
|
*/.DS_Store
|
||||||
|
.vscode
|
|
@ -17,15 +17,17 @@
|
||||||
## 功能列表
|
## 功能列表
|
||||||
> 下面不支持的项目,相关的代码架构已经搭建好,只需要实现对应的方法即可,欢迎大家提交PR
|
> 下面不支持的项目,相关的代码架构已经搭建好,只需要实现对应的方法即可,欢迎大家提交PR
|
||||||
|
|
||||||
|
|
||||||
| 平台 | 关键词搜索 | 指定帖子ID爬取 | 二级评论 | 指定创作者主页 | 登录态缓存 | IP代理池 | 生成评论词云图 |
|
| 平台 | 关键词搜索 | 指定帖子ID爬取 | 二级评论 | 指定创作者主页 | 登录态缓存 | IP代理池 | 生成评论词云图 |
|
||||||
|-----|-------|----------|-----|--------|-------|-------|-------|
|
|-----|-------|----------|-----|--------|-------|-------|-------|
|
||||||
| 小红书 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
| 小红书 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||||
| 抖音 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
| 抖音 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||||
| 快手 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ |
|
| 快手 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ |
|
||||||
| B 站 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ |
|
| B 站 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||||
| 微博 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ |
|
| 微博 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ |
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
## 使用方法
|
## 使用方法
|
||||||
|
|
||||||
### 创建并激活 python 虚拟环境
|
### 创建并激活 python 虚拟环境
|
||||||
|
|
|
@ -5,10 +5,6 @@ from playwright.async_api import BrowserContext, BrowserType
|
||||||
|
|
||||||
|
|
||||||
class AbstractCrawler(ABC):
|
class AbstractCrawler(ABC):
|
||||||
@abstractmethod
|
|
||||||
def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
|
|
||||||
pass
|
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
async def start(self):
|
async def start(self):
|
||||||
pass
|
pass
|
||||||
|
|
|
@ -0,0 +1 @@
|
||||||
|
from .arg import *
|
|
@ -0,0 +1,39 @@
|
||||||
|
import argparse
|
||||||
|
import config
|
||||||
|
from tools.utils import str2bool
|
||||||
|
|
||||||
|
|
||||||
|
async def parse_cmd():
    """Parse command-line options and write the results back into ``config``.

    Every option defaults to the current value of the matching ``config``
    attribute, so an option that is not supplied leaves that setting
    unchanged. ``argparse`` reads ``sys.argv`` and terminates the process
    itself on invalid input or ``--help``.

    The function contains no awaits; it stays a coroutine so existing
    ``await parse_cmd()`` call sites keep working.

    :return: None. The parsed values are stored on the ``config`` module.
    """
    # Read the command-line arguments.
    parser = argparse.ArgumentParser(description='Media crawler program.')
    parser.add_argument('--platform', type=str, help='Media platform select (xhs | dy | ks | bili | wb)',
                        choices=["xhs", "dy", "ks", "bili", "wb"], default=config.PLATFORM)
    parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)',
                        choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE)
    parser.add_argument('--type', type=str, help='crawler type (search | detail | creator)',
                        choices=["search", "detail", "creator"], default=config.CRAWLER_TYPE)
    parser.add_argument('--start', type=int,
                        help='number of start page', default=config.START_PAGE)
    parser.add_argument('--keywords', type=str,
                        help='please input keywords', default=config.KEYWORDS)
    parser.add_argument('--get_comment', type=str2bool,
                        help='''whether to crawl level one comment, supported values case insensitive ('yes', 'true', 't', 'y', '1', 'no', 'false', 'f', 'n', '0')''', default=config.ENABLE_GET_COMMENTS)
    # The original literal opened with four quotes, which put a stray
    # apostrophe at the start of this help text.
    parser.add_argument('--get_sub_comment', type=str2bool,
                        help='''whether to crawl level two comment, supported values case insensitive ('yes', 'true', 't', 'y', '1', 'no', 'false', 'f', 'n', '0')''', default=config.ENABLE_GET_SUB_COMMENTS)
    parser.add_argument('--save_data_option', type=str,
                        help='where to save the data (csv or db or json)', choices=['csv', 'db', 'json'], default=config.SAVE_DATA_OPTION)
    parser.add_argument('--cookies', type=str,
                        help='cookies used for cookie login type', default=config.COOKIES)

    args = parser.parse_args()

    # Override the module-level config with the parsed values.
    config.PLATFORM = args.platform
    config.LOGIN_TYPE = args.lt
    config.CRAWLER_TYPE = args.type
    config.START_PAGE = args.start
    config.KEYWORDS = args.keywords
    config.ENABLE_GET_COMMENTS = args.get_comment
    config.ENABLE_GET_SUB_COMMENTS = args.get_sub_comment
    config.SAVE_DATA_OPTION = args.save_data_option
    config.COOKIES = args.cookies
|
|
@ -3,7 +3,8 @@ PLATFORM = "xhs"
|
||||||
KEYWORDS = "python,golang"
|
KEYWORDS = "python,golang"
|
||||||
LOGIN_TYPE = "qrcode" # qrcode or phone or cookie
|
LOGIN_TYPE = "qrcode" # qrcode or phone or cookie
|
||||||
COOKIES = ""
|
COOKIES = ""
|
||||||
SORT_TYPE = "popularity_descending" # 具体值参见media_platform.xxx.field下的枚举值,展示只支持小红书
|
# 具体值参见media_platform.xxx.field下的枚举值,暂时只支持小红书
|
||||||
|
SORT_TYPE = "popularity_descending"
|
||||||
CRAWLER_TYPE = "search" # 爬取类型,search(关键词搜索) | detail(帖子详情)| creator(创作者主页数据)
|
CRAWLER_TYPE = "search" # 爬取类型,search(关键词搜索) | detail(帖子详情)| creator(创作者主页数据)
|
||||||
|
|
||||||
# 是否开启 IP 代理
|
# 是否开启 IP 代理
|
||||||
|
@ -45,9 +46,9 @@ ENABLE_GET_IMAGES = False
|
||||||
# 是否开启爬评论模式, 默认不开启爬评论
|
# 是否开启爬评论模式, 默认不开启爬评论
|
||||||
ENABLE_GET_COMMENTS = False
|
ENABLE_GET_COMMENTS = False
|
||||||
|
|
||||||
# 是否开启爬二级评论模式, 默认不开启爬二级评论, 目前仅支持 xhs
|
# 是否开启爬二级评论模式, 默认不开启爬二级评论, 目前仅支持 xhs, bilibili
|
||||||
# 老版本项目使用了 db, 则需参考 schema/tables.sql line 287 增加表字段
|
# 老版本项目使用了 db, 则需参考 schema/tables.sql line 287 增加表字段
|
||||||
ENABLE_GET_SUB_COMMENTS = True
|
ENABLE_GET_SUB_COMMENTS = False
|
||||||
|
|
||||||
# 指定小红书需要爬虫的笔记ID列表
|
# 指定小红书需要爬虫的笔记ID列表
|
||||||
XHS_SPECIFIED_ID_LIST = [
|
XHS_SPECIFIED_ID_LIST = [
|
||||||
|
@ -96,6 +97,12 @@ DY_CREATOR_ID_LIST = [
|
||||||
# ........................
|
# ........................
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# 指定bili创作者ID列表(sec_id)
|
||||||
|
BILI_CREATOR_ID_LIST = [
|
||||||
|
"20813884",
|
||||||
|
# ........................
|
||||||
|
]
|
||||||
|
|
||||||
#词云相关
|
#词云相关
|
||||||
#是否开启生成评论词云图
|
#是否开启生成评论词云图
|
||||||
ENABLE_GET_WORDCLOUD = False
|
ENABLE_GET_WORDCLOUD = False
|
||||||
|
@ -111,3 +118,5 @@ STOP_WORDS_FILE = "./docs/hit_stopwords.txt"
|
||||||
|
|
||||||
#中文字体文件路径
|
#中文字体文件路径
|
||||||
FONT_PATH= "./docs/STZHONGS.TTF"
|
FONT_PATH= "./docs/STZHONGS.TTF"
|
||||||
|
|
||||||
|
|
||||||
|
|
27
main.py
27
main.py
|
@ -1,7 +1,7 @@
|
||||||
import argparse
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
import cmd_arg
|
||||||
import config
|
import config
|
||||||
import db
|
import db
|
||||||
from base.base_crawler import AbstractCrawler
|
from base.base_crawler import AbstractCrawler
|
||||||
|
@ -28,34 +28,15 @@ class CrawlerFactory:
|
||||||
raise ValueError("Invalid Media Platform Currently only supported xhs or dy or ks or bili ...")
|
raise ValueError("Invalid Media Platform Currently only supported xhs or dy or ks or bili ...")
|
||||||
return crawler_class()
|
return crawler_class()
|
||||||
|
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
# define command line params ...
|
# parse cmd
|
||||||
parser = argparse.ArgumentParser(description='Media crawler program.')
|
await cmd_arg.parse_cmd()
|
||||||
parser.add_argument('--platform', type=str, help='Media platform select (xhs | dy | ks | bili | wb)',
|
|
||||||
choices=["xhs", "dy", "ks", "bili", "wb"], default=config.PLATFORM)
|
|
||||||
parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)',
|
|
||||||
choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE)
|
|
||||||
parser.add_argument('--type', type=str, help='crawler type (search | detail | creator)',
|
|
||||||
choices=["search", "detail", "creator"], default=config.CRAWLER_TYPE)
|
|
||||||
parser.add_argument('--start', type=int, help='crawler type (number of start page)',
|
|
||||||
default=config.START_PAGE)
|
|
||||||
parser.add_argument('--keywords', type=str, help='crawler type (please input keywords)',
|
|
||||||
default=config.KEYWORDS)
|
|
||||||
|
|
||||||
# init db
|
# init db
|
||||||
if config.SAVE_DATA_OPTION == "db":
|
if config.SAVE_DATA_OPTION == "db":
|
||||||
await db.init_db()
|
await db.init_db()
|
||||||
|
|
||||||
args = parser.parse_args()
|
crawler = CrawlerFactory.create_crawler(platform=config.PLATFORM)
|
||||||
crawler = CrawlerFactory.create_crawler(platform=args.platform)
|
|
||||||
crawler.init_config(
|
|
||||||
platform=args.platform,
|
|
||||||
login_type=args.lt,
|
|
||||||
crawler_type=args.type,
|
|
||||||
start_page=args.start,
|
|
||||||
keyword=args.keywords
|
|
||||||
)
|
|
||||||
await crawler.start()
|
await crawler.start()
|
||||||
|
|
||||||
if config.SAVE_DATA_OPTION == "db":
|
if config.SAVE_DATA_OPTION == "db":
|
||||||
|
|
|
@ -208,7 +208,6 @@ class BilibiliClient(AbstractApiClient):
|
||||||
if not is_fetch_sub_comments:
|
if not is_fetch_sub_comments:
|
||||||
result.extend(comment_list)
|
result.extend(comment_list)
|
||||||
continue
|
continue
|
||||||
# todo handle get sub comments
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
async def get_video_all_level_two_comments(self,
|
async def get_video_all_level_two_comments(self,
|
||||||
|
@ -230,15 +229,15 @@ class BilibiliClient(AbstractApiClient):
|
||||||
:return:
|
:return:
|
||||||
"""
|
"""
|
||||||
|
|
||||||
pn = 0
|
pn = 1
|
||||||
while True:
|
while True:
|
||||||
result = await self.get_video_level_two_comments(
|
result = await self.get_video_level_two_comments(
|
||||||
video_id, level_one_comment_id, 0, ps, order_mode)
|
video_id, level_one_comment_id, pn, ps, order_mode)
|
||||||
comment_list: List[Dict] = result.get("replies", [])
|
comment_list: List[Dict] = result.get("replies", [])
|
||||||
if callback: # 如果有回调函数,就执行回调函数
|
if callback: # 如果有回调函数,就执行回调函数
|
||||||
await callback(video_id, comment_list)
|
await callback(video_id, comment_list)
|
||||||
await asyncio.sleep(crawl_interval)
|
await asyncio.sleep(crawl_interval)
|
||||||
if (int(result["page"]["count"]) <= (pn+1) * ps):
|
if (int(result["page"]["count"]) <= pn * ps):
|
||||||
break
|
break
|
||||||
|
|
||||||
pn += 1
|
pn += 1
|
||||||
|
@ -268,3 +267,21 @@ class BilibiliClient(AbstractApiClient):
|
||||||
}
|
}
|
||||||
result = await self.get(uri, post_data)
|
result = await self.get(uri, post_data)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
async def get_creator_videos(self, creator_id: str, pn: int, ps: int = 30, order_mode: SearchOrderType = SearchOrderType.LAST_PUBLISH) -> Dict:
    """Request one page of a creator's video list.

    :param creator_id: creator ID, sent as the ``mid`` field
    :param pn: page number
    :param ps: number of videos per page
    :param order_mode: sort order, sent unchanged as the ``order`` field
    :return: the result of ``self.get`` for this request
    """
    # Keyword form keeps the same keys in the same insertion order.
    query = dict(mid=creator_id, pn=pn, ps=ps, order=order_mode)
    response = await self.get("/x/space/wbi/arc/search", query)
    return response
|
||||||
|
|
|
@ -26,9 +26,6 @@ from .login import BilibiliLogin
|
||||||
|
|
||||||
|
|
||||||
class BilibiliCrawler(AbstractCrawler):
|
class BilibiliCrawler(AbstractCrawler):
|
||||||
platform: str
|
|
||||||
login_type: str
|
|
||||||
crawler_type: str
|
|
||||||
context_page: Page
|
context_page: Page
|
||||||
bili_client: BilibiliClient
|
bili_client: BilibiliClient
|
||||||
browser_context: BrowserContext
|
browser_context: BrowserContext
|
||||||
|
@ -37,13 +34,6 @@ class BilibiliCrawler(AbstractCrawler):
|
||||||
self.index_url = "https://www.bilibili.com"
|
self.index_url = "https://www.bilibili.com"
|
||||||
self.user_agent = utils.get_user_agent()
|
self.user_agent = utils.get_user_agent()
|
||||||
|
|
||||||
def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
|
|
||||||
self.platform = platform
|
|
||||||
self.login_type = login_type
|
|
||||||
self.crawler_type = crawler_type
|
|
||||||
self.start_page = start_page
|
|
||||||
self.keyword = keyword
|
|
||||||
|
|
||||||
async def start(self):
|
async def start(self):
|
||||||
playwright_proxy_format, httpx_proxy_format = None, None
|
playwright_proxy_format, httpx_proxy_format = None, None
|
||||||
if config.ENABLE_IP_PROXY:
|
if config.ENABLE_IP_PROXY:
|
||||||
|
@ -70,7 +60,7 @@ class BilibiliCrawler(AbstractCrawler):
|
||||||
self.bili_client = await self.create_bilibili_client(httpx_proxy_format)
|
self.bili_client = await self.create_bilibili_client(httpx_proxy_format)
|
||||||
if not await self.bili_client.pong():
|
if not await self.bili_client.pong():
|
||||||
login_obj = BilibiliLogin(
|
login_obj = BilibiliLogin(
|
||||||
login_type=self.login_type,
|
login_type=config.LOGIN_TYPE,
|
||||||
login_phone="", # your phone number
|
login_phone="", # your phone number
|
||||||
browser_context=self.browser_context,
|
browser_context=self.browser_context,
|
||||||
context_page=self.context_page,
|
context_page=self.context_page,
|
||||||
|
@ -79,13 +69,16 @@ class BilibiliCrawler(AbstractCrawler):
|
||||||
await login_obj.begin()
|
await login_obj.begin()
|
||||||
await self.bili_client.update_cookies(browser_context=self.browser_context)
|
await self.bili_client.update_cookies(browser_context=self.browser_context)
|
||||||
|
|
||||||
crawler_type_var.set(self.crawler_type)
|
crawler_type_var.set(config.CRAWLER_TYPE)
|
||||||
if self.crawler_type == "search":
|
if config.CRAWLER_TYPE == "search":
|
||||||
# Search for video and retrieve their comment information.
|
# Search for video and retrieve their comment information.
|
||||||
await self.search()
|
await self.search()
|
||||||
elif self.crawler_type == "detail":
|
elif config.CRAWLER_TYPE == "detail":
|
||||||
# Get the information and comments of the specified post
|
# Get the information and comments of the specified post
|
||||||
await self.get_specified_videos()
|
await self.get_specified_videos(config.BILI_SPECIFIED_ID_LIST)
|
||||||
|
elif config.CRAWLER_TYPE == "creator":
|
||||||
|
for creator_id in config.BILI_CREATOR_ID_LIST:
|
||||||
|
await self.get_creator_videos(int(creator_id))
|
||||||
else:
|
else:
|
||||||
pass
|
pass
|
||||||
utils.logger.info(
|
utils.logger.info(
|
||||||
|
@ -101,8 +94,8 @@ class BilibiliCrawler(AbstractCrawler):
|
||||||
bili_limit_count = 20 # bilibili limit page fixed value
|
bili_limit_count = 20 # bilibili limit page fixed value
|
||||||
if config.CRAWLER_MAX_NOTES_COUNT < bili_limit_count:
|
if config.CRAWLER_MAX_NOTES_COUNT < bili_limit_count:
|
||||||
config.CRAWLER_MAX_NOTES_COUNT = bili_limit_count
|
config.CRAWLER_MAX_NOTES_COUNT = bili_limit_count
|
||||||
start_page = self.start_page # start page number
|
start_page = config.START_PAGE # start page number
|
||||||
for keyword in self.keyword.split(","):
|
for keyword in config.KEYWORDS.split(","):
|
||||||
utils.logger.info(
|
utils.logger.info(
|
||||||
f"[BilibiliCrawler.search] Current search keyword: {keyword}")
|
f"[BilibiliCrawler.search] Current search keyword: {keyword}")
|
||||||
page = 1
|
page = 1
|
||||||
|
@ -183,7 +176,25 @@ class BilibiliCrawler(AbstractCrawler):
|
||||||
utils.logger.error(
|
utils.logger.error(
|
||||||
f"[BilibiliCrawler.get_comments] may be been blocked, err:{e}")
|
f"[BilibiliCrawler.get_comments] may be been blocked, err:{e}")
|
||||||
|
|
||||||
async def get_specified_videos(self):
|
async def get_creator_videos(self, creator_id: int):
    """Collect every video ``bvid`` of one creator and crawl those videos.

    Pages are requested from ``self.bili_client.get_creator_videos`` with a
    fixed page size of 30, starting at page 1. Paging stops once the total
    reported in ``result["page"]["count"]`` is covered by the pages fetched.
    The collected ids are then passed to ``self.get_specified_videos``.

    :param creator_id: ID of the creator whose videos are listed
    :return: None
    """
    page_size = 30
    page_no = 1
    bvids = []
    while True:
        page = await self.bili_client.get_creator_videos(creator_id, page_no, page_size)
        bvids.extend(item["bvid"] for item in page["list"]["vlist"])
        covered = page_no * page_size
        if int(page["page"]["count"]) <= covered:
            break
        # Random pause of under one second between page requests.
        await asyncio.sleep(random.random())
        page_no += 1
    await self.get_specified_videos(bvids)
|
||||||
|
|
||||||
|
async def get_specified_videos(self, bvids_list: List[str]):
|
||||||
"""
|
"""
|
||||||
get specified videos info
|
get specified videos info
|
||||||
:return:
|
:return:
|
||||||
|
@ -191,7 +202,7 @@ class BilibiliCrawler(AbstractCrawler):
|
||||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||||
task_list = [
|
task_list = [
|
||||||
self.get_video_info_task(aid=0, bvid=video_id, semaphore=semaphore) for video_id in
|
self.get_video_info_task(aid=0, bvid=video_id, semaphore=semaphore) for video_id in
|
||||||
config.BILI_SPECIFIED_ID_LIST
|
bvids_list
|
||||||
]
|
]
|
||||||
video_details = await asyncio.gather(*task_list)
|
video_details = await asyncio.gather(*task_list)
|
||||||
video_aids_list = []
|
video_aids_list = []
|
||||||
|
@ -271,7 +282,7 @@ class BilibiliCrawler(AbstractCrawler):
|
||||||
# feat issue #14
|
# feat issue #14
|
||||||
# we will save login state to avoid login every time
|
# we will save login state to avoid login every time
|
||||||
user_data_dir = os.path.join(os.getcwd(), "browser_data",
|
user_data_dir = os.path.join(os.getcwd(), "browser_data",
|
||||||
config.USER_DATA_DIR % self.platform) # type: ignore
|
config.USER_DATA_DIR % config.PLATFORM) # type: ignore
|
||||||
browser_context = await chromium.launch_persistent_context(
|
browser_context = await chromium.launch_persistent_context(
|
||||||
user_data_dir=user_data_dir,
|
user_data_dir=user_data_dir,
|
||||||
accept_downloads=True,
|
accept_downloads=True,
|
||||||
|
|
|
@ -13,6 +13,7 @@ from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
|
||||||
wait_fixed)
|
wait_fixed)
|
||||||
|
|
||||||
from base.base_crawler import AbstractLogin
|
from base.base_crawler import AbstractLogin
|
||||||
|
import config
|
||||||
from tools import utils
|
from tools import utils
|
||||||
|
|
||||||
|
|
||||||
|
@ -24,7 +25,7 @@ class BilibiliLogin(AbstractLogin):
|
||||||
login_phone: Optional[str] = "",
|
login_phone: Optional[str] = "",
|
||||||
cookie_str: str = ""
|
cookie_str: str = ""
|
||||||
):
|
):
|
||||||
self.login_type = login_type
|
config.LOGIN_TYPE = login_type
|
||||||
self.browser_context = browser_context
|
self.browser_context = browser_context
|
||||||
self.context_page = context_page
|
self.context_page = context_page
|
||||||
self.login_phone = login_phone
|
self.login_phone = login_phone
|
||||||
|
@ -33,11 +34,11 @@ class BilibiliLogin(AbstractLogin):
|
||||||
async def begin(self):
|
async def begin(self):
|
||||||
"""Start login bilibili"""
|
"""Start login bilibili"""
|
||||||
utils.logger.info("[BilibiliLogin.begin] Begin login Bilibili ...")
|
utils.logger.info("[BilibiliLogin.begin] Begin login Bilibili ...")
|
||||||
if self.login_type == "qrcode":
|
if config.LOGIN_TYPE == "qrcode":
|
||||||
await self.login_by_qrcode()
|
await self.login_by_qrcode()
|
||||||
elif self.login_type == "phone":
|
elif config.LOGIN_TYPE == "phone":
|
||||||
await self.login_by_mobile()
|
await self.login_by_mobile()
|
||||||
elif self.login_type == "cookie":
|
elif config.LOGIN_TYPE == "cookie":
|
||||||
await self.login_by_cookies()
|
await self.login_by_cookies()
|
||||||
else:
|
else:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
|
|
|
@ -21,27 +21,14 @@ from .login import DouYinLogin
|
||||||
|
|
||||||
|
|
||||||
class DouYinCrawler(AbstractCrawler):
|
class DouYinCrawler(AbstractCrawler):
|
||||||
platform: str
|
|
||||||
login_type: str
|
|
||||||
crawler_type: str
|
|
||||||
context_page: Page
|
context_page: Page
|
||||||
dy_client: DOUYINClient
|
dy_client: DOUYINClient
|
||||||
browser_context: BrowserContext
|
browser_context: BrowserContext
|
||||||
start_page: int
|
|
||||||
keyword: str
|
|
||||||
|
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
self.start_page = None
|
|
||||||
self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36" # fixed
|
self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36" # fixed
|
||||||
self.index_url = "https://www.douyin.com"
|
self.index_url = "https://www.douyin.com"
|
||||||
|
|
||||||
def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str) -> None:
|
|
||||||
self.platform = platform
|
|
||||||
self.login_type = login_type
|
|
||||||
self.crawler_type = crawler_type
|
|
||||||
self.start_page = start_page
|
|
||||||
self.keyword = keyword
|
|
||||||
|
|
||||||
async def start(self) -> None:
|
async def start(self) -> None:
|
||||||
playwright_proxy_format, httpx_proxy_format = None, None
|
playwright_proxy_format, httpx_proxy_format = None, None
|
||||||
if config.ENABLE_IP_PROXY:
|
if config.ENABLE_IP_PROXY:
|
||||||
|
@ -66,7 +53,7 @@ class DouYinCrawler(AbstractCrawler):
|
||||||
self.dy_client = await self.create_douyin_client(httpx_proxy_format)
|
self.dy_client = await self.create_douyin_client(httpx_proxy_format)
|
||||||
if not await self.dy_client.pong(browser_context=self.browser_context):
|
if not await self.dy_client.pong(browser_context=self.browser_context):
|
||||||
login_obj = DouYinLogin(
|
login_obj = DouYinLogin(
|
||||||
login_type=self.login_type,
|
login_type=config.LOGIN_TYPE,
|
||||||
login_phone="", # you phone number
|
login_phone="", # you phone number
|
||||||
browser_context=self.browser_context,
|
browser_context=self.browser_context,
|
||||||
context_page=self.context_page,
|
context_page=self.context_page,
|
||||||
|
@ -74,14 +61,14 @@ class DouYinCrawler(AbstractCrawler):
|
||||||
)
|
)
|
||||||
await login_obj.begin()
|
await login_obj.begin()
|
||||||
await self.dy_client.update_cookies(browser_context=self.browser_context)
|
await self.dy_client.update_cookies(browser_context=self.browser_context)
|
||||||
crawler_type_var.set(self.crawler_type)
|
crawler_type_var.set(config.CRAWLER_TYPE)
|
||||||
if self.crawler_type == "search":
|
if config.CRAWLER_TYPE == "search":
|
||||||
# Search for notes and retrieve their comment information.
|
# Search for notes and retrieve their comment information.
|
||||||
await self.search()
|
await self.search()
|
||||||
elif self.crawler_type == "detail":
|
elif config.CRAWLER_TYPE == "detail":
|
||||||
# Get the information and comments of the specified post
|
# Get the information and comments of the specified post
|
||||||
await self.get_specified_awemes()
|
await self.get_specified_awemes()
|
||||||
elif self.crawler_type == "creator":
|
elif config.CRAWLER_TYPE == "creator":
|
||||||
# Get the information and comments of the specified creator
|
# Get the information and comments of the specified creator
|
||||||
await self.get_creators_and_videos()
|
await self.get_creators_and_videos()
|
||||||
|
|
||||||
|
@ -92,8 +79,8 @@ class DouYinCrawler(AbstractCrawler):
|
||||||
dy_limit_count = 10 # douyin limit page fixed value
|
dy_limit_count = 10 # douyin limit page fixed value
|
||||||
if config.CRAWLER_MAX_NOTES_COUNT < dy_limit_count:
|
if config.CRAWLER_MAX_NOTES_COUNT < dy_limit_count:
|
||||||
config.CRAWLER_MAX_NOTES_COUNT = dy_limit_count
|
config.CRAWLER_MAX_NOTES_COUNT = dy_limit_count
|
||||||
start_page = self.start_page # start page number
|
start_page = config.START_PAGE # start page number
|
||||||
for keyword in self.keyword.split(","):
|
for keyword in config.KEYWORDS.split(","):
|
||||||
utils.logger.info(f"[DouYinCrawler.search] Current keyword: {keyword}")
|
utils.logger.info(f"[DouYinCrawler.search] Current keyword: {keyword}")
|
||||||
aweme_list: List[str] = []
|
aweme_list: List[str] = []
|
||||||
page = 0
|
page = 0
|
||||||
|
@ -259,7 +246,7 @@ class DouYinCrawler(AbstractCrawler):
|
||||||
"""Launch browser and create browser context"""
|
"""Launch browser and create browser context"""
|
||||||
if config.SAVE_LOGIN_STATE:
|
if config.SAVE_LOGIN_STATE:
|
||||||
user_data_dir = os.path.join(os.getcwd(), "browser_data",
|
user_data_dir = os.path.join(os.getcwd(), "browser_data",
|
||||||
config.USER_DATA_DIR % self.platform) # type: ignore
|
config.USER_DATA_DIR % config.PLATFORM) # type: ignore
|
||||||
browser_context = await chromium.launch_persistent_context(
|
browser_context = await chromium.launch_persistent_context(
|
||||||
user_data_dir=user_data_dir,
|
user_data_dir=user_data_dir,
|
||||||
accept_downloads=True,
|
accept_downloads=True,
|
||||||
|
|
|
@ -23,7 +23,7 @@ class DouYinLogin(AbstractLogin):
|
||||||
login_phone: Optional[str] = "",
|
login_phone: Optional[str] = "",
|
||||||
cookie_str: Optional[str] = ""
|
cookie_str: Optional[str] = ""
|
||||||
):
|
):
|
||||||
self.login_type = login_type
|
config.LOGIN_TYPE = login_type
|
||||||
self.browser_context = browser_context
|
self.browser_context = browser_context
|
||||||
self.context_page = context_page
|
self.context_page = context_page
|
||||||
self.login_phone = login_phone
|
self.login_phone = login_phone
|
||||||
|
@ -40,11 +40,11 @@ class DouYinLogin(AbstractLogin):
|
||||||
await self.popup_login_dialog()
|
await self.popup_login_dialog()
|
||||||
|
|
||||||
# select login type
|
# select login type
|
||||||
if self.login_type == "qrcode":
|
if config.LOGIN_TYPE == "qrcode":
|
||||||
await self.login_by_qrcode()
|
await self.login_by_qrcode()
|
||||||
elif self.login_type == "phone":
|
elif config.LOGIN_TYPE == "phone":
|
||||||
await self.login_by_mobile()
|
await self.login_by_mobile()
|
||||||
elif self.login_type == "cookie":
|
elif config.LOGIN_TYPE == "cookie":
|
||||||
await self.login_by_cookies()
|
await self.login_by_cookies()
|
||||||
else:
|
else:
|
||||||
raise ValueError("[DouYinLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")
|
raise ValueError("[DouYinLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")
|
||||||
|
|
|
@ -21,9 +21,6 @@ from .login import KuaishouLogin
|
||||||
|
|
||||||
|
|
||||||
class KuaishouCrawler(AbstractCrawler):
|
class KuaishouCrawler(AbstractCrawler):
|
||||||
platform: str
|
|
||||||
login_type: str
|
|
||||||
crawler_type: str
|
|
||||||
context_page: Page
|
context_page: Page
|
||||||
ks_client: KuaiShouClient
|
ks_client: KuaiShouClient
|
||||||
browser_context: BrowserContext
|
browser_context: BrowserContext
|
||||||
|
@ -32,13 +29,6 @@ class KuaishouCrawler(AbstractCrawler):
|
||||||
self.index_url = "https://www.kuaishou.com"
|
self.index_url = "https://www.kuaishou.com"
|
||||||
self.user_agent = utils.get_user_agent()
|
self.user_agent = utils.get_user_agent()
|
||||||
|
|
||||||
def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
|
|
||||||
self.platform = platform
|
|
||||||
self.login_type = login_type
|
|
||||||
self.crawler_type = crawler_type
|
|
||||||
self.start_page = start_page
|
|
||||||
self.keyword = keyword
|
|
||||||
|
|
||||||
async def start(self):
|
async def start(self):
|
||||||
playwright_proxy_format, httpx_proxy_format = None, None
|
playwright_proxy_format, httpx_proxy_format = None, None
|
||||||
if config.ENABLE_IP_PROXY:
|
if config.ENABLE_IP_PROXY:
|
||||||
|
@ -64,7 +54,7 @@ class KuaishouCrawler(AbstractCrawler):
|
||||||
self.ks_client = await self.create_ks_client(httpx_proxy_format)
|
self.ks_client = await self.create_ks_client(httpx_proxy_format)
|
||||||
if not await self.ks_client.pong():
|
if not await self.ks_client.pong():
|
||||||
login_obj = KuaishouLogin(
|
login_obj = KuaishouLogin(
|
||||||
login_type=self.login_type,
|
login_type=config.LOGIN_TYPE,
|
||||||
login_phone=httpx_proxy_format,
|
login_phone=httpx_proxy_format,
|
||||||
browser_context=self.browser_context,
|
browser_context=self.browser_context,
|
||||||
context_page=self.context_page,
|
context_page=self.context_page,
|
||||||
|
@ -73,11 +63,11 @@ class KuaishouCrawler(AbstractCrawler):
|
||||||
await login_obj.begin()
|
await login_obj.begin()
|
||||||
await self.ks_client.update_cookies(browser_context=self.browser_context)
|
await self.ks_client.update_cookies(browser_context=self.browser_context)
|
||||||
|
|
||||||
crawler_type_var.set(self.crawler_type)
|
crawler_type_var.set(config.CRAWLER_TYPE)
|
||||||
if self.crawler_type == "search":
|
if config.CRAWLER_TYPE == "search":
|
||||||
# Search for notes and retrieve their comment information.
|
# Search for notes and retrieve their comment information.
|
||||||
await self.search()
|
await self.search()
|
||||||
elif self.crawler_type == "detail":
|
elif config.CRAWLER_TYPE == "detail":
|
||||||
# Get the information and comments of the specified post
|
# Get the information and comments of the specified post
|
||||||
await self.get_specified_videos()
|
await self.get_specified_videos()
|
||||||
else:
|
else:
|
||||||
|
@ -90,8 +80,8 @@ class KuaishouCrawler(AbstractCrawler):
|
||||||
ks_limit_count = 20 # kuaishou limit page fixed value
|
ks_limit_count = 20 # kuaishou limit page fixed value
|
||||||
if config.CRAWLER_MAX_NOTES_COUNT < ks_limit_count:
|
if config.CRAWLER_MAX_NOTES_COUNT < ks_limit_count:
|
||||||
config.CRAWLER_MAX_NOTES_COUNT = ks_limit_count
|
config.CRAWLER_MAX_NOTES_COUNT = ks_limit_count
|
||||||
start_page = self.start_page
|
start_page = config.START_PAGE
|
||||||
for keyword in self.keyword.split(","):
|
for keyword in config.KEYWORDS.split(","):
|
||||||
utils.logger.info(f"[KuaishouCrawler.search] Current search keyword: {keyword}")
|
utils.logger.info(f"[KuaishouCrawler.search] Current search keyword: {keyword}")
|
||||||
page = 1
|
page = 1
|
||||||
while (page - start_page + 1) * ks_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
|
while (page - start_page + 1) * ks_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
|
||||||
|
@ -238,7 +228,7 @@ class KuaishouCrawler(AbstractCrawler):
|
||||||
utils.logger.info("[KuaishouCrawler.launch_browser] Begin create browser context ...")
|
utils.logger.info("[KuaishouCrawler.launch_browser] Begin create browser context ...")
|
||||||
if config.SAVE_LOGIN_STATE:
|
if config.SAVE_LOGIN_STATE:
|
||||||
user_data_dir = os.path.join(os.getcwd(), "browser_data",
|
user_data_dir = os.path.join(os.getcwd(), "browser_data",
|
||||||
config.USER_DATA_DIR % self.platform) # type: ignore
|
config.USER_DATA_DIR % config.PLATFORM) # type: ignore
|
||||||
browser_context = await chromium.launch_persistent_context(
|
browser_context = await chromium.launch_persistent_context(
|
||||||
user_data_dir=user_data_dir,
|
user_data_dir=user_data_dir,
|
||||||
accept_downloads=True,
|
accept_downloads=True,
|
||||||
|
|
|
@ -19,7 +19,7 @@ class KuaishouLogin(AbstractLogin):
|
||||||
login_phone: Optional[str] = "",
|
login_phone: Optional[str] = "",
|
||||||
cookie_str: str = ""
|
cookie_str: str = ""
|
||||||
):
|
):
|
||||||
self.login_type = login_type
|
config.LOGIN_TYPE = login_type
|
||||||
self.browser_context = browser_context
|
self.browser_context = browser_context
|
||||||
self.context_page = context_page
|
self.context_page = context_page
|
||||||
self.login_phone = login_phone
|
self.login_phone = login_phone
|
||||||
|
@ -28,11 +28,11 @@ class KuaishouLogin(AbstractLogin):
|
||||||
async def begin(self):
|
async def begin(self):
|
||||||
"""Start login xiaohongshu"""
|
"""Start login xiaohongshu"""
|
||||||
utils.logger.info("[KuaishouLogin.begin] Begin login kuaishou ...")
|
utils.logger.info("[KuaishouLogin.begin] Begin login kuaishou ...")
|
||||||
if self.login_type == "qrcode":
|
if config.LOGIN_TYPE == "qrcode":
|
||||||
await self.login_by_qrcode()
|
await self.login_by_qrcode()
|
||||||
elif self.login_type == "phone":
|
elif config.LOGIN_TYPE == "phone":
|
||||||
await self.login_by_mobile()
|
await self.login_by_mobile()
|
||||||
elif self.login_type == "cookie":
|
elif config.LOGIN_TYPE == "cookie":
|
||||||
await self.login_by_cookies()
|
await self.login_by_cookies()
|
||||||
else:
|
else:
|
||||||
raise ValueError("[KuaishouLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")
|
raise ValueError("[KuaishouLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")
|
||||||
|
|
|
@ -28,9 +28,6 @@ from .login import WeiboLogin
|
||||||
|
|
||||||
|
|
||||||
class WeiboCrawler(AbstractCrawler):
|
class WeiboCrawler(AbstractCrawler):
|
||||||
platform: str
|
|
||||||
login_type: str
|
|
||||||
crawler_type: str
|
|
||||||
context_page: Page
|
context_page: Page
|
||||||
wb_client: WeiboClient
|
wb_client: WeiboClient
|
||||||
browser_context: BrowserContext
|
browser_context: BrowserContext
|
||||||
|
@ -41,13 +38,6 @@ class WeiboCrawler(AbstractCrawler):
|
||||||
self.user_agent = utils.get_user_agent()
|
self.user_agent = utils.get_user_agent()
|
||||||
self.mobile_user_agent = utils.get_mobile_user_agent()
|
self.mobile_user_agent = utils.get_mobile_user_agent()
|
||||||
|
|
||||||
def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
|
|
||||||
self.platform = platform
|
|
||||||
self.login_type = login_type
|
|
||||||
self.crawler_type = crawler_type
|
|
||||||
self.start_page = start_page
|
|
||||||
self.keyword = keyword
|
|
||||||
|
|
||||||
async def start(self):
|
async def start(self):
|
||||||
playwright_proxy_format, httpx_proxy_format = None, None
|
playwright_proxy_format, httpx_proxy_format = None, None
|
||||||
if config.ENABLE_IP_PROXY:
|
if config.ENABLE_IP_PROXY:
|
||||||
|
@ -73,7 +63,7 @@ class WeiboCrawler(AbstractCrawler):
|
||||||
self.wb_client = await self.create_weibo_client(httpx_proxy_format)
|
self.wb_client = await self.create_weibo_client(httpx_proxy_format)
|
||||||
if not await self.wb_client.pong():
|
if not await self.wb_client.pong():
|
||||||
login_obj = WeiboLogin(
|
login_obj = WeiboLogin(
|
||||||
login_type=self.login_type,
|
login_type=config.LOGIN_TYPE,
|
||||||
login_phone="", # your phone number
|
login_phone="", # your phone number
|
||||||
browser_context=self.browser_context,
|
browser_context=self.browser_context,
|
||||||
context_page=self.context_page,
|
context_page=self.context_page,
|
||||||
|
@ -89,11 +79,11 @@ class WeiboCrawler(AbstractCrawler):
|
||||||
await asyncio.sleep(2)
|
await asyncio.sleep(2)
|
||||||
await self.wb_client.update_cookies(browser_context=self.browser_context)
|
await self.wb_client.update_cookies(browser_context=self.browser_context)
|
||||||
|
|
||||||
crawler_type_var.set(self.crawler_type)
|
crawler_type_var.set(config.CRAWLER_TYPE)
|
||||||
if self.crawler_type == "search":
|
if config.CRAWLER_TYPE == "search":
|
||||||
# Search for video and retrieve their comment information.
|
# Search for video and retrieve their comment information.
|
||||||
await self.search()
|
await self.search()
|
||||||
elif self.crawler_type == "detail":
|
elif config.CRAWLER_TYPE == "detail":
|
||||||
# Get the information and comments of the specified post
|
# Get the information and comments of the specified post
|
||||||
await self.get_specified_notes()
|
await self.get_specified_notes()
|
||||||
else:
|
else:
|
||||||
|
@ -109,8 +99,8 @@ class WeiboCrawler(AbstractCrawler):
|
||||||
weibo_limit_count = 10 # weibo limit page fixed value
|
weibo_limit_count = 10 # weibo limit page fixed value
|
||||||
if config.CRAWLER_MAX_NOTES_COUNT < weibo_limit_count:
|
if config.CRAWLER_MAX_NOTES_COUNT < weibo_limit_count:
|
||||||
config.CRAWLER_MAX_NOTES_COUNT = weibo_limit_count
|
config.CRAWLER_MAX_NOTES_COUNT = weibo_limit_count
|
||||||
start_page = self.start_page
|
start_page = config.START_PAGE
|
||||||
for keyword in self.keyword.split(","):
|
for keyword in config.KEYWORDS.split(","):
|
||||||
utils.logger.info(f"[WeiboCrawler.search] Current search keyword: {keyword}")
|
utils.logger.info(f"[WeiboCrawler.search] Current search keyword: {keyword}")
|
||||||
page = 1
|
page = 1
|
||||||
while (page - start_page + 1) * weibo_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
|
while (page - start_page + 1) * weibo_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
|
||||||
|
@ -274,7 +264,7 @@ class WeiboCrawler(AbstractCrawler):
|
||||||
utils.logger.info("[WeiboCrawler.launch_browser] Begin create browser context ...")
|
utils.logger.info("[WeiboCrawler.launch_browser] Begin create browser context ...")
|
||||||
if config.SAVE_LOGIN_STATE:
|
if config.SAVE_LOGIN_STATE:
|
||||||
user_data_dir = os.path.join(os.getcwd(), "browser_data",
|
user_data_dir = os.path.join(os.getcwd(), "browser_data",
|
||||||
config.USER_DATA_DIR % self.platform) # type: ignore
|
config.USER_DATA_DIR % config.PLATFORM) # type: ignore
|
||||||
browser_context = await chromium.launch_persistent_context(
|
browser_context = await chromium.launch_persistent_context(
|
||||||
user_data_dir=user_data_dir,
|
user_data_dir=user_data_dir,
|
||||||
accept_downloads=True,
|
accept_downloads=True,
|
||||||
|
|
|
@ -24,7 +24,7 @@ class WeiboLogin(AbstractLogin):
|
||||||
login_phone: Optional[str] = "",
|
login_phone: Optional[str] = "",
|
||||||
cookie_str: str = ""
|
cookie_str: str = ""
|
||||||
):
|
):
|
||||||
self.login_type = login_type
|
config.LOGIN_TYPE = login_type
|
||||||
self.browser_context = browser_context
|
self.browser_context = browser_context
|
||||||
self.context_page = context_page
|
self.context_page = context_page
|
||||||
self.login_phone = login_phone
|
self.login_phone = login_phone
|
||||||
|
@ -33,11 +33,11 @@ class WeiboLogin(AbstractLogin):
|
||||||
async def begin(self):
|
async def begin(self):
|
||||||
"""Start login weibo"""
|
"""Start login weibo"""
|
||||||
utils.logger.info("[WeiboLogin.begin] Begin login weibo ...")
|
utils.logger.info("[WeiboLogin.begin] Begin login weibo ...")
|
||||||
if self.login_type == "qrcode":
|
if config.LOGIN_TYPE == "qrcode":
|
||||||
await self.login_by_qrcode()
|
await self.login_by_qrcode()
|
||||||
elif self.login_type == "phone":
|
elif config.LOGIN_TYPE == "phone":
|
||||||
await self.login_by_mobile()
|
await self.login_by_mobile()
|
||||||
elif self.login_type == "cookie":
|
elif config.LOGIN_TYPE == "cookie":
|
||||||
await self.login_by_cookies()
|
await self.login_by_cookies()
|
||||||
else:
|
else:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
|
|
|
@ -21,9 +21,6 @@ from .login import XiaoHongShuLogin
|
||||||
|
|
||||||
|
|
||||||
class XiaoHongShuCrawler(AbstractCrawler):
|
class XiaoHongShuCrawler(AbstractCrawler):
|
||||||
platform: str
|
|
||||||
login_type: str
|
|
||||||
crawler_type: str
|
|
||||||
context_page: Page
|
context_page: Page
|
||||||
xhs_client: XiaoHongShuClient
|
xhs_client: XiaoHongShuClient
|
||||||
browser_context: BrowserContext
|
browser_context: BrowserContext
|
||||||
|
@ -32,13 +29,6 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||||
self.index_url = "https://www.xiaohongshu.com"
|
self.index_url = "https://www.xiaohongshu.com"
|
||||||
self.user_agent = utils.get_user_agent()
|
self.user_agent = utils.get_user_agent()
|
||||||
|
|
||||||
def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str) -> None:
|
|
||||||
self.platform = platform
|
|
||||||
self.login_type = login_type
|
|
||||||
self.crawler_type = crawler_type
|
|
||||||
self.start_page = start_page
|
|
||||||
self.keyword = keyword
|
|
||||||
|
|
||||||
async def start(self) -> None:
|
async def start(self) -> None:
|
||||||
playwright_proxy_format, httpx_proxy_format = None, None
|
playwright_proxy_format, httpx_proxy_format = None, None
|
||||||
if config.ENABLE_IP_PROXY:
|
if config.ENABLE_IP_PROXY:
|
||||||
|
@ -71,7 +61,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||||
self.xhs_client = await self.create_xhs_client(httpx_proxy_format)
|
self.xhs_client = await self.create_xhs_client(httpx_proxy_format)
|
||||||
if not await self.xhs_client.pong():
|
if not await self.xhs_client.pong():
|
||||||
login_obj = XiaoHongShuLogin(
|
login_obj = XiaoHongShuLogin(
|
||||||
login_type=self.login_type,
|
login_type=config.LOGIN_TYPE,
|
||||||
login_phone="", # input your phone number
|
login_phone="", # input your phone number
|
||||||
browser_context=self.browser_context,
|
browser_context=self.browser_context,
|
||||||
context_page=self.context_page,
|
context_page=self.context_page,
|
||||||
|
@ -80,14 +70,14 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||||
await login_obj.begin()
|
await login_obj.begin()
|
||||||
await self.xhs_client.update_cookies(browser_context=self.browser_context)
|
await self.xhs_client.update_cookies(browser_context=self.browser_context)
|
||||||
|
|
||||||
crawler_type_var.set(self.crawler_type)
|
crawler_type_var.set(config.CRAWLER_TYPE)
|
||||||
if self.crawler_type == "search":
|
if config.CRAWLER_TYPE == "search":
|
||||||
# Search for notes and retrieve their comment information.
|
# Search for notes and retrieve their comment information.
|
||||||
await self.search()
|
await self.search()
|
||||||
elif self.crawler_type == "detail":
|
elif config.CRAWLER_TYPE == "detail":
|
||||||
# Get the information and comments of the specified post
|
# Get the information and comments of the specified post
|
||||||
await self.get_specified_notes()
|
await self.get_specified_notes()
|
||||||
elif self.crawler_type == "creator":
|
elif config.CRAWLER_TYPE == "creator":
|
||||||
# Get creator's information and their notes and comments
|
# Get creator's information and their notes and comments
|
||||||
await self.get_creators_and_notes()
|
await self.get_creators_and_notes()
|
||||||
else:
|
else:
|
||||||
|
@ -101,8 +91,8 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||||
xhs_limit_count = 20 # xhs limit page fixed value
|
xhs_limit_count = 20 # xhs limit page fixed value
|
||||||
if config.CRAWLER_MAX_NOTES_COUNT < xhs_limit_count:
|
if config.CRAWLER_MAX_NOTES_COUNT < xhs_limit_count:
|
||||||
config.CRAWLER_MAX_NOTES_COUNT = xhs_limit_count
|
config.CRAWLER_MAX_NOTES_COUNT = xhs_limit_count
|
||||||
start_page = self.start_page
|
start_page = config.START_PAGE
|
||||||
for keyword in self.keyword.split(","):
|
for keyword in config.KEYWORDS.split(","):
|
||||||
utils.logger.info(f"[XiaoHongShuCrawler.search] Current search keyword: {keyword}")
|
utils.logger.info(f"[XiaoHongShuCrawler.search] Current search keyword: {keyword}")
|
||||||
page = 1
|
page = 1
|
||||||
while (page - start_page + 1) * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
|
while (page - start_page + 1) * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
|
||||||
|
@ -264,7 +254,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||||
# feat issue #14
|
# feat issue #14
|
||||||
# we will save login state to avoid login every time
|
# we will save login state to avoid login every time
|
||||||
user_data_dir = os.path.join(os.getcwd(), "browser_data",
|
user_data_dir = os.path.join(os.getcwd(), "browser_data",
|
||||||
config.USER_DATA_DIR % self.platform) # type: ignore
|
config.USER_DATA_DIR % config.PLATFORM) # type: ignore
|
||||||
browser_context = await chromium.launch_persistent_context(
|
browser_context = await chromium.launch_persistent_context(
|
||||||
user_data_dir=user_data_dir,
|
user_data_dir=user_data_dir,
|
||||||
accept_downloads=True,
|
accept_downloads=True,
|
||||||
|
|
|
@ -22,7 +22,7 @@ class XiaoHongShuLogin(AbstractLogin):
|
||||||
login_phone: Optional[str] = "",
|
login_phone: Optional[str] = "",
|
||||||
cookie_str: str = ""
|
cookie_str: str = ""
|
||||||
):
|
):
|
||||||
self.login_type = login_type
|
config.LOGIN_TYPE = login_type
|
||||||
self.browser_context = browser_context
|
self.browser_context = browser_context
|
||||||
self.context_page = context_page
|
self.context_page = context_page
|
||||||
self.login_phone = login_phone
|
self.login_phone = login_phone
|
||||||
|
@ -49,11 +49,11 @@ class XiaoHongShuLogin(AbstractLogin):
|
||||||
async def begin(self):
|
async def begin(self):
|
||||||
"""Start login xiaohongshu"""
|
"""Start login xiaohongshu"""
|
||||||
utils.logger.info("[XiaoHongShuLogin.begin] Begin login xiaohongshu ...")
|
utils.logger.info("[XiaoHongShuLogin.begin] Begin login xiaohongshu ...")
|
||||||
if self.login_type == "qrcode":
|
if config.LOGIN_TYPE == "qrcode":
|
||||||
await self.login_by_qrcode()
|
await self.login_by_qrcode()
|
||||||
elif self.login_type == "phone":
|
elif config.LOGIN_TYPE == "phone":
|
||||||
await self.login_by_mobile()
|
await self.login_by_mobile()
|
||||||
elif self.login_type == "cookie":
|
elif config.LOGIN_TYPE == "cookie":
|
||||||
await self.login_by_cookies()
|
await self.login_by_cookies()
|
||||||
else:
|
else:
|
||||||
raise ValueError("[XiaoHongShuLogin.begin]I nvalid Login Type Currently only supported qrcode or phone or cookies ...")
|
raise ValueError("[XiaoHongShuLogin.begin]I nvalid Login Type Currently only supported qrcode or phone or cookies ...")
|
||||||
|
|
|
@ -311,4 +311,7 @@ ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
|
||||||
ALTER TABLE `douyin_aweme_comment`
|
ALTER TABLE `douyin_aweme_comment`
|
||||||
ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
|
ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
|
||||||
|
|
||||||
|
ALTER TABLE `bilibili_video_comment`
|
||||||
|
ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
|
||||||
|
|
||||||
SET FOREIGN_KEY_CHECKS = 1;
|
SET FOREIGN_KEY_CHECKS = 1;
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
import argparse
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
from .crawler_util import *
|
from .crawler_util import *
|
||||||
|
@ -18,3 +19,13 @@ def init_loging_config():
|
||||||
|
|
||||||
|
|
||||||
logger = init_loging_config()
|
logger = init_loging_config()
|
||||||
|
|
||||||
|
def str2bool(v):
    """Convert a textual flag value to a ``bool``.

    Args:
        v: The value to interpret. ``bool`` values are returned unchanged;
            anything else is converted with ``str()``, stripped of
            surrounding whitespace and compared case-insensitively.

    Returns:
        ``True`` for ``yes``/``true``/``t``/``y``/``1`` and ``False`` for
        ``no``/``false``/``f``/``n``/``0``.

    Raises:
        argparse.ArgumentTypeError: If the value matches neither set.
    """
    if isinstance(v, bool):
        return v
    # Normalise once. Going through str() means non-string input (None,
    # numbers) is reported as ArgumentTypeError instead of failing with
    # AttributeError on ``.lower()``.
    normalized = str(v).strip().lower()
    if normalized in ('yes', 'true', 't', 'y', '1'):
        return True
    if normalized in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')
|
||||||
|
|
Loading…
Reference in New Issue