Merge pull request #229 from Tianci-King/main

feat(core): add a start_page parameter to control which page the crawler starts from; perf(argparse): add the new program arguments to the command-line parser…

This commit is contained in:
commit a341dc2aff
@@ -165,4 +165,6 @@ cython_debug/
 /temp_image/
 /browser_data/
 /data/
 /cache
+
+*/.DS_Store
@@ -6,7 +6,7 @@ from playwright.async_api import BrowserContext, BrowserType

 class AbstractCrawler(ABC):
     @abstractmethod
-    def init_config(self, platform: str, login_type: str, crawler_type: str):
+    def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
         pass

     @abstractmethod
@@ -27,6 +27,9 @@ SAVE_DATA_OPTION = "json"  # csv or db or json

 # 用户浏览器缓存的浏览器文件配置
 USER_DATA_DIR = "%s_user_data_dir"  # %s will be replaced by platform name

+# 爬取开始页数 默认从第一页开始
+START_PAGE = 1
+
 # 爬取视频/帖子的数量控制
 CRAWLER_MAX_NOTES_COUNT = 20
main.py
@@ -38,7 +38,11 @@ async def main():
                         choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE)
     parser.add_argument('--type', type=str, help='crawler type (search | detail | creator)',
                         choices=["search", "detail", "creator"], default=config.CRAWLER_TYPE)
+    parser.add_argument('--start', type=int, help='start page number of the crawl',
+                        default=config.START_PAGE)
+    parser.add_argument('--keyword', type=str, help='search keywords, separated by commas',
+                        default=config.KEYWORDS)

     # init db
     if config.SAVE_DATA_OPTION == "db":
         await db.init_db()
@@ -48,7 +52,9 @@ async def main():
     crawler.init_config(
         platform=args.platform,
         login_type=args.lt,
-        crawler_type=args.type
+        crawler_type=args.type,
+        start_page=args.start,
+        keyword=args.keyword
     )
     await crawler.start()

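For context, a hypothetical invocation of the new options might look like the following; the platform and keyword values are illustrative and not part of this commit:

    python main.py --platform xhs --lt qrcode --type search --start 3 --keyword "python,编程"

With --start 3 the crawler begins fetching from page 3 of the search results instead of page 1.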
@@ -37,10 +37,12 @@ class BilibiliCrawler(AbstractCrawler):
         self.index_url = "https://www.bilibili.com"
         self.user_agent = utils.get_user_agent()

-    def init_config(self, platform: str, login_type: str, crawler_type: str):
+    def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
         self.platform = platform
         self.login_type = login_type
         self.crawler_type = crawler_type
+        self.start_page = start_page
+        self.keyword = keyword

     async def start(self):
         playwright_proxy_format, httpx_proxy_format = None, None
@@ -96,10 +98,16 @@ class BilibiliCrawler(AbstractCrawler):
         bili_limit_count = 20  # bilibili limit page fixed value
         if config.CRAWLER_MAX_NOTES_COUNT < bili_limit_count:
             config.CRAWLER_MAX_NOTES_COUNT = bili_limit_count
-        for keyword in config.KEYWORDS.split(","):
+        start_page = self.start_page  # start page number
+        for keyword in self.keyword.split(","):
             utils.logger.info(f"[BilibiliCrawler.search] Current search keyword: {keyword}")
             page = 1
-            while page * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
+            while (page - start_page + 1) * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
+                if page < start_page:
+                    utils.logger.info(f"[BilibiliCrawler.search] Skip page: {page}")
+                    page += 1
+                    continue
+
                 video_id_list: List[str] = []
                 videos_res = await self.bili_client.search_video_by_keyword(
                     keyword=keyword,
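To make the new loop arithmetic concrete, here is a small self-contained sketch of the bilibili paging logic above, with illustrative numbers that are not taken from the commit: with a page size of 20, CRAWLER_MAX_NOTES_COUNT = 40 and start_page = 3, pages 1-2 are skipped and pages 3-4 are fetched, so the number of fetched pages is unchanged and only the window is shifted forward.

# Standalone sketch of the paging condition introduced above (illustrative values).
CRAWLER_MAX_NOTES_COUNT = 40  # assumed config value for this example
bili_limit_count = 20         # page size used by the bilibili search
start_page = 3                # what --start would supply

page = 1
fetched = []
while (page - start_page + 1) * bili_limit_count <= CRAWLER_MAX_NOTES_COUNT:
    if page < start_page:      # pages before --start are skipped without a request
        page += 1
        continue
    fetched.append(page)       # stand-in for the actual search_video_by_keyword call
    page += 1

print(fetched)  # [3, 4]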
@@ -32,10 +32,12 @@ class DouYinCrawler(AbstractCrawler):
         self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"  # fixed
         self.index_url = "https://www.douyin.com"

-    def init_config(self, platform: str, login_type: str, crawler_type: str) -> None:
+    def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str) -> None:
         self.platform = platform
         self.login_type = login_type
         self.crawler_type = crawler_type
+        self.start_page = start_page
+        self.keyword = keyword

     async def start(self) -> None:
         playwright_proxy_format, httpx_proxy_format = None, None
@@ -84,11 +86,16 @@ class DouYinCrawler(AbstractCrawler):
         dy_limit_count = 10  # douyin limit page fixed value
         if config.CRAWLER_MAX_NOTES_COUNT < dy_limit_count:
             config.CRAWLER_MAX_NOTES_COUNT = dy_limit_count
-        for keyword in config.KEYWORDS.split(","):
+        start_page = self.start_page  # start page number
+        for keyword in self.keyword.split(","):
             utils.logger.info(f"[DouYinCrawler.search] Current keyword: {keyword}")
             aweme_list: List[str] = []
             page = 0
-            while (page + 1) * dy_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
+            while (page - start_page + 1) * dy_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
+                if page < start_page:
+                    utils.logger.info(f"[DouYinCrawler.search] Skip {page}")
+                    page += 1
+                    continue
                 try:
                     posts_res = await self.dy_client.search_info_by_keyword(keyword=keyword,
                                                                             offset=page * dy_limit_count,
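One behavioural detail worth flagging as an observation on the hunk above (it is not stated in the commit message): DouYinCrawler counts pages from 0 while START_PAGE defaults to 1, so under the default settings the new if page < start_page guard skips the offset-0 request. A minimal comparison of the offsets each version fetches, using the defaults shown in the config hunk:

# Offsets requested by the old and the new douyin loop under the default config
# (CRAWLER_MAX_NOTES_COUNT = 20, dy_limit_count = 10, START_PAGE = 1).
MAX_COUNT, LIMIT, START = 20, 10, 1

old_offsets = [p * LIMIT for p in range(10) if (p + 1) * LIMIT <= MAX_COUNT]
new_offsets = [p * LIMIT for p in range(10)
               if (p - START + 1) * LIMIT <= MAX_COUNT and p >= START]

print(old_offsets)  # [0, 10]
print(new_offsets)  # [10, 20]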
@@ -32,10 +32,12 @@ class KuaishouCrawler(AbstractCrawler):
         self.index_url = "https://www.kuaishou.com"
         self.user_agent = utils.get_user_agent()

-    def init_config(self, platform: str, login_type: str, crawler_type: str):
+    def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
         self.platform = platform
         self.login_type = login_type
         self.crawler_type = crawler_type
+        self.start_page = start_page
+        self.keyword = keyword

     async def start(self):
         playwright_proxy_format, httpx_proxy_format = None, None
@@ -88,10 +90,16 @@ class KuaishouCrawler(AbstractCrawler):
         ks_limit_count = 20  # kuaishou limit page fixed value
         if config.CRAWLER_MAX_NOTES_COUNT < ks_limit_count:
             config.CRAWLER_MAX_NOTES_COUNT = ks_limit_count
-        for keyword in config.KEYWORDS.split(","):
+        start_page = self.start_page
+        for keyword in self.keyword.split(","):
             utils.logger.info(f"[KuaishouCrawler.search] Current search keyword: {keyword}")
             page = 1
-            while page * ks_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
+            while (page - start_page + 1) * ks_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
+                if page < start_page:
+                    utils.logger.info(f"[KuaishouCrawler.search] Skip page: {page}")
+                    page += 1
+                    continue
+
                 video_id_list: List[str] = []
                 videos_res = await self.ks_client.search_info_by_keyword(
                     keyword=keyword,
@@ -40,10 +40,12 @@ class WeiboCrawler(AbstractCrawler):
         self.user_agent = utils.get_user_agent()
         self.mobile_user_agent = utils.get_mobile_user_agent()

-    def init_config(self, platform: str, login_type: str, crawler_type: str):
+    def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
         self.platform = platform
         self.login_type = login_type
         self.crawler_type = crawler_type
+        self.start_page = start_page
+        self.keyword = keyword

     async def start(self):
         playwright_proxy_format, httpx_proxy_format = None, None
@@ -106,10 +108,16 @@ class WeiboCrawler(AbstractCrawler):
         weibo_limit_count = 10  # weibo limit page fixed value
         if config.CRAWLER_MAX_NOTES_COUNT < weibo_limit_count:
             config.CRAWLER_MAX_NOTES_COUNT = weibo_limit_count
-        for keyword in config.KEYWORDS.split(","):
+        start_page = self.start_page
+        for keyword in self.keyword.split(","):
             utils.logger.info(f"[WeiboCrawler.search] Current search keyword: {keyword}")
             page = 1
-            while page * weibo_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
+            while (page - start_page + 1) * weibo_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
+                if page < start_page:
+                    utils.logger.info(f"[WeiboCrawler.search] Skip page: {page}")
+                    page += 1
+                    continue
+
                 search_res = await self.wb_client.get_note_by_keyword(
                     keyword=keyword,
                     page=page,
@@ -32,10 +32,12 @@ class XiaoHongShuCrawler(AbstractCrawler):
         self.index_url = "https://www.xiaohongshu.com"
         self.user_agent = utils.get_user_agent()

-    def init_config(self, platform: str, login_type: str, crawler_type: str) -> None:
+    def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str) -> None:
         self.platform = platform
         self.login_type = login_type
         self.crawler_type = crawler_type
+        self.start_page = start_page
+        self.keyword = keyword

     async def start(self) -> None:
         playwright_proxy_format, httpx_proxy_format = None, None
@@ -99,31 +101,41 @@ class XiaoHongShuCrawler(AbstractCrawler):
         xhs_limit_count = 20  # xhs limit page fixed value
         if config.CRAWLER_MAX_NOTES_COUNT < xhs_limit_count:
             config.CRAWLER_MAX_NOTES_COUNT = xhs_limit_count
-        for keyword in config.KEYWORDS.split(","):
+        start_page = self.start_page
+        for keyword in self.keyword.split(","):
             utils.logger.info(f"[XiaoHongShuCrawler.search] Current search keyword: {keyword}")
             page = 1
-            while page * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
-                note_id_list: List[str] = []
-                notes_res = await self.xhs_client.get_note_by_keyword(
-                    keyword=keyword,
-                    page=page,
-                    sort=SearchSortType(config.SORT_TYPE) if config.SORT_TYPE != '' else SearchSortType.GENERAL,
-                )
-                utils.logger.info(f"[XiaoHongShuCrawler.search] Search notes res:{notes_res}")
-                semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
-                task_list = [
-                    self.get_note_detail(post_item.get("id"), semaphore)
-                    for post_item in notes_res.get("items", {})
-                    if post_item.get('model_type') not in ('rec_query', 'hot_query')
-                ]
-                note_details = await asyncio.gather(*task_list)
-                for note_detail in note_details:
-                    if note_detail is not None:
-                        await xhs_store.update_xhs_note(note_detail)
-                        note_id_list.append(note_detail.get("note_id"))
-                page += 1
-                utils.logger.info(f"[XiaoHongShuCrawler.search] Note details: {note_details}")
-                await self.batch_get_note_comments(note_id_list)
+            while (page - start_page + 1) * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
+                if page < start_page:
+                    utils.logger.info(f"[XiaoHongShuCrawler.search] Skip page {page}")
+                    page += 1
+                    continue
+
+                try:
+                    note_id_list: List[str] = []
+                    notes_res = await self.xhs_client.get_note_by_keyword(
+                        keyword=keyword,
+                        page=page,
+                        sort=SearchSortType(config.SORT_TYPE) if config.SORT_TYPE != '' else SearchSortType.GENERAL,
+                    )
+                    utils.logger.info(f"[XiaoHongShuCrawler.search] Search notes res:{notes_res}")
+                    semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
+                    task_list = [
+                        self.get_note_detail(post_item.get("id"), semaphore)
+                        for post_item in notes_res.get("items", {})
+                        if post_item.get('model_type') not in ('rec_query', 'hot_query')
+                    ]
+                    note_details = await asyncio.gather(*task_list)
+                    for note_detail in note_details:
+                        if note_detail is not None:
+                            await xhs_store.update_xhs_note(note_detail)
+                            note_id_list.append(note_detail.get("note_id"))
+                    page += 1
+                    utils.logger.info(f"[XiaoHongShuCrawler.search] Note details: {note_details}")
+                    await self.batch_get_note_comments(note_id_list)
+                except DataFetchError:
+                    utils.logger.error("[XiaoHongShuCrawler.search] Get note detail error")
+                    break

     async def get_creators_and_notes(self) -> None:
         """Get creator's notes and retrieve their comment information."""