feat(core): 新增控制爬虫起始页数的参数 start_page; perf(argparse): 向命令行解析器添加起始页数和关键字参数
This commit is contained in:
parent
bba9841c26
commit
1115b0d90c
|
@ -165,4 +165,6 @@ cython_debug/
|
|||
/temp_image/
|
||||
/browser_data/
|
||||
/data/
|
||||
/cache
|
||||
/cache
|
||||
|
||||
*/.DS_Store
|
|
@ -6,7 +6,7 @@ from playwright.async_api import BrowserContext, BrowserType
|
|||
|
||||
class AbstractCrawler(ABC):
|
||||
@abstractmethod
|
||||
def init_config(self, platform: str, login_type: str, crawler_type: str):
|
||||
def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
|
|
|
@ -27,6 +27,9 @@ SAVE_DATA_OPTION = "json" # csv or db or json
|
|||
# 用户浏览器缓存的浏览器文件配置
|
||||
USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name
|
||||
|
||||
# 爬取开始页数 默认从第一页开始
|
||||
START_PAGE = 1
|
||||
|
||||
# 爬取视频/帖子的数量控制
|
||||
CRAWLER_MAX_NOTES_COUNT = 20
|
||||
|
||||
|
|
10
main.py
10
main.py
|
@ -38,7 +38,11 @@ async def main():
|
|||
choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE)
|
||||
parser.add_argument('--type', type=str, help='crawler type (search | detail | creator)',
|
||||
choices=["search", "detail", "creator"], default=config.CRAWLER_TYPE)
|
||||
|
||||
# --start: 1-based page number at which the search crawl begins (defaults to config.START_PAGE)
parser.add_argument('--start', type=int, help='number of the page to start crawling from',
                    default=config.START_PAGE)
|
||||
# --keyword: search keywords; the crawlers split this value on "," (defaults to config.KEYWORDS)
parser.add_argument('--keyword', type=str, help='search keywords, comma-separated',
                    default=config.KEYWORDS)
|
||||
|
||||
# init db
|
||||
if config.SAVE_DATA_OPTION == "db":
|
||||
await db.init_db()
|
||||
|
@ -48,7 +52,9 @@ async def main():
|
|||
crawler.init_config(
|
||||
platform=args.platform,
|
||||
login_type=args.lt,
|
||||
crawler_type=args.type
|
||||
crawler_type=args.type,
|
||||
start_page=args.start,
|
||||
keyword=args.keyword
|
||||
)
|
||||
await crawler.start()
|
||||
|
||||
|
|
|
@ -37,10 +37,12 @@ class BilibiliCrawler(AbstractCrawler):
|
|||
self.index_url = "https://www.bilibili.com"
|
||||
self.user_agent = utils.get_user_agent()
|
||||
|
||||
def init_config(self, platform: str, login_type: str, crawler_type: str):
|
||||
def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
|
||||
self.platform = platform
|
||||
self.login_type = login_type
|
||||
self.crawler_type = crawler_type
|
||||
self.start_page = start_page
|
||||
self.keyword = keyword
|
||||
|
||||
async def start(self):
|
||||
playwright_proxy_format, httpx_proxy_format = None, None
|
||||
|
@ -96,10 +98,16 @@ class BilibiliCrawler(AbstractCrawler):
|
|||
bili_limit_count =20 # bilibili limit page fixed value
|
||||
if config.CRAWLER_MAX_NOTES_COUNT < bili_limit_count:
|
||||
config.CRAWLER_MAX_NOTES_COUNT = bili_limit_count
|
||||
for keyword in config.KEYWORDS.split(","):
|
||||
start_page = self.start_page # start page number
|
||||
for keyword in self.keyword.split(","):
|
||||
utils.logger.info(f"[BilibiliCrawler.search] Current search keyword: {keyword}")
|
||||
page = 1
|
||||
while page * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
|
||||
while (page - start_page + 1) * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
|
||||
if page < start_page:
|
||||
utils.logger.info(f"[BilibiliCrawler.search] Skip page: {page}")
|
||||
page += 1
|
||||
continue
|
||||
|
||||
video_id_list: List[str] = []
|
||||
videos_res = await self.bili_client.search_video_by_keyword(
|
||||
keyword=keyword,
|
||||
|
|
|
@ -32,10 +32,12 @@ class DouYinCrawler(AbstractCrawler):
|
|||
self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36" # fixed
|
||||
self.index_url = "https://www.douyin.com"
|
||||
|
||||
def init_config(self, platform: str, login_type: str, crawler_type: str) -> None:
|
||||
def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str) -> None:
|
||||
self.platform = platform
|
||||
self.login_type = login_type
|
||||
self.crawler_type = crawler_type
|
||||
self.start_page = start_page
|
||||
self.keyword = keyword
|
||||
|
||||
async def start(self) -> None:
|
||||
playwright_proxy_format, httpx_proxy_format = None, None
|
||||
|
@ -84,11 +86,16 @@ class DouYinCrawler(AbstractCrawler):
|
|||
dy_limit_count = 10 # douyin limit page fixed value
|
||||
if config.CRAWLER_MAX_NOTES_COUNT < dy_limit_count:
|
||||
config.CRAWLER_MAX_NOTES_COUNT = dy_limit_count
|
||||
for keyword in config.KEYWORDS.split(","):
|
||||
# Douyin pages are 0-indexed (page starts at 0 below) while start_page is 1-based;
# convert so that the default start_page=1 still fetches the first page (offset 0).
start_page = max(self.start_page - 1, 0)  # start page number, converted to 0-based
|
||||
for keyword in self.keyword.split(","):
|
||||
utils.logger.info(f"[DouYinCrawler.search] Current keyword: {keyword}")
|
||||
aweme_list: List[str] = []
|
||||
page = 0
|
||||
while (page + 1) * dy_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
|
||||
while (page - start_page + 1) * dy_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
|
||||
if page < start_page:
|
||||
utils.logger.info(f"[DouYinCrawler.search] Skip {page}")
|
||||
page += 1
|
||||
continue
|
||||
try:
|
||||
posts_res = await self.dy_client.search_info_by_keyword(keyword=keyword,
|
||||
offset=page * dy_limit_count,
|
||||
|
|
|
@ -32,10 +32,12 @@ class KuaishouCrawler(AbstractCrawler):
|
|||
self.index_url = "https://www.kuaishou.com"
|
||||
self.user_agent = utils.get_user_agent()
|
||||
|
||||
def init_config(self, platform: str, login_type: str, crawler_type: str):
|
||||
def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
|
||||
self.platform = platform
|
||||
self.login_type = login_type
|
||||
self.crawler_type = crawler_type
|
||||
self.start_page = start_page
|
||||
self.keyword = keyword
|
||||
|
||||
async def start(self):
|
||||
playwright_proxy_format, httpx_proxy_format = None, None
|
||||
|
@ -88,10 +90,16 @@ class KuaishouCrawler(AbstractCrawler):
|
|||
ks_limit_count = 20 # kuaishou limit page fixed value
|
||||
if config.CRAWLER_MAX_NOTES_COUNT < ks_limit_count:
|
||||
config.CRAWLER_MAX_NOTES_COUNT = ks_limit_count
|
||||
for keyword in config.KEYWORDS.split(","):
|
||||
start_page = self.start_page
|
||||
for keyword in self.keyword.split(","):
|
||||
utils.logger.info(f"[KuaishouCrawler.search] Current search keyword: {keyword}")
|
||||
page = 1
|
||||
while page * ks_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
|
||||
while (page - start_page + 1) * ks_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
|
||||
if page < start_page:
|
||||
utils.logger.info(f"[KuaishouCrawler.search] Skip page: {page}")
|
||||
page += 1
|
||||
continue
|
||||
|
||||
video_id_list: List[str] = []
|
||||
videos_res = await self.ks_client.search_info_by_keyword(
|
||||
keyword=keyword,
|
||||
|
|
|
@ -40,10 +40,12 @@ class WeiboCrawler(AbstractCrawler):
|
|||
self.user_agent = utils.get_user_agent()
|
||||
self.mobile_user_agent = utils.get_mobile_user_agent()
|
||||
|
||||
def init_config(self, platform: str, login_type: str, crawler_type: str):
|
||||
def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
|
||||
self.platform = platform
|
||||
self.login_type = login_type
|
||||
self.crawler_type = crawler_type
|
||||
self.start_page = start_page
|
||||
self.keyword = keyword
|
||||
|
||||
async def start(self):
|
||||
playwright_proxy_format, httpx_proxy_format = None, None
|
||||
|
@ -106,10 +108,16 @@ class WeiboCrawler(AbstractCrawler):
|
|||
weibo_limit_count = 10 # weibo limit page fixed value
|
||||
if config.CRAWLER_MAX_NOTES_COUNT < weibo_limit_count:
|
||||
config.CRAWLER_MAX_NOTES_COUNT = weibo_limit_count
|
||||
for keyword in config.KEYWORDS.split(","):
|
||||
start_page = self.start_page
|
||||
for keyword in self.keyword.split(","):
|
||||
utils.logger.info(f"[WeiboCrawler.search] Current search keyword: {keyword}")
|
||||
page = 1
|
||||
while page * weibo_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
|
||||
while (page - start_page + 1) * weibo_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
|
||||
if page < start_page:
|
||||
utils.logger.info(f"[WeiboCrawler.search] Skip page: {page}")
|
||||
page += 1
|
||||
continue
|
||||
|
||||
search_res = await self.wb_client.get_note_by_keyword(
|
||||
keyword=keyword,
|
||||
page=page,
|
||||
|
|
|
@ -32,10 +32,12 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
|||
self.index_url = "https://www.xiaohongshu.com"
|
||||
self.user_agent = utils.get_user_agent()
|
||||
|
||||
def init_config(self, platform: str, login_type: str, crawler_type: str) -> None:
|
||||
def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str) -> None:
|
||||
self.platform = platform
|
||||
self.login_type = login_type
|
||||
self.crawler_type = crawler_type
|
||||
self.start_page = start_page
|
||||
self.keyword = keyword
|
||||
|
||||
async def start(self) -> None:
|
||||
playwright_proxy_format, httpx_proxy_format = None, None
|
||||
|
@ -99,31 +101,41 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
|||
xhs_limit_count = 20 # xhs limit page fixed value
|
||||
if config.CRAWLER_MAX_NOTES_COUNT < xhs_limit_count:
|
||||
config.CRAWLER_MAX_NOTES_COUNT = xhs_limit_count
|
||||
for keyword in config.KEYWORDS.split(","):
|
||||
start_page = self.start_page
|
||||
for keyword in self.keyword.split(","):
|
||||
utils.logger.info(f"[XiaoHongShuCrawler.search] Current search keyword: {keyword}")
|
||||
page = 1
|
||||
while page * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
|
||||
note_id_list: List[str] = []
|
||||
notes_res = await self.xhs_client.get_note_by_keyword(
|
||||
keyword=keyword,
|
||||
page=page,
|
||||
sort=SearchSortType(config.SORT_TYPE) if config.SORT_TYPE != '' else SearchSortType.GENERAL,
|
||||
)
|
||||
utils.logger.info(f"[XiaoHongShuCrawler.search] Search notes res:{notes_res}")
|
||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||
task_list = [
|
||||
self.get_note_detail(post_item.get("id"), semaphore)
|
||||
for post_item in notes_res.get("items", {})
|
||||
if post_item.get('model_type') not in ('rec_query', 'hot_query')
|
||||
]
|
||||
note_details = await asyncio.gather(*task_list)
|
||||
for note_detail in note_details:
|
||||
if note_detail is not None:
|
||||
await xhs_store.update_xhs_note(note_detail)
|
||||
note_id_list.append(note_detail.get("note_id"))
|
||||
page += 1
|
||||
utils.logger.info(f"[XiaoHongShuCrawler.search] Note details: {note_details}")
|
||||
await self.batch_get_note_comments(note_id_list)
|
||||
while (page - start_page + 1) * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
|
||||
if page < start_page:
|
||||
utils.logger.info(f"[XiaoHongShuCrawler.search] Skip page {page}")
|
||||
page += 1
|
||||
continue
|
||||
|
||||
try:
|
||||
note_id_list: List[str] = []
|
||||
notes_res = await self.xhs_client.get_note_by_keyword(
|
||||
keyword=keyword,
|
||||
page=page,
|
||||
sort=SearchSortType(config.SORT_TYPE) if config.SORT_TYPE != '' else SearchSortType.GENERAL,
|
||||
)
|
||||
utils.logger.info(f"[XiaoHongShuCrawler.search] Search notes res:{notes_res}")
|
||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||
task_list = [
|
||||
self.get_note_detail(post_item.get("id"), semaphore)
|
||||
for post_item in notes_res.get("items", {})
|
||||
if post_item.get('model_type') not in ('rec_query', 'hot_query')
|
||||
]
|
||||
note_details = await asyncio.gather(*task_list)
|
||||
for note_detail in note_details:
|
||||
if note_detail is not None:
|
||||
await xhs_store.update_xhs_note(note_detail)
|
||||
note_id_list.append(note_detail.get("note_id"))
|
||||
page += 1
|
||||
utils.logger.info(f"[XiaoHongShuCrawler.search] Note details: {note_details}")
|
||||
await self.batch_get_note_comments(note_id_list)
|
||||
except DataFetchError:
|
||||
utils.logger.error("[XiaoHongShuCrawler.search] Get note detail error")
|
||||
break
|
||||
|
||||
async def get_creators_and_notes(self) -> None:
|
||||
"""Get creator's notes and retrieve their comment information."""
|
||||
|
|
Loading…
Reference in New Issue