Merge pull request #229 from Tianci-King/main

feat(core): 新增控制爬虫参数起始页面的页数start_page;perf(argparse): 向命令行解析器添加程序参数…
This commit is contained in:
程序员阿江-Relakkes 2024-04-13 13:37:35 +08:00 committed by GitHub
commit a341dc2aff
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 94 additions and 40 deletions

4
.gitignore vendored
View File

@ -165,4 +165,6 @@ cython_debug/
/temp_image/
/browser_data/
/data/
/cache
/cache
*/.DS_Store

View File

@ -6,7 +6,7 @@ from playwright.async_api import BrowserContext, BrowserType
class AbstractCrawler(ABC):
@abstractmethod
def init_config(self, platform: str, login_type: str, crawler_type: str):
def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
    """Receive the run configuration for a crawler (abstract hook, no-op here).

    Args:
        platform: Platform name chosen by the caller.
        login_type: Login method chosen by the caller.
        crawler_type: Crawl mode chosen by the caller.
        start_page: Search page number the crawl should start from.
        keyword: Search keywords as one comma-separated string.
    """
@abstractmethod

View File

@ -27,6 +27,9 @@ SAVE_DATA_OPTION = "json" # csv or db or json
# Directory-name template for the browser's cached user data
USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name
# Page number to start crawling from; defaults to the first page
START_PAGE = 1
# Limit on the number of videos/posts to crawl
CRAWLER_MAX_NOTES_COUNT = 20

10
main.py
View File

@ -38,7 +38,11 @@ async def main():
choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE)
parser.add_argument('--type', type=str, help='crawler type (search | detail | creator)',
choices=["search", "detail", "creator"], default=config.CRAWLER_TYPE)
# Page number the search crawl starts from (falls back to config.START_PAGE).
parser.add_argument('--start', type=int, help='number of start page',
                    default=config.START_PAGE)
# Search keywords as one comma-separated string (falls back to config.KEYWORDS).
parser.add_argument('--keyword', type=str, help='please input keywords',
                    default=config.KEYWORDS)
# init db
if config.SAVE_DATA_OPTION == "db":
await db.init_db()
@ -48,7 +52,9 @@ async def main():
# Hand the parsed CLI options to the crawler. argparse stores the value of
# `--keyword` under `args.keyword`; the namespace has no `args.key` attribute,
# so reading it would raise AttributeError before the crawl starts.
crawler.init_config(
    platform=args.platform,
    login_type=args.lt,
    crawler_type=args.type,
    start_page=args.start,
    keyword=args.keyword,
)
await crawler.start()

View File

@ -37,10 +37,12 @@ class BilibiliCrawler(AbstractCrawler):
self.index_url = "https://www.bilibili.com"
self.user_agent = utils.get_user_agent()
def init_config(self, platform: str, login_type: str, crawler_type: str):
def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
    """Store the run configuration on this crawler instance.

    Args:
        platform: Platform name passed by the caller.
        login_type: Login method passed by the caller.
        crawler_type: Crawl mode passed by the caller.
        start_page: 1-based search page to start from. Values below 1 are
            raised to 1: the search loop counts pages from 1 and budgets
            them with ``(page - start_page + 1) * limit``, so a smaller
            value would fetch fewer pages than configured (possibly none).
        keyword: Search keywords as one comma-separated string; the search
            loop splits it on ",".
    """
    self.platform = platform
    self.login_type = login_type
    self.crawler_type = crawler_type
    self.start_page = max(start_page, 1)
    self.keyword = keyword
async def start(self):
playwright_proxy_format, httpx_proxy_format = None, None
@ -96,10 +98,16 @@ class BilibiliCrawler(AbstractCrawler):
bili_limit_count =20 # bilibili limit page fixed value
if config.CRAWLER_MAX_NOTES_COUNT < bili_limit_count:
config.CRAWLER_MAX_NOTES_COUNT = bili_limit_count
for keyword in config.KEYWORDS.split(","):
start_page = self.start_page # start page number
for keyword in self.keyword.split(","):
utils.logger.info(f"[BilibiliCrawler.search] Current search keyword: {keyword}")
page = 1
while page * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
while (page - start_page + 1) * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
if page < start_page:
utils.logger.info(f"[BilibiliCrawler.search] Skip page: {page}")
page += 1
continue
video_id_list: List[str] = []
videos_res = await self.bili_client.search_video_by_keyword(
keyword=keyword,

View File

@ -32,10 +32,12 @@ class DouYinCrawler(AbstractCrawler):
self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36" # fixed
self.index_url = "https://www.douyin.com"
def init_config(self, platform: str, login_type: str, crawler_type: str) -> None:
def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str) -> None:
    """Store the run configuration on this crawler instance.

    Args:
        platform: Platform name passed by the caller.
        login_type: Login method passed by the caller.
        crawler_type: Crawl mode passed by the caller.
        start_page: 1-based search page to start from, as given on the
            command line (the configured default is 1). Values below 1
            are treated as 1.
        keyword: Search keywords as one comma-separated string; the search
            loop splits it on ",".
    """
    self.platform = platform
    self.login_type = login_type
    self.crawler_type = crawler_type
    # The Douyin search loop counts pages from 0 (offset = page * limit) and
    # skips every page with `page < self.start_page`. Keeping the 1-based
    # value here would always skip the first result page, so the 0-based
    # index of the first page to fetch is stored instead.
    self.start_page = max(start_page, 1) - 1
    self.keyword = keyword
async def start(self) -> None:
playwright_proxy_format, httpx_proxy_format = None, None
@ -84,11 +86,16 @@ class DouYinCrawler(AbstractCrawler):
dy_limit_count = 10 # douyin limit page fixed value
if config.CRAWLER_MAX_NOTES_COUNT < dy_limit_count:
config.CRAWLER_MAX_NOTES_COUNT = dy_limit_count
for keyword in config.KEYWORDS.split(","):
start_page = self.start_page # start page number
for keyword in self.keyword.split(","):
utils.logger.info(f"[DouYinCrawler.search] Current keyword: {keyword}")
aweme_list: List[str] = []
page = 0
while (page + 1) * dy_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
while (page - start_page + 1) * dy_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
if page < start_page:
utils.logger.info(f"[DouYinCrawler.search] Skip {page}")
page += 1
continue
try:
posts_res = await self.dy_client.search_info_by_keyword(keyword=keyword,
offset=page * dy_limit_count,

View File

@ -32,10 +32,12 @@ class KuaishouCrawler(AbstractCrawler):
self.index_url = "https://www.kuaishou.com"
self.user_agent = utils.get_user_agent()
def init_config(self, platform: str, login_type: str, crawler_type: str):
def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
    """Store the run configuration on this crawler instance.

    Args:
        platform: Platform name passed by the caller.
        login_type: Login method passed by the caller.
        crawler_type: Crawl mode passed by the caller.
        start_page: 1-based search page to start from. Values below 1 are
            raised to 1: the search loop counts pages from 1 and budgets
            them with ``(page - start_page + 1) * limit``, so a smaller
            value would fetch fewer pages than configured (possibly none).
        keyword: Search keywords as one comma-separated string; the search
            loop splits it on ",".
    """
    self.platform = platform
    self.login_type = login_type
    self.crawler_type = crawler_type
    self.start_page = max(start_page, 1)
    self.keyword = keyword
async def start(self):
playwright_proxy_format, httpx_proxy_format = None, None
@ -88,10 +90,16 @@ class KuaishouCrawler(AbstractCrawler):
ks_limit_count = 20 # kuaishou limit page fixed value
if config.CRAWLER_MAX_NOTES_COUNT < ks_limit_count:
config.CRAWLER_MAX_NOTES_COUNT = ks_limit_count
for keyword in config.KEYWORDS.split(","):
start_page = self.start_page
for keyword in self.keyword.split(","):
utils.logger.info(f"[KuaishouCrawler.search] Current search keyword: {keyword}")
page = 1
while page * ks_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
while (page - start_page + 1) * ks_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
if page < start_page:
utils.logger.info(f"[KuaishouCrawler.search] Skip page: {page}")
page += 1
continue
video_id_list: List[str] = []
videos_res = await self.ks_client.search_info_by_keyword(
keyword=keyword,

View File

@ -40,10 +40,12 @@ class WeiboCrawler(AbstractCrawler):
self.user_agent = utils.get_user_agent()
self.mobile_user_agent = utils.get_mobile_user_agent()
def init_config(self, platform: str, login_type: str, crawler_type: str):
def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str):
    """Store the run configuration on this crawler instance.

    Args:
        platform: Platform name passed by the caller.
        login_type: Login method passed by the caller.
        crawler_type: Crawl mode passed by the caller.
        start_page: 1-based search page to start from. Values below 1 are
            raised to 1: the search loop counts pages from 1 and budgets
            them with ``(page - start_page + 1) * limit``, so a smaller
            value would fetch fewer pages than configured.
        keyword: Search keywords as one comma-separated string; the search
            loop splits it on ",".
    """
    self.platform = platform
    self.login_type = login_type
    self.crawler_type = crawler_type
    self.start_page = max(start_page, 1)
    self.keyword = keyword
async def start(self):
playwright_proxy_format, httpx_proxy_format = None, None
@ -106,10 +108,16 @@ class WeiboCrawler(AbstractCrawler):
weibo_limit_count = 10 # weibo limit page fixed value
if config.CRAWLER_MAX_NOTES_COUNT < weibo_limit_count:
config.CRAWLER_MAX_NOTES_COUNT = weibo_limit_count
for keyword in config.KEYWORDS.split(","):
start_page = self.start_page
for keyword in self.keyword.split(","):
utils.logger.info(f"[WeiboCrawler.search] Current search keyword: {keyword}")
page = 1
while page * weibo_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
while (page - start_page + 1) * weibo_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
if page < start_page:
utils.logger.info(f"[WeiboCrawler.search] Skip page: {page}")
page += 1
continue
search_res = await self.wb_client.get_note_by_keyword(
keyword=keyword,
page=page,

View File

@ -32,10 +32,12 @@ class XiaoHongShuCrawler(AbstractCrawler):
self.index_url = "https://www.xiaohongshu.com"
self.user_agent = utils.get_user_agent()
def init_config(self, platform: str, login_type: str, crawler_type: str) -> None:
def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str) -> None:
    """Store the run configuration on this crawler instance.

    Args:
        platform: Platform name passed by the caller.
        login_type: Login method passed by the caller.
        crawler_type: Crawl mode passed by the caller.
        start_page: 1-based search page to start from. Values below 1 are
            raised to 1: the search loop counts pages from 1 and budgets
            them with ``(page - start_page + 1) * limit``, so a smaller
            value would fetch fewer pages than configured (possibly none).
        keyword: Search keywords as one comma-separated string; the search
            loop splits it on ",".
    """
    self.platform = platform
    self.login_type = login_type
    self.crawler_type = crawler_type
    self.start_page = max(start_page, 1)
    self.keyword = keyword
async def start(self) -> None:
playwright_proxy_format, httpx_proxy_format = None, None
@ -99,31 +101,41 @@ class XiaoHongShuCrawler(AbstractCrawler):
xhs_limit_count = 20 # xhs limit page fixed value
if config.CRAWLER_MAX_NOTES_COUNT < xhs_limit_count:
config.CRAWLER_MAX_NOTES_COUNT = xhs_limit_count
for keyword in config.KEYWORDS.split(","):
start_page = self.start_page
for keyword in self.keyword.split(","):
utils.logger.info(f"[XiaoHongShuCrawler.search] Current search keyword: {keyword}")
page = 1
while page * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
note_id_list: List[str] = []
notes_res = await self.xhs_client.get_note_by_keyword(
keyword=keyword,
page=page,
sort=SearchSortType(config.SORT_TYPE) if config.SORT_TYPE != '' else SearchSortType.GENERAL,
)
utils.logger.info(f"[XiaoHongShuCrawler.search] Search notes res:{notes_res}")
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list = [
self.get_note_detail(post_item.get("id"), semaphore)
for post_item in notes_res.get("items", {})
if post_item.get('model_type') not in ('rec_query', 'hot_query')
]
note_details = await asyncio.gather(*task_list)
for note_detail in note_details:
if note_detail is not None:
await xhs_store.update_xhs_note(note_detail)
note_id_list.append(note_detail.get("note_id"))
page += 1
utils.logger.info(f"[XiaoHongShuCrawler.search] Note details: {note_details}")
await self.batch_get_note_comments(note_id_list)
while (page - start_page + 1) * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
if page < start_page:
utils.logger.info(f"[XiaoHongShuCrawler.search] Skip page {page}")
page += 1
continue
try:
note_id_list: List[str] = []
notes_res = await self.xhs_client.get_note_by_keyword(
keyword=keyword,
page=page,
sort=SearchSortType(config.SORT_TYPE) if config.SORT_TYPE != '' else SearchSortType.GENERAL,
)
utils.logger.info(f"[XiaoHongShuCrawler.search] Search notes res:{notes_res}")
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list = [
self.get_note_detail(post_item.get("id"), semaphore)
for post_item in notes_res.get("items", {})
if post_item.get('model_type') not in ('rec_query', 'hot_query')
]
note_details = await asyncio.gather(*task_list)
for note_detail in note_details:
if note_detail is not None:
await xhs_store.update_xhs_note(note_detail)
note_id_list.append(note_detail.get("note_id"))
page += 1
utils.logger.info(f"[XiaoHongShuCrawler.search] Note details: {note_details}")
await self.batch_get_note_comments(note_id_list)
except DataFetchError:
utils.logger.error("[XiaoHongShuCrawler.search] Get note detail error")
break
async def get_creators_and_notes(self) -> None:
"""Get creator's notes and retrieve their comment information."""