diff --git a/README.md b/README.md index 1301aca..9e89cae 100644 --- a/README.md +++ b/README.md @@ -60,6 +60,7 @@ ### 运行爬虫程序 ```shell + # 默认没有开启评论爬取模式,有需要请到配置文件中指定 # 从配置文件中读取关键词搜索相关的帖子并爬去帖子信息与评论 python main.py --platform xhs --lt qrcode --type search diff --git a/config/base_config.py b/config/base_config.py index af58561..da7885a 100644 --- a/config/base_config.py +++ b/config/base_config.py @@ -3,7 +3,7 @@ PLATFORM = "xhs" KEYWORDS = "python,golang" LOGIN_TYPE = "qrcode" # qrcode or phone or cookie COOKIES = "" -SORT_TYPE="popularity_descending" # 具体值参见media_platform.xxx.field下的枚举值,展示只支持小红书 +SORT_TYPE = "popularity_descending" # 具体值参见media_platform.xxx.field下的枚举值,展示只支持小红书 CRAWLER_TYPE = "search" # 是否开启 IP 代理 @@ -19,7 +19,7 @@ HEADLESS = True SAVE_LOGIN_STATE = True # 数据保存类型选项配置,支持三种类型:csv、db、json -SAVE_DATA_OPTION = "json" # csv or db or json +SAVE_DATA_OPTION = "json" # csv or db or json # 用户浏览器缓存的浏览器文件配置 USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name @@ -30,12 +30,8 @@ CRAWLER_MAX_NOTES_COUNT = 20 # 并发爬虫数量控制 MAX_CONCURRENCY_NUM = 4 - -# 评论关键词筛选(只会留下包含关键词的评论,为空不限制) -COMMENT_KEYWORDS = [ - # "真棒" - # ........................ -] +# 是否开启爬评论模式, 默认不开启爬评论 +ENABLE_GET_COMMENTS = False # 指定小红书需要爬虫的笔记ID列表 XHS_SPECIFIED_ID_LIST = [ @@ -78,4 +74,4 @@ XHS_CREATOR_ID_LIST = [ "61b87386000000001000b18b", "5e8558100000000001005bc5", # ........................ -] \ No newline at end of file +] diff --git a/media_platform/bilibili/core.py b/media_platform/bilibili/core.py index 0a7a0ab..406ef22 100644 --- a/media_platform/bilibili/core.py +++ b/media_platform/bilibili/core.py @@ -127,6 +127,10 @@ class BilibiliCrawler(AbstractCrawler): :param video_id_list: :return: """ + if not config.ENABLE_GET_COMMENTS: + utils.logger.info(f"[BilibiliCrawler.batch_get_note_comments] Crawling comment mode is not enabled") + return + utils.logger.info(f"[BilibiliCrawler.batch_get_video_comments] video ids:{video_id_list}") semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) task_list: List[Task] = [] diff --git a/media_platform/douyin/core.py b/media_platform/douyin/core.py index 1eacf89..918386a 100644 --- a/media_platform/douyin/core.py +++ b/media_platform/douyin/core.py @@ -132,6 +132,10 @@ class DouYinCrawler(AbstractCrawler): return None async def batch_get_note_comments(self, aweme_list: List[str]) -> None: + if not config.ENABLE_GET_COMMENTS: + utils.logger.info(f"[DouYinCrawler.batch_get_note_comments] Crawling comment mode is not enabled") + return + task_list: List[Task] = [] semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) for aweme_id in aweme_list: @@ -145,7 +149,7 @@ class DouYinCrawler(AbstractCrawler): async with semaphore: try: # 将关键词列表传递给 get_aweme_all_comments 方法 - comments = await self.dy_client.get_aweme_all_comments( + await self.dy_client.get_aweme_all_comments( aweme_id=aweme_id, crawl_interval=random.random(), callback=douyin_store.batch_update_dy_aweme_comments diff --git a/media_platform/kuaishou/core.py b/media_platform/kuaishou/core.py index 7a007c2..dbc8b56 100644 --- a/media_platform/kuaishou/core.py +++ b/media_platform/kuaishou/core.py @@ -144,6 +144,10 @@ class KuaishouCrawler(AbstractCrawler): :param video_id_list: :return: """ + if not config.ENABLE_GET_COMMENTS: + utils.logger.info(f"[KuaishouCrawler.batch_get_note_comments] Crawling comment mode is not enabled") + return + utils.logger.info(f"[KuaishouCrawler.batch_get_video_comments] video ids:{video_id_list}") semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) task_list: List[Task] = [] diff --git a/media_platform/weibo/core.py b/media_platform/weibo/core.py index 318ff68..eca4324 100644 --- a/media_platform/weibo/core.py +++ b/media_platform/weibo/core.py @@ -166,6 +166,10 @@ class WeiboCrawler(AbstractCrawler): :param note_id_list: :return: """ + if not config.ENABLE_GET_COMMENTS: + utils.logger.info(f"[WeiboCrawler.batch_get_note_comments] Crawling comment mode is not enabled") + return + utils.logger.info(f"[WeiboCrawler.batch_get_notes_comments] note ids:{note_id_list}") semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) task_list: List[Task] = [] diff --git a/media_platform/xhs/core.py b/media_platform/xhs/core.py index 96e70ea..987ba5f 100644 --- a/media_platform/xhs/core.py +++ b/media_platform/xhs/core.py @@ -105,7 +105,7 @@ class XiaoHongShuCrawler(AbstractCrawler): notes_res = await self.xhs_client.get_note_by_keyword( keyword=keyword, page=page, - sort=SearchSortType(config.SORT_TYPE) if config.SORT_TYPE!='' else SearchSortType.GENERAL, + sort=SearchSortType(config.SORT_TYPE) if config.SORT_TYPE != '' else SearchSortType.GENERAL, ) utils.logger.info(f"[XiaoHongShuCrawler.search] Search notes res:{notes_res}") semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) @@ -122,7 +122,7 @@ class XiaoHongShuCrawler(AbstractCrawler): page += 1 utils.logger.info(f"[XiaoHongShuCrawler.search] Note details: {note_details}") await self.batch_get_note_comments(note_id_list) - + async def get_creators_and_notes(self) -> None: """Get creator's notes and retrieve their comment information.""" utils.logger.info("[XiaoHongShuCrawler.get_creators_and_notes] Begin get xiaohongshu creators") @@ -151,7 +151,8 @@ class XiaoHongShuCrawler(AbstractCrawler): # save creator info await xhs_store.save_creator(creator, creator_and_notes_info.get('creator')) - utils.logger.info(f"[XiaoHongShuCrawler.get_creators_and_notes] save creator info:{creator_and_notes_info.get('creator')}") + utils.logger.info( + f"[XiaoHongShuCrawler.get_creators_and_notes] save creator info:{creator_and_notes_info.get('creator')}") else: # get notes notes = await self.xhs_client.get_notes_by_creator(creator, cursor) @@ -164,7 +165,8 @@ class XiaoHongShuCrawler(AbstractCrawler): cursor = notes.get('cursor') has_more_notes = notes.get('has_more_notes') notes_res = notes.get('notes') - utils.logger.info(f"[XiaoHongShuCrawler.get_creators_and_notes] get creator's notes res:{notes_res}") + utils.logger.info( + f"[XiaoHongShuCrawler.get_creators_and_notes] get creator's notes res:{notes_res}") semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) task_list = [ @@ -211,6 +213,10 @@ class XiaoHongShuCrawler(AbstractCrawler): async def batch_get_note_comments(self, note_list: List[str]): """Batch get note comments""" + if not config.ENABLE_GET_COMMENTS: + utils.logger.info(f"[XiaoHongShuCrawler.batch_get_note_comments] Crawling comment mode is not enabled") + return + utils.logger.info( f"[XiaoHongShuCrawler.batch_get_note_comments] Begin batch get note comments, note list: {note_list}") semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)