From 478db4cc4ba5958515da52473efa85f950cf39b7 Mon Sep 17 00:00:00 2001 From: Relakkes Date: Tue, 28 May 2024 01:07:19 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E6=8A=96=E9=9F=B3=E6=8C=87=E5=AE=9A?= =?UTF-8?q?=E5=88=9B=E4=BD=9C=E8=80=85done?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 14 ++++---- config/base_config.py | 6 ++++ media_platform/douyin/client.py | 36 ++++++++++++++++++++ media_platform/douyin/core.py | 55 ++++++++++++++++++++++++++++--- schema/tables.sql | 21 ++++++++++++ store/douyin/__init__.py | 28 ++++++++++++++++ store/douyin/douyin_store_impl.py | 54 ++++++++++++++++++++++++++---- store/douyin/douyin_store_sql.py | 46 ++++++++++++++++++++++++++ 8 files changed, 241 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index fd15951..d6739d2 100644 --- a/README.md +++ b/README.md @@ -18,12 +18,12 @@ > 下面不支持的项目,相关的代码架构已经搭建好,只需要实现对应的方法即可,欢迎大家提交PR | 平台 | 关键词搜索 | 指定帖子ID爬取 | 二级评论 | 指定创作者主页 | 登录态缓存 | IP代理池 | -|-----|-------|----------|------|---------|-------|-------| -| 小红书 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| 抖音 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | -| 快手 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | -| B 站 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | -| 微博 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | +|-----|-------|----------|------|--------|-------|-------| +| 小红书 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| 抖音 | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | +| 快手 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | +| B 站 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | +| 微博 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ## 使用方法 @@ -84,7 +84,7 @@ ## 开发者服务 - 付费咨询:提供 200 元/小时的咨询服务,帮你快速解决项目中遇到的问题。 -- 知识星球:沉淀高质量常见问题、最佳实践文档,提供付费知识星球服务,主动提问,作者会定期回答问题 +- 知识星球:沉淀高质量常见问题、最佳实践文档、多年编程+爬虫经验分享,提供付费知识星球服务,主动提问,作者会定期回答问题

知识星球

- 课程服务: > 如果你想很快入门这个项目,或者想了解具体实现原理,我推荐你看看这个课程,从设计出发一步步带你了解如何使用,门槛大大降低,同时也是对我开源的支持,如果你能支持我的课程,我将会非常开心~
diff --git a/config/base_config.py b/config/base_config.py index 7bab87a..3a8918f 100644 --- a/config/base_config.py +++ b/config/base_config.py @@ -89,3 +89,9 @@ XHS_CREATOR_ID_LIST = [ "63e36c9a000000002703502b", # ........................ ] + +# 指定Dy创作者ID列表(sec_id) +DY_CREATOR_ID_LIST = [ + "MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE", + # ........................ +] diff --git a/media_platform/douyin/client.py b/media_platform/douyin/client.py index 796ca74..a8596d3 100644 --- a/media_platform/douyin/client.py +++ b/media_platform/douyin/client.py @@ -199,3 +199,39 @@ class DOUYINClient(AbstractApiClient): continue # todo fetch sub comments return result + + async def get_user_info(self, sec_user_id: str): + uri = "/aweme/v1/web/user/profile/other/" + params = { + "sec_user_id": sec_user_id, + "publish_video_strategy_type": 2, + "personal_center_strategy": 1, + } + return await self.get(uri, params) + + async def get_user_aweme_posts(self, sec_user_id: str, max_cursor: str = "") -> Dict: + uri = "/aweme/v1/web/aweme/post/" + params = { + "sec_user_id": sec_user_id, + "count": 18, + "max_cursor": max_cursor, + "locate_query": "false", + "publish_video_strategy_type": 2 + } + return await self.get(uri, params) + + async def get_all_user_aweme_posts(self, sec_user_id: str, callback: Optional[Callable] = None): + posts_has_more = 1 + max_cursor = "" + result = [] + while posts_has_more == 1: + aweme_post_res = await self.get_user_aweme_posts(sec_user_id, max_cursor) + posts_has_more = aweme_post_res.get("has_more", 0) + max_cursor = aweme_post_res.get("max_cursor") + aweme_list = aweme_post_res.get("aweme_list") if aweme_post_res.get("aweme_list") else [] + utils.logger.info( + f"[DOUYINClient.get_all_user_aweme_posts] got sec_user_id:{sec_user_id} video len : {len(aweme_list)}") + if callback: + await callback(aweme_list) + result.extend(aweme_list) + return result diff --git a/media_platform/douyin/core.py b/media_platform/douyin/core.py index 
c65018f..d97314a 100644 --- a/media_platform/douyin/core.py +++ b/media_platform/douyin/core.py @@ -27,8 +27,11 @@ class DouYinCrawler(AbstractCrawler): context_page: Page dy_client: DOUYINClient browser_context: BrowserContext + start_page: int + keyword: str def __init__(self) -> None: + self.start_page = None self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36" # fixed self.index_url = "https://www.douyin.com" @@ -64,7 +67,7 @@ class DouYinCrawler(AbstractCrawler): if not await self.dy_client.pong(browser_context=self.browser_context): login_obj = DouYinLogin( login_type=self.login_type, - login_phone="", # you phone number + login_phone="", # you phone number browser_context=self.browser_context, context_page=self.context_page, cookie_str=config.COOKIES @@ -78,6 +81,9 @@ class DouYinCrawler(AbstractCrawler): elif self.crawler_type == "detail": # Get the information and comments of the specified post await self.get_specified_awemes() + elif self.crawler_type == "creator": + # Get the information and comments of the specified creator + await self.get_creators_and_videos() utils.logger.info("[DouYinCrawler.start] Douyin Crawler finished ...") @@ -107,7 +113,8 @@ class DouYinCrawler(AbstractCrawler): page += 1 if "data" not in posts_res: - utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed,账号也许被风控了。") + utils.logger.error( + f"[DouYinCrawler.search] search douyin keyword: {keyword} failed,账号也许被风控了。") break for post_item in posts_res.get("data"): @@ -142,10 +149,14 @@ class DouYinCrawler(AbstractCrawler): utils.logger.error(f"[DouYinCrawler.get_aweme_detail] Get aweme detail error: {ex}") return None except KeyError as ex: - utils.logger.error(f"[DouYinCrawler.get_aweme_detail] have not fund note detail aweme_id:{aweme_id}, err: {ex}") + utils.logger.error( + f"[DouYinCrawler.get_aweme_detail] have not fund note detail aweme_id:{aweme_id}, err: {ex}") 
return None async def batch_get_note_comments(self, aweme_list: List[str]) -> None: + """ + Batch get note comments + """ if not config.ENABLE_GET_COMMENTS: utils.logger.info(f"[DouYinCrawler.batch_get_note_comments] Crawling comment mode is not enabled") return @@ -156,7 +167,7 @@ class DouYinCrawler(AbstractCrawler): task = asyncio.create_task( self.get_comments(aweme_id, semaphore), name=aweme_id) task_list.append(task) - if len(task_list) > 0 : + if len(task_list) > 0: await asyncio.wait(task_list) async def get_comments(self, aweme_id: str, semaphore: asyncio.Semaphore) -> None: @@ -169,10 +180,44 @@ class DouYinCrawler(AbstractCrawler): callback=douyin_store.batch_update_dy_aweme_comments ) - utils.logger.info(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} comments have all been obtained and filtered ...") + utils.logger.info( + f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} comments have all been obtained and filtered ...") except DataFetchError as e: utils.logger.error(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} get comments failed, error: {e}") + async def get_creators_and_videos(self) -> None: + """ + Get the information and videos of the specified creator + """ + utils.logger.info("[DouYinCrawler.get_creators_and_videos] Begin get douyin creators") + for user_id in config.DY_CREATOR_ID_LIST: + creator_info: Dict = await self.dy_client.get_user_info(user_id) + if creator_info: + await douyin_store.save_creator(user_id, creator=creator_info) + + # Get all video information of the creator + all_video_list = await self.dy_client.get_all_user_aweme_posts( + sec_user_id=user_id, + callback=self.fetch_creator_video_detail + ) + + video_ids = [video_item.get("aweme_id") for video_item in all_video_list] + await self.batch_get_note_comments(video_ids) + + async def fetch_creator_video_detail(self, video_list: List[Dict]): + """ + Concurrently obtain the specified post list and save the data + """ + semaphore = 
asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) + task_list = [ + self.get_aweme_detail(post_item.get("aweme_id"), semaphore) for post_item in video_list + ] + + note_details = await asyncio.gather(*task_list) + for aweme_item in note_details: + if aweme_item is not None: + await douyin_store.update_douyin_aweme(aweme_item) + @staticmethod def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]: """format proxy info for playwright and httpx""" diff --git a/schema/tables.sql b/schema/tables.sql index 007a55e..3fbda14 100644 --- a/schema/tables.sql +++ b/schema/tables.sql @@ -103,6 +103,27 @@ CREATE TABLE `douyin_aweme_comment` ( KEY `idx_douyin_awem_aweme_i_c50049` (`aweme_id`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='抖音视频评论'; +-- ---------------------------- +-- Table structure for dy_creator +-- ---------------------------- +DROP TABLE IF EXISTS `dy_creator`; +CREATE TABLE `dy_creator` ( + `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', + `user_id` varchar(128) NOT NULL COMMENT '用户ID', + `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', + `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', + `ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址', + `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', + `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', + `desc` longtext COMMENT '用户描述', + `gender` varchar(1) DEFAULT NULL COMMENT '性别', + `follows` varchar(16) DEFAULT NULL COMMENT '关注数', + `fans` varchar(16) DEFAULT NULL COMMENT '粉丝数', + `interaction` varchar(16) DEFAULT NULL COMMENT '获赞数', + `videos_count` varchar(16) DEFAULT NULL COMMENT '作品数', + PRIMARY KEY (`id`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='抖音博主信息'; + -- ---------------------------- -- Table structure for kuaishou_video -- ---------------------------- diff --git a/store/douyin/__init__.py b/store/douyin/__init__.py index 9f0bfea..73acd36 100644 --- a/store/douyin/__init__.py +++ 
b/store/douyin/__init__.py @@ -92,3 +92,31 @@ async def update_dy_aweme_comment(aweme_id: str, comment_item: Dict): f"[store.douyin.update_dy_aweme_comment] douyin aweme comment: {comment_id}, content: {save_comment_item.get('content')}") await DouyinStoreFactory.create_store().store_comment(comment_item=save_comment_item) + + + + +async def save_creator(user_id: str, creator: Dict): + user_info = creator.get('user', {}) + gender_map = { + 0: '未知', + 1: '男', + 2: '女' + } + avatar_uri = user_info.get('avatar_300x300', {}).get('uri') + local_db_item = { + 'user_id': user_id, + 'nickname': user_info.get('nickname'), + 'gender': gender_map.get(user_info.get('gender'), '未知'), + 'avatar': f"https://p3-pc.douyinpic.com/img/{avatar_uri}" + r"~c5_300x300.jpeg?from=2956013662", + 'desc': user_info.get('signature'), + 'ip_location': user_info.get('ip_location'), + 'follows': user_info.get("following_count", 0), + 'fans': user_info.get("max_follower_count", 0), + 'interaction': user_info.get("total_favorited", 0), + 'videos_count': user_info.get("aweme_count", 0), + "last_modify_ts": utils.get_current_timestamp(), + + } + utils.logger.info(f"[store.douyin.save_creator] creator:{local_db_item}") + await DouyinStoreFactory.create_store().store_creator(local_db_item) diff --git a/store/douyin/douyin_store_impl.py b/store/douyin/douyin_store_impl.py index ec5dfc6..b87fe6d 100644 --- a/store/douyin/douyin_store_impl.py +++ b/store/douyin/douyin_store_impl.py @@ -26,13 +26,14 @@ def calculate_number_of_files(file_store_path: str) -> int: if not os.path.exists(file_store_path): return 1 try: - return max([int(file_name.split("_")[0])for file_name in os.listdir(file_store_path)])+1 + return max([int(file_name.split("_")[0]) for file_name in os.listdir(file_store_path)]) + 1 except ValueError: return 1 + class DouyinCsvStoreImplement(AbstractStore): csv_store_path: str = "data/douyin" - file_count:int=calculate_number_of_files(csv_store_path) + file_count: int = 
calculate_number_of_files(csv_store_path) def make_save_file_name(self, store_type: str) -> str: """ @@ -65,7 +66,7 @@ class DouyinCsvStoreImplement(AbstractStore): async def store_content(self, content_item: Dict): """ - Xiaohongshu content CSV storage implementation + Douyin content CSV storage implementation Args: content_item: note item dict @@ -76,7 +77,7 @@ class DouyinCsvStoreImplement(AbstractStore): async def store_comment(self, comment_item: Dict): """ - Xiaohongshu comment CSV storage implementation + Douyin comment CSV storage implementation Args: comment_item: comment item dict @@ -85,6 +86,17 @@ class DouyinCsvStoreImplement(AbstractStore): """ await self.save_data_to_csv(save_item=comment_item, store_type="comments") + async def store_creator(self, creator: Dict): + """ + Douyin creator CSV storage implementation + Args: + creator: creator item dict + + Returns: + + """ + await self.save_data_to_csv(save_item=creator, store_type="creator") + class DouyinDbStoreImplement(AbstractStore): async def store_content(self, content_item: Dict): @@ -109,7 +121,6 @@ class DouyinDbStoreImplement(AbstractStore): else: await update_content_by_content_id(aweme_id, content_item=content_item) - async def store_comment(self, comment_item: Dict): """ Douyin content DB storage implementation @@ -130,11 +141,29 @@ class DouyinDbStoreImplement(AbstractStore): else: await update_comment_by_comment_id(comment_id, comment_item=comment_item) + async def store_creator(self, creator: Dict): + """ + Douyin content DB storage implementation + Args: + creator: creator dict + + Returns: + + """ + from .douyin_store_sql import (add_new_creator, query_creator_by_user_id, + update_creator_by_user_id) + user_id = creator.get("user_id") + user_detail: Dict = await query_creator_by_user_id(user_id) + if not user_detail: + creator["add_ts"] = utils.get_current_timestamp() + await add_new_creator(creator) + else: + await update_creator_by_user_id(user_id, creator) class 
DouyinJsonStoreImplement(AbstractStore): json_store_path: str = "data/douyin" lock = asyncio.Lock() - file_count:int=calculate_number_of_files(json_store_path) + file_count: int = calculate_number_of_files(json_store_path) def make_save_file_name(self, store_type: str) -> str: """ @@ -146,7 +175,6 @@ class DouyinJsonStoreImplement(AbstractStore): """ - return f"{self.json_store_path}/{self.file_count}_{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json" async def save_data_to_json(self, save_item: Dict, store_type: str): @@ -193,3 +221,15 @@ class DouyinJsonStoreImplement(AbstractStore): """ await self.save_data_to_json(comment_item, "comments") + + + async def store_creator(self, creator: Dict): + """ + Douyin creator CSV storage implementation + Args: + creator: creator item dict + + Returns: + + """ + await self.save_data_to_json(save_item=creator, store_type="creator") \ No newline at end of file diff --git a/store/douyin/douyin_store_sql.py b/store/douyin/douyin_store_sql.py index 2cc4cc6..0f62fa2 100644 --- a/store/douyin/douyin_store_sql.py +++ b/store/douyin/douyin_store_sql.py @@ -100,3 +100,49 @@ async def update_comment_by_comment_id(comment_id: str, comment_item: Dict) -> i async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() effect_row: int = await async_db_conn.update_table("douyin_aweme_comment", comment_item, "comment_id", comment_id) return effect_row + + +async def query_creator_by_user_id(user_id: str) -> Dict: + """ + 查询一条创作者记录 + Args: + user_id: + + Returns: + + """ + async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() + sql: str = f"select * from dy_creator where user_id = '{user_id}'" + rows: List[Dict] = await async_db_conn.query(sql) + if len(rows) > 0: + return rows[0] + return dict() + + +async def add_new_creator(creator_item: Dict) -> int: + """ + 新增一条创作者信息 + Args: + creator_item: + + Returns: + + """ + async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() + last_row_id: int = await 
async_db_conn.item_to_table("dy_creator", creator_item) + return last_row_id + + +async def update_creator_by_user_id(user_id: str, creator_item: Dict) -> int: + """ + 更新一条创作者信息 + Args: + user_id: + creator_item: + + Returns: + + """ + async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() + effect_row: int = await async_db_conn.update_table("dy_creator", creator_item, "user_id", user_id) + return effect_row \ No newline at end of file