diff --git a/README.md b/README.md index e89aa9e..5a69254 100644 --- a/README.md +++ b/README.md @@ -21,10 +21,8 @@ - [x] 抖音登录(二维码、手机号、cookies) - [x] 抖音滑块(模拟滑动实现,准确率不太OK) - [x] 支持登录成功后的上下文浏览器环境保留 +- [x] 数据持久化到硬盘(关系型数据库) -## 待实现 - -- [ ] 数据持久化到硬盘 ## 使用方法 @@ -32,9 +30,13 @@ `pip install -r requirements.txt` 2. 安装playwright浏览器驱动 `playwright install` -3. 运行爬虫程序 +3. 是否选择开启保存数据到DB中 + 如果选择开启,则需要配置数据库连接信息,`config/db_config.py` 中的 `IS_SAVED_DATABASED`和`RELATION_DB_URL` 变量 +
再执行 `python db.py` 初始化数据库信息,生成相关的数据库表结构 +4. 运行爬虫程序 `python main.py --platform xhs --lt qrcode` -4. 打开小红书扫二维码登录 +5. 打开对应APP扫二维码登录 + ## 项目代码结构 @@ -67,24 +69,16 @@ MediaCrawler │ ├── help.py # 辅助函数 │ └── login.py # 登录实现 -├── modles +├── models -│ ├── douyin -│ │ └── m_douyin.py -│ └── xhs -│ └── m_xhs.py +│ ├── douyin.py # 抖音数据模型 +│ └── xiaohongshu.py # 小红书数据模型 ├── tools │ └── utils.py # 工具函数 ├── main.py # 程序入口 └── recv_sms_notification.py # 短信转发器的HTTP SERVER接口 ``` +## 数据持久化 -## 小红书运行截图 - -![小红书运行截图](https://s2.loli.net/2023/06/09/PVBe3X5vf4yncrd.gif) - -## 抖音运行截图 - -- ![抖音运行截图](https://s2.loli.net/2023/06/25/GXfkeLhpTyNiAqH.gif) - +![数据持久化](https://s2.loli.net/2023/07/24/ZTcGWz8jPAy7b5M.png) ## 支持一下 diff --git a/config/__init__.py b/config/__init__.py index d77edcc..4721e8f 100644 --- a/config/__init__.py +++ b/config/__init__.py @@ -1,2 +1,3 @@ from .base_config import * from .account_config import * +from .db_config import * diff --git a/config/base_config.py b/config/base_config.py index 79e7216..541c898 100644 --- a/config/base_config.py +++ b/config/base_config.py @@ -2,11 +2,7 @@ PLATFORM = "xhs" KEYWORDS = "健身,旅游" LOGIN_TYPE = "qrcode" # qrcode or phone or cookies -COOKIES = "web_session=xxxxcfed1566xxxxxxxxxxxxxxxxxxx;" # if platform is xhs, pleas set only web_session cookie attr - -# redis config -REDIS_DB_HOST = "redis://127.0.0.1" # your redis host -REDIS_DB_PWD = "123456" # your redis password +COOKIES = "web_session=xxxxcfed1566xxxxxxxxxxxxxxxxxxx;" # if platform is xhs, please set only the web_session cookie attr # enable ip proxy ENABLE_IP_PROXY = False @@ -18,7 +14,7 @@ RETRY_INTERVAL = 60 * 30 # 30 minutes HEADLESS = True # save login state -SAVE_LOGIN_STATE = False +SAVE_LOGIN_STATE = True # save user data dir USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name diff --git a/config/db_config.py b/config/db_config.py new file mode 100644 index 0000000..a5dfe98 --- /dev/null +++ b/config/db_config.py @@ -0,0 +1,9 @@ +# redis config +REDIS_DB_HOST = 
"redis://127.0.0.1" # your redis host +REDIS_DB_PWD = "123456" # your redis password + +# mysql config +RELATION_DB_URL = "mysql://root:youdbpassword@localhost:3306/media_crawler" + +# save data to database option +IS_SAVED_DATABASED = True # if you want to save data to database, set True diff --git a/db.py b/db.py new file mode 100644 index 0000000..27caf4a --- /dev/null +++ b/db.py @@ -0,0 +1,24 @@ +from tortoise import Tortoise +from tortoise import run_async + +from config.db_config import * + +from tools import utils + + +async def init_db(create_db: bool = False) -> None: + await Tortoise.init( + db_url=RELATION_DB_URL, + modules={'models': ['models']}, + _create_db=create_db + ) + + +async def init(): + await init_db(create_db=True) + await Tortoise.generate_schemas() + utils.logger.info("Init DB Success!") + + +if __name__ == '__main__': + run_async(init()) diff --git a/images/douyin.gif b/images/douyin.gif deleted file mode 100644 index 0440328..0000000 Binary files a/images/douyin.gif and /dev/null differ diff --git a/images/xiaoshongshu.gif b/images/xiaoshongshu.gif deleted file mode 100644 index 2fb45de..0000000 Binary files a/images/xiaoshongshu.gif and /dev/null differ diff --git a/main.py b/main.py index c682140..d00cb5a 100644 --- a/main.py +++ b/main.py @@ -2,8 +2,8 @@ import sys import asyncio import argparse +import db import config -from tools import utils from base import proxy_account_pool from media_platform.douyin import DouYinCrawler from media_platform.xhs import XiaoHongShuCrawler @@ -29,6 +29,10 @@ async def main(): # init account pool account_pool = proxy_account_pool.create_account_pool() + # init db + if config.IS_SAVED_DATABASED: + await db.init_db() + args = parser.parse_args() crawler = CrawlerFactory().create_crawler(platform=args.platform) crawler.init_config( diff --git a/media_platform/douyin/core.py b/media_platform/douyin/core.py index 68c6f6c..ba19abe 100644 --- a/media_platform/douyin/core.py +++ 
b/media_platform/douyin/core.py @@ -23,12 +23,12 @@ class DouYinCrawler(AbstractCrawler): dy_client: DOUYINClient def __init__(self) -> None: - self.browser_context: Optional[BrowserContext] = None # type: ignore - self.context_page: Optional[Page] = None # type: ignore + self.browser_context: Optional[BrowserContext] = None # type: ignore + self.context_page: Optional[Page] = None # type: ignore self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36" # fixed self.index_url = "https://www.douyin.com" - self.command_args: Optional[Namespace] = None # type: ignore - self.account_pool: Optional[AccountPool] = None # type: ignore + self.command_args: Optional[Namespace] = None # type: ignore + self.account_pool: Optional[AccountPool] = None # type: ignore def init_config(self, **kwargs): for key, value in kwargs.items(): @@ -53,7 +53,7 @@ class DouYinCrawler(AbstractCrawler): self.dy_client = await self.create_douyin_client(httpx_proxy) if not await self.dy_client.ping(browser_context=self.browser_context): login_obj = DouYinLogin( - login_type=self.command_args.lt, # type: ignore + login_type=self.command_args.lt, # type: ignore login_phone=account_phone, browser_context=self.browser_context, context_page=self.context_page, @@ -88,27 +88,29 @@ class DouYinCrawler(AbstractCrawler): post_item.get("aweme_mix_info", {}).get("mix_items")[0] except TypeError: continue - aweme_list.append(aweme_info.get("aweme_id","")) + aweme_list.append(aweme_info.get("aweme_id", "")) await douyin.update_douyin_aweme(aweme_item=aweme_info) utils.logger.info(f"keyword:{keyword}, aweme_list:{aweme_list}") - # await self.batch_get_note_comments(aweme_list) + await self.batch_get_note_comments(aweme_list) async def batch_get_note_comments(self, aweme_list: List[str]): task_list: List[Task] = [] + _semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) for aweme_id in aweme_list: - task = 
asyncio.create_task(self.get_comments(aweme_id), name=aweme_id) + task = asyncio.create_task(self.get_comments(aweme_id, _semaphore), name=aweme_id) task_list.append(task) - await asyncio.wait(task_list) + await asyncio.gather(*task_list, return_exceptions=True) # gather() accepts an empty task list; asyncio.wait([]) raises ValueError when a keyword yields no awemes - async def get_comments(self, aweme_id: str): - try: - await self.dy_client.get_aweme_all_comments( - aweme_id=aweme_id, - callback=douyin.batch_update_dy_aweme_comments - ) - utils.logger.info(f"aweme_id: {aweme_id} comments have all been obtained completed ...") - except DataFetchError as e: - utils.logger.error(f"aweme_id: {aweme_id} get comments failed, error: {e}") + async def get_comments(self, aweme_id: str, semaphore: "asyncio.Semaphore"): + async with semaphore: + try: + await self.dy_client.get_aweme_all_comments( + aweme_id=aweme_id, + callback=douyin.batch_update_dy_aweme_comments + ) + utils.logger.info(f"aweme_id: {aweme_id} comments have all been obtained completed ...") + except DataFetchError as e: + utils.logger.error(f"aweme_id: {aweme_id} get comments failed, error: {e}") def create_proxy_info(self) -> Tuple[Optional[str], Optional[Dict], Optional[str]]: """Create proxy info for playwright and httpx""" @@ -116,7 +118,7 @@ class DouYinCrawler(AbstractCrawler): return None, None, None # phone: 13012345671 ip_proxy: 111.122.xx.xx1:8888 - phone, ip_proxy = self.account_pool.get_account() # type: ignore + phone, ip_proxy = self.account_pool.get_account() # type: ignore playwright_proxy = { "server": f"{config.IP_PROXY_PROTOCOL}{ip_proxy}", "username": config.IP_PROXY_USER, @@ -127,7 +129,7 @@ class DouYinCrawler(AbstractCrawler): async def create_douyin_client(self, httpx_proxy: Optional[str]) -> DOUYINClient: """Create douyin client""" - cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies()) # type: ignore + cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies()) # type: ignore douyin_client = DOUYINClient( proxies=httpx_proxy, headers={ @@ -152,18 +154,19 @@ class 
DouYinCrawler(AbstractCrawler): ) -> BrowserContext: """Launch browser and create browser context""" if config.SAVE_LOGIN_STATE: - user_data_dir = os.path.join(os.getcwd(), "browser_data", config.USER_DATA_DIR % self.command_args.platform) # type: ignore + user_data_dir = os.path.join(os.getcwd(), "browser_data", + config.USER_DATA_DIR % self.command_args.platform) # type: ignore browser_context = await chromium.launch_persistent_context( user_data_dir=user_data_dir, accept_downloads=True, headless=headless, - proxy=playwright_proxy, # type: ignore + proxy=playwright_proxy, # type: ignore viewport={"width": 1920, "height": 1080}, user_agent=user_agent - ) # type: ignore + ) # type: ignore return browser_context else: - browser = await chromium.launch(headless=headless, proxy=playwright_proxy) # type: ignore + browser = await chromium.launch(headless=headless, proxy=playwright_proxy) # type: ignore browser_context = await browser.new_context( viewport={"width": 1920, "height": 1080}, user_agent=user_agent diff --git a/media_platform/xhs/client.py b/media_platform/xhs/client.py index 418e361..67fd113 100644 --- a/media_platform/xhs/client.py +++ b/media_platform/xhs/client.py @@ -83,12 +83,15 @@ class XHSClient: async def ping(self) -> bool: """get a note to check if login state is ok""" utils.logger.info("begin to ping xhs...") - note_id = "5e5cb38a000000000100185e" + ping_flag = False try: - note_card: Dict = await self.get_note_by_id(note_id) - return note_card.get("note_id") == note_id - except Exception: - return False + note_card: Dict = await self.get_note_by_keyword(keyword="小红书") + if note_card.get("items"): + ping_flag = True + except Exception as e: + utils.logger.error(f"ping xhs failed: {e}") + ping_flag = False + return ping_flag async def update_cookies(self, browser_context: BrowserContext): cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies()) diff --git a/media_platform/xhs/core.py b/media_platform/xhs/core.py index 
f7225d6..fc1936b 100644 --- a/media_platform/xhs/core.py +++ b/media_platform/xhs/core.py @@ -15,7 +15,7 @@ from tools import utils from .exception import * from .login import XHSLogin from .client import XHSClient -from models import xhs as xhs_model +from models import xiaohongshu as xhs_model from base.base_crawler import AbstractCrawler from base.proxy_account_pool import AccountPool diff --git a/models/__init__.py b/models/__init__.py index e69de29..d491f01 100644 --- a/models/__init__.py +++ b/models/__init__.py @@ -0,0 +1,3 @@ +from .douyin import * +from .xiaohongshu import * + diff --git a/models/douyin.py b/models/douyin.py new file mode 100644 index 0000000..1f453ea --- /dev/null +++ b/models/douyin.py @@ -0,0 +1,133 @@ +import json +from typing import Dict, List + +from tortoise.models import Model +from tortoise import fields + +import config +from tools import utils + + +class DouyinBaseModel(Model): + id = fields.IntField(pk=True, autoincrement=True, description="自增ID") + user_id = fields.CharField(null=True, max_length=64, description="用户ID") + sec_uid = fields.CharField(null=True, max_length=128, description="用户sec_uid") + short_user_id = fields.CharField(null=True, max_length=64, description="用户短ID") + user_unique_id = fields.CharField(null=True, max_length=64, description="用户唯一ID") + nickname = fields.CharField(null=True, max_length=64, description="用户昵称") + avatar = fields.CharField(null=True, max_length=255, description="用户头像地址") + user_signature = fields.CharField(null=True, max_length=500, description="用户签名") + ip_location = fields.CharField(null=True, max_length=255, description="评论时的IP地址") + add_ts = fields.BigIntField(description="记录添加时间戳") + last_modify_ts = fields.BigIntField(description="记录最后修改时间戳") + + class Meta: + abstract = True + + +class DouyinAweme(DouyinBaseModel): + aweme_id = fields.CharField(max_length=64, index=True, description="视频ID") + aweme_type = fields.CharField(max_length=16, description="视频类型") + title = 
fields.CharField(null=True, max_length=500, description="视频标题") + desc = fields.TextField(null=True, description="视频描述") + create_time = fields.BigIntField(description="视频发布时间戳", index=True) + liked_count = fields.CharField(null=True, max_length=16, description="视频点赞数") + collected_count = fields.CharField(null=True, max_length=16, description="视频收藏数") # written by update_douyin_aweme; QuerySet.update() rejects keys that are not model fields + comment_count = fields.CharField(null=True, max_length=16, description="视频评论数") + share_count = fields.CharField(null=True, max_length=16, description="视频分享数") + + class Meta: + table = "douyin_aweme" + table_description = "抖音视频" + + def __str__(self): + return f"{self.aweme_id} - {self.title}" + + +class DouyinAwemeComment(DouyinBaseModel): + comment_id = fields.CharField(max_length=64, index=True, description="评论ID") + aweme_id = fields.CharField(max_length=64, index=True, description="视频ID") + content = fields.TextField(null=True, description="评论内容") + create_time = fields.BigIntField(description="评论时间戳") + sub_comment_count = fields.CharField(max_length=16, description="评论回复数") + + class Meta: + table = "douyin_aweme_comment" + table_description = "抖音视频评论" + + def __str__(self): + return f"{self.comment_id} - {self.content}" + + +async def update_douyin_aweme(aweme_item: Dict): + aweme_id = aweme_item.get("aweme_id") + user_info = aweme_item.get("author", {}) + interact_info = aweme_item.get("statistics", {}) + local_db_item = { + "aweme_id": aweme_id, + "aweme_type": aweme_item.get("aweme_type"), + "title": aweme_item.get("desc", ""), + "desc": aweme_item.get("desc", ""), + "create_time": aweme_item.get("create_time"), + "user_id": user_info.get("uid"), + "sec_uid": user_info.get("sec_uid"), + "short_user_id": user_info.get("short_id"), + "user_unique_id": user_info.get("unique_id"), + "user_signature": user_info.get("signature"), + "nickname": user_info.get("nickname"), + "avatar": user_info.get("avatar_thumb", {}).get("url_list", [""])[0], + "liked_count": interact_info.get("digg_count"), + "collected_count": interact_info.get("collect_count"), + "comment_count": 
interact_info.get("comment_count"), + "share_count": interact_info.get("share_count"), + "ip_location": aweme_item.get("ip_label", ""), + "last_modify_ts": utils.get_current_timestamp(), + } + print(f"douyin aweme id:{aweme_id}, title:{local_db_item.get('title')}") + if config.IS_SAVED_DATABASED: + if not await DouyinAweme.filter(aweme_id=aweme_id).exists(): + local_db_item["add_ts"] = utils.get_current_timestamp() + await DouyinAweme.create(**local_db_item) + else: + await DouyinAweme.filter(aweme_id=aweme_id).update(**local_db_item) + + +async def batch_update_dy_aweme_comments(aweme_id: str, comments: List[Dict]): + if not comments: + return + for comment_item in comments: + await update_dy_aweme_comment(aweme_id, comment_item) + + +async def update_dy_aweme_comment(aweme_id: str, comment_item: Dict): + comment_aweme_id = comment_item.get("aweme_id") + if aweme_id != comment_aweme_id: + print(f"comment_aweme_id: {comment_aweme_id} != aweme_id: {aweme_id}") + return + user_info = comment_item.get("user", {}) + comment_id = comment_item.get("cid") + avatar_info = user_info.get("avatar_medium", {}) or user_info.get("avatar_300x300", {}) or user_info.get( + "avatar_168x168", {}) or user_info.get("avatar_thumb", {}) or {} + local_db_item = { + "comment_id": comment_id, + "create_time": comment_item.get("create_time"), + "ip_location": comment_item.get("ip_label", ""), + "aweme_id": aweme_id, + "content": comment_item.get("text"), # text_extra is not stored: DouyinAwemeComment declares no field for it, and update() would raise FieldError + "user_id": user_info.get("uid"), + "sec_uid": user_info.get("sec_uid"), + "short_user_id": user_info.get("short_id"), + "user_unique_id": user_info.get("unique_id"), + "user_signature": user_info.get("signature"), + "nickname": user_info.get("nickname"), + "avatar": avatar_info.get("url_list", [""])[0], + "sub_comment_count": comment_item.get("reply_comment_total", 0), + "last_modify_ts": utils.get_current_timestamp(), + } + print(f"douyin aweme comment: {comment_id}, 
content: {local_db_item.get('content')}") + if config.IS_SAVED_DATABASED: + if not await DouyinAwemeComment.filter(comment_id=comment_id).exists(): + local_db_item["add_ts"] = utils.get_current_timestamp() + await DouyinAwemeComment.create(**local_db_item) + else: + await DouyinAwemeComment.filter(comment_id=comment_id).update(**local_db_item) diff --git a/models/douyin/__init__.py b/models/douyin/__init__.py deleted file mode 100644 index 2b1372c..0000000 --- a/models/douyin/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .m_douyin import * diff --git a/models/douyin/m_douyin.py b/models/douyin/m_douyin.py deleted file mode 100644 index 53f36c1..0000000 --- a/models/douyin/m_douyin.py +++ /dev/null @@ -1,64 +0,0 @@ -import json -from typing import Dict, List - -from tools import utils - - -async def update_douyin_aweme(aweme_item: Dict): - aweme_id = aweme_item.get("aweme_id") - user_info = aweme_item.get("author", {}) - local_db_item = { - "aweme_id": aweme_id, - "aweme_type": aweme_item.get("aweme_type"), - "title": aweme_item.get("desc", ""), - "desc": aweme_item.get("desc", ""), - "create_time": aweme_item.get("create_time"), - "user_id": user_info.get("uid"), - "sec_uid": user_info.get("sec_uid"), - "short_user_id": user_info.get("short_id"), - "user_unique_id": user_info.get("unique_id"), - "user_signature": user_info.get("signature"), - "nickname": user_info.get("nickname"), - "avatar": user_info.get("avatar_thumb", {}).get("url_list", [""])[0], - "ip_location": aweme_item.get("ip_label", ""), - "last_modify_ts": utils.get_current_timestamp(), - } - # do something ... 
- print(f"douyin aweme id:{aweme_id}, title:{local_db_item.get('title')}") - - -async def batch_update_dy_aweme_comments(aweme_id: str, comments: List[Dict]): - if not comments: - return - for comment_item in comments: - await update_dy_aweme_comment(aweme_id, comment_item) - - -async def update_dy_aweme_comment(aweme_id: str, comment_item: Dict): - comment_aweme_id = comment_item.get("aweme_id") - if aweme_id != comment_aweme_id: - print(f"comment_aweme_id: {comment_aweme_id} != aweme_id: {aweme_id}") - return - user_info = comment_item.get("user", {}) - comment_id = comment_item.get("cid") - avatar_info = user_info.get("avatar_medium", {}) or user_info.get("avatar_300x300", {}) or user_info.get( - "avatar_168x168", {}) or user_info.get("avatar_thumb", {}) or {} - local_db_item = { - "comment_id": comment_id, - "create_time": comment_item.get("create_time"), - "ip_location": comment_item.get("ip_label", ""), - "aweme_id": aweme_id, - "content": comment_item.get("text"), - "content_extra": json.dumps(comment_item.get("text_extra", [])), - "user_id": user_info.get("uid"), - "sec_uid": user_info.get("sec_uid"), - "short_user_id": user_info.get("short_id"), - "user_unique_id": user_info.get("unique_id"), - "user_signature": user_info.get("signature"), - "nickname": user_info.get("nickname"), - "avatar": avatar_info.get("url_list", [""])[0], - "sub_comment_count": comment_item.get("reply_comment_total", 0), - "last_modify_ts": utils.get_current_timestamp(), - } - # do something ... 
- print(f"douyin aweme comment: {comment_id}, content: {local_db_item.get('content')}") diff --git a/models/xhs/__init__.py b/models/xhs/__init__.py deleted file mode 100644 index 1f15b57..0000000 --- a/models/xhs/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .m_xhs import * diff --git a/models/xhs/m_xhs.py b/models/xhs/m_xhs.py deleted file mode 100644 index 6e66fd1..0000000 --- a/models/xhs/m_xhs.py +++ /dev/null @@ -1,46 +0,0 @@ -from typing import Dict, List - -from tools import utils - - -async def update_xhs_note(note_item: Dict): - note_id = note_item.get("note_id") - user_info = note_item.get("user", {}) - interact_info = note_item.get("interact_info") - image_list: List[Dict]= note_item.get("image_list", []) - - local_db_item = { - "note_id": note_item.get("note_id"), - "type": note_item.get("type"), - "title": note_item.get("title"), - "desc": note_item.get("desc", ""), - "time": note_item.get("time"), - "last_update_time": note_item.get("last_update_time", 0), - "user_id": user_info.get("user_id"), - "nickname": user_info.get("nickname"), - "avatar": user_info.get("avatar"), - "ip_location": note_item.get("ip_location", ""), - "image_list": ','.join([img.get('url','') for img in image_list]), - "last_modify_ts": utils.get_current_timestamp(), - } - # do something ... - print("xhs note:", local_db_item) - - -async def update_xhs_note_comment(note_id: str, comment_item: Dict): - user_info = comment_item.get("user_info", {}) - comment_id = comment_item.get("id") - local_db_item = { - "comment_id": comment_id, - "create_time": comment_item.get("create_time"), - "ip_location": comment_item.get("ip_location"), - "note_id": note_id, - "content": comment_item.get("content"), - "user_id": user_info.get("user_id"), - "nickname": user_info.get("nickname"), - "avatar": user_info.get("image"), - "sub_comment_count": comment_item.get("sub_comment_count"), - "last_modify_ts": utils.get_current_timestamp(), - } - # do something ... 
- print("xhs note comment:", local_db_item) diff --git a/models/xiaohongshu.py b/models/xiaohongshu.py new file mode 100644 index 0000000..c558084 --- /dev/null +++ b/models/xiaohongshu.py @@ -0,0 +1,113 @@ +from typing import List, Dict + +from tortoise.models import Model +from tortoise import fields + +import config +from tools import utils + + +class XhsBaseModel(Model): + id = fields.IntField(pk=True, autoincrement=True, description="自增ID") + user_id = fields.CharField(max_length=64, description="用户ID") + nickname = fields.CharField(null=True, max_length=64, description="用户昵称") + avatar = fields.CharField(null=True, max_length=255, description="用户头像地址") + ip_location = fields.CharField(null=True, max_length=255, description="评论时的IP地址") + add_ts = fields.BigIntField(description="记录添加时间戳") + last_modify_ts = fields.BigIntField(description="记录最后修改时间戳") + + class Meta: + abstract = True + + +class XHSNote(XhsBaseModel): + note_id = fields.CharField(max_length=64, index=True, description="笔记ID") + type = fields.CharField(null=True, max_length=16, description="笔记类型(normal | video)") + title = fields.CharField(null=True, max_length=255, description="笔记标题") + desc = fields.TextField(null=True, description="笔记描述") + time = fields.BigIntField(description="笔记发布时间戳", index=True) + last_update_time = fields.BigIntField(description="笔记最后更新时间戳") + liked_count = fields.CharField(null=True, max_length=16, description="笔记点赞数") + collected_count = fields.CharField(null=True, max_length=16, description="笔记收藏数") + comment_count = fields.CharField(null=True, max_length=16, description="笔记评论数") + share_count = fields.CharField(null=True, max_length=16, description="笔记分享数") + image_list = fields.TextField(null=True, description="笔记封面图片列表") + + class Meta: + table = "xhs_note" + table_description = "小红书笔记" + + def __str__(self): + return f"{self.note_id} - {self.title}" + + +class XHSNoteComment(XhsBaseModel): + comment_id = fields.CharField(max_length=64, index=True, 
description="评论ID") + create_time = fields.BigIntField(index=True, description="评论时间戳") + note_id = fields.CharField(max_length=64, description="笔记ID") + content = fields.TextField(description="评论内容") + sub_comment_count = fields.IntField(description="子评论数量") + + class Meta: + table = "xhs_note_comment" + table_description = "小红书笔记评论" + + def __str__(self): + return f"{self.comment_id} - {self.content}" + + +async def update_xhs_note(note_item: Dict): + note_id = note_item.get("note_id") + user_info = note_item.get("user", {}) + interact_info = note_item.get("interact_info", {}) + image_list: List[Dict] = note_item.get("image_list", []) + + local_db_item = { + "note_id": note_item.get("note_id"), + "type": note_item.get("type"), + "title": note_item.get("title"), + "desc": note_item.get("desc", ""), + "time": note_item.get("time"), + "last_update_time": note_item.get("last_update_time", 0), + "user_id": user_info.get("user_id"), + "nickname": user_info.get("nickname"), + "avatar": user_info.get("avatar"), + "liked_count": interact_info.get("liked_count"), + "collected_count": interact_info.get("collected_count"), + "comment_count": interact_info.get("comment_count"), + "share_count": interact_info.get("share_count"), + "ip_location": note_item.get("ip_location", ""), + "image_list": ','.join([img.get('url', '') for img in image_list]), + "last_modify_ts": utils.get_current_timestamp(), + } + print("xhs note:", local_db_item) + if config.IS_SAVED_DATABASED: + if not await XHSNote.filter(note_id=note_id).first(): + local_db_item["add_ts"] = utils.get_current_timestamp() + await XHSNote.create(**local_db_item) + else: + await XHSNote.filter(note_id=note_id).update(**local_db_item) + + +async def update_xhs_note_comment(note_id: str, comment_item: Dict): + user_info = comment_item.get("user_info", {}) + comment_id = comment_item.get("id") + local_db_item = { + "comment_id": comment_id, + "create_time": comment_item.get("create_time"), + "ip_location": 
comment_item.get("ip_location"), + "note_id": note_id, + "content": comment_item.get("content"), + "user_id": user_info.get("user_id"), + "nickname": user_info.get("nickname"), + "avatar": user_info.get("image"), + "sub_comment_count": comment_item.get("sub_comment_count"), + "last_modify_ts": utils.get_current_timestamp(), + } + print("xhs note comment:", local_db_item) + if config.IS_SAVED_DATABASED: + if not await XHSNoteComment.filter(comment_id=comment_id).first(): + local_db_item["add_ts"] = utils.get_current_timestamp() + await XHSNoteComment.create(**local_db_item) + else: + await XHSNoteComment.filter(comment_id=comment_id).update(**local_db_item) diff --git a/requirements.txt b/requirements.txt index 25b142b..8be486a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,5 @@ tenacity==8.2.2 tornado==6.3.2 PyExecJS==1.5.1 opencv-python==4.7.0.72 +tortoise-orm[asyncmy]==0.19.3 +aerich==0.7.2 diff --git a/test/test_utils.py b/test/test_utils.py index 179f03a..92acb0d 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- + from tools import utils @@ -7,7 +8,3 @@ def test_convert_cookies(): cookie_dict = utils.convert_str_cookie_to_dict(xhs_cookies) assert cookie_dict.get("webId") == "1190c4d3cxxxx125xxx" assert cookie_dict.get("a1") == "x000101360" - - -if __name__ == '__main__': - test_convert_cookies() \ No newline at end of file