feat: 增加配置项支持自由选择数据是否保存到关系型数据库中

This commit is contained in:
Relakkes 2023-07-24 20:59:43 +08:00
parent 745e59c875
commit e75707443b
20 changed files with 339 additions and 169 deletions

View File

@ -21,10 +21,8 @@
- [x] 抖音登录二维码、手机号、cookies
- [x] 抖音滑块模拟滑动实现准确率不太OK
- [x] 支持登录成功后的上下文浏览器环境保留
- [x] 数据持久化到硬盘(关系型数据库)
## 待实现
- [ ] 数据持久化到硬盘
## 使用方法
@ -32,9 +30,13 @@
`pip install -r requirements.txt`
2. 安装playwright浏览器驱动
`playwright install`
3. 运行爬虫程序
3. 是否选择开启保存数据到DB中
如果选择开启,则需要配置数据库连接信息,`config/db_config.py` 中的 `IS_SAVED_DATABASED`和`RELATION_DB_URL` 变量
<br>再执行 `python db.py` 初始化数据库信息,生成相关的数据库表结构
4. 运行爬虫程序
`python main.py --platform xhs --lt qrcode`
4. 打开小红书扫二维码登录
5. 打开对应APP扫二维码登录
## 项目代码结构
@ -67,24 +69,16 @@ MediaCrawler
│ ├── help.py # 辅助函数
│ └── login.py # 登录实现
├── models
│ ├── douyin
│ │ └── m_douyin.py
│ └── xhs
│ └── m_xhs.py
│ ├── douyin.py # 抖音数据模型
│ └── xiaohongshu.py # 小红书数据模型
├── tools
│ └── utils.py # 工具函数
├── main.py # 程序入口
└── recv_sms_notification.py # 短信转发器的HTTP SERVER接口
```
## 数据持久化
## 小红书运行截图
![小红书运行截图](https://s2.loli.net/2023/06/09/PVBe3X5vf4yncrd.gif)
## 抖音运行截图
- ![抖音运行截图](https://s2.loli.net/2023/06/25/GXfkeLhpTyNiAqH.gif)
![数据持久化](https://s2.loli.net/2023/07/24/ZTcGWz8jPAy7b5M.png)
## 支持一下

View File

@ -1,2 +1,3 @@
from .base_config import *
from .account_config import *
from .db_config import *

View File

@ -2,11 +2,7 @@
PLATFORM = "xhs"
KEYWORDS = "健身,旅游"
LOGIN_TYPE = "qrcode" # qrcode or phone or cookies
COOKIES = "web_session=xxxxcfed1566xxxxxxxxxxxxxxxxxxx;" # if platform is xhs, pleas set only web_session cookie attr
# redis config
REDIS_DB_HOST = "redis://127.0.0.1" # your redis host
REDIS_DB_PWD = "123456" # your redis password
COOKIES = "web_session=xxxxcfed1566xxxxxxxxxxxxxxxxxxx;"  # if platform is xhs, please set only web_session cookie attr
# enable ip proxy
ENABLE_IP_PROXY = False
@ -18,7 +14,7 @@ RETRY_INTERVAL = 60 * 30 # 30 minutes
HEADLESS = True
# save login state
SAVE_LOGIN_STATE = False
SAVE_LOGIN_STATE = True
# save user data dir
USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name

9
config/db_config.py Normal file
View File

@ -0,0 +1,9 @@
# redis config (used by the SMS-forwarding login flow)
REDIS_DB_HOST = "redis://127.0.0.1"  # your redis host
REDIS_DB_PWD = "123456"  # your redis password
# mysql config — Tortoise ORM connection URL consumed by db.init_db()
RELATION_DB_URL = "mysql://root:youdbpassword@localhost:3306/media_crawler"
# save data to database option
IS_SAVED_DATABASED = True  # if you want to save data to database, set True

24
db.py Normal file
View File

@ -0,0 +1,24 @@
from tortoise import Tortoise
from tortoise import run_async
from config.db_config import *
from tools import utils
async def init_db(create_db: bool = False) -> None:
    """Initialise the Tortoise ORM connection for the `models` package.

    Args:
        create_db: when True, ask Tortoise to create the target database
            itself before connecting (first-time setup only).
    """
    init_kwargs = {
        "db_url": RELATION_DB_URL,
        "modules": {"models": ["models"]},
        # `_create_db` is Tortoise's switch for creating the database on connect.
        "_create_db": create_db,
    }
    await Tortoise.init(**init_kwargs)
async def init():
    """One-shot bootstrap: create the database, then generate every table schema."""
    await init_db(create_db=True)
    await Tortoise.generate_schemas()
    utils.logger.info("Init DB Success!")
# Run `python db.py` once to create the database and its tables.
if __name__ == '__main__':
    run_async(init())

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.4 MiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.9 MiB

View File

@ -2,8 +2,8 @@ import sys
import asyncio
import argparse
import db
import config
from tools import utils
from base import proxy_account_pool
from media_platform.douyin import DouYinCrawler
from media_platform.xhs import XiaoHongShuCrawler
@ -29,6 +29,10 @@ async def main():
# init account pool
account_pool = proxy_account_pool.create_account_pool()
# init db
if config.IS_SAVED_DATABASED:
await db.init_db()
args = parser.parse_args()
crawler = CrawlerFactory().create_crawler(platform=args.platform)
crawler.init_config(

View File

@ -23,12 +23,12 @@ class DouYinCrawler(AbstractCrawler):
dy_client: DOUYINClient
def __init__(self) -> None:
self.browser_context: Optional[BrowserContext] = None # type: ignore
self.context_page: Optional[Page] = None # type: ignore
self.browser_context: Optional[BrowserContext] = None # type: ignore
self.context_page: Optional[Page] = None # type: ignore
self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36" # fixed
self.index_url = "https://www.douyin.com"
self.command_args: Optional[Namespace] = None # type: ignore
self.account_pool: Optional[AccountPool] = None # type: ignore
self.command_args: Optional[Namespace] = None # type: ignore
self.account_pool: Optional[AccountPool] = None # type: ignore
def init_config(self, **kwargs):
for key, value in kwargs.items():
@ -53,7 +53,7 @@ class DouYinCrawler(AbstractCrawler):
self.dy_client = await self.create_douyin_client(httpx_proxy)
if not await self.dy_client.ping(browser_context=self.browser_context):
login_obj = DouYinLogin(
login_type=self.command_args.lt, # type: ignore
login_type=self.command_args.lt, # type: ignore
login_phone=account_phone,
browser_context=self.browser_context,
context_page=self.context_page,
@ -88,27 +88,29 @@ class DouYinCrawler(AbstractCrawler):
post_item.get("aweme_mix_info", {}).get("mix_items")[0]
except TypeError:
continue
aweme_list.append(aweme_info.get("aweme_id",""))
aweme_list.append(aweme_info.get("aweme_id", ""))
await douyin.update_douyin_aweme(aweme_item=aweme_info)
utils.logger.info(f"keyword:{keyword}, aweme_list:{aweme_list}")
# await self.batch_get_note_comments(aweme_list)
await self.batch_get_note_comments(aweme_list)
async def batch_get_note_comments(self, aweme_list: List[str]):
task_list: List[Task] = []
_semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
for aweme_id in aweme_list:
task = asyncio.create_task(self.get_comments(aweme_id), name=aweme_id)
task = asyncio.create_task(self.get_comments(aweme_id, _semaphore), name=aweme_id)
task_list.append(task)
await asyncio.wait(task_list)
async def get_comments(self, aweme_id: str):
try:
await self.dy_client.get_aweme_all_comments(
aweme_id=aweme_id,
callback=douyin.batch_update_dy_aweme_comments
)
utils.logger.info(f"aweme_id: {aweme_id} comments have all been obtained completed ...")
except DataFetchError as e:
utils.logger.error(f"aweme_id: {aweme_id} get comments failed, error: {e}")
async def get_comments(self, aweme_id: str, semaphore: "asyncio.Semaphore"):
async with semaphore:
try:
await self.dy_client.get_aweme_all_comments(
aweme_id=aweme_id,
callback=douyin.batch_update_dy_aweme_comments
)
utils.logger.info(f"aweme_id: {aweme_id} comments have all been obtained completed ...")
except DataFetchError as e:
utils.logger.error(f"aweme_id: {aweme_id} get comments failed, error: {e}")
def create_proxy_info(self) -> Tuple[Optional[str], Optional[Dict], Optional[str]]:
"""Create proxy info for playwright and httpx"""
@ -116,7 +118,7 @@ class DouYinCrawler(AbstractCrawler):
return None, None, None
# phone: 13012345671 ip_proxy: 111.122.xx.xx1:8888
phone, ip_proxy = self.account_pool.get_account() # type: ignore
phone, ip_proxy = self.account_pool.get_account() # type: ignore
playwright_proxy = {
"server": f"{config.IP_PROXY_PROTOCOL}{ip_proxy}",
"username": config.IP_PROXY_USER,
@ -127,7 +129,7 @@ class DouYinCrawler(AbstractCrawler):
async def create_douyin_client(self, httpx_proxy: Optional[str]) -> DOUYINClient:
"""Create douyin client"""
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies()) # type: ignore
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies()) # type: ignore
douyin_client = DOUYINClient(
proxies=httpx_proxy,
headers={
@ -152,18 +154,19 @@ class DouYinCrawler(AbstractCrawler):
) -> BrowserContext:
"""Launch browser and create browser context"""
if config.SAVE_LOGIN_STATE:
user_data_dir = os.path.join(os.getcwd(), "browser_data", config.USER_DATA_DIR % self.command_args.platform) # type: ignore
user_data_dir = os.path.join(os.getcwd(), "browser_data",
config.USER_DATA_DIR % self.command_args.platform) # type: ignore
browser_context = await chromium.launch_persistent_context(
user_data_dir=user_data_dir,
accept_downloads=True,
headless=headless,
proxy=playwright_proxy, # type: ignore
proxy=playwright_proxy, # type: ignore
viewport={"width": 1920, "height": 1080},
user_agent=user_agent
) # type: ignore
) # type: ignore
return browser_context
else:
browser = await chromium.launch(headless=headless, proxy=playwright_proxy) # type: ignore
browser = await chromium.launch(headless=headless, proxy=playwright_proxy) # type: ignore
browser_context = await browser.new_context(
viewport={"width": 1920, "height": 1080},
user_agent=user_agent

View File

@ -83,12 +83,15 @@ class XHSClient:
async def ping(self) -> bool:
"""get a note to check if login state is ok"""
utils.logger.info("begin to ping xhs...")
note_id = "5e5cb38a000000000100185e"
ping_flag = False
try:
note_card: Dict = await self.get_note_by_id(note_id)
return note_card.get("note_id") == note_id
except Exception:
return False
note_card: Dict = await self.get_note_by_keyword(keyword="小红书")
if note_card.get("items"):
ping_flag = True
except Exception as e:
utils.logger.error(f"ping xhs failed: {e}")
ping_flag = False
return ping_flag
async def update_cookies(self, browser_context: BrowserContext):
cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies())

View File

@ -15,7 +15,7 @@ from tools import utils
from .exception import *
from .login import XHSLogin
from .client import XHSClient
from models import xhs as xhs_model
from models import xiaohongshu as xhs_model
from base.base_crawler import AbstractCrawler
from base.proxy_account_pool import AccountPool

View File

@ -0,0 +1,3 @@
from .douyin import *
from .xiaohongshu import *

133
models/douyin.py Normal file
View File

@ -0,0 +1,133 @@
import json
from typing import Dict, List
from tortoise.models import Model
from tortoise import fields
import config
from tools import utils
class DouyinBaseModel(Model):
    """Abstract base model: Douyin author/user fields shared by videos and comments,
    plus add/last-modify bookkeeping timestamps."""
    id = fields.IntField(pk=True, autoincrement=True, description="自增ID")
    user_id = fields.CharField(null=True, max_length=64, description="用户ID")
    sec_uid = fields.CharField(null=True, max_length=128, description="用户sec_uid")
    short_user_id = fields.CharField(null=True, max_length=64, description="用户短ID")
    user_unique_id = fields.CharField(null=True, max_length=64, description="用户唯一ID")
    nickname = fields.CharField(null=True, max_length=64, description="用户昵称")
    avatar = fields.CharField(null=True, max_length=255, description="用户头像地址")
    user_signature = fields.CharField(null=True, max_length=500, description="用户签名")
    ip_location = fields.CharField(null=True, max_length=255, description="评论时的IP地址")
    add_ts = fields.BigIntField(description="记录添加时间戳")
    last_modify_ts = fields.BigIntField(description="记录最后修改时间戳")
    class Meta:
        # Abstract: no table is created for this base model itself.
        abstract = True
class DouyinAweme(DouyinBaseModel):
    """A Douyin video (aweme) record, keyed for lookup by `aweme_id`."""
    aweme_id = fields.CharField(max_length=64, index=True, description="视频ID")
    aweme_type = fields.CharField(max_length=16, description="视频类型")
    title = fields.CharField(null=True, max_length=500, description="视频标题")
    desc = fields.TextField(null=True, description="视频描述")
    create_time = fields.BigIntField(description="视频发布时间戳", index=True)
    liked_count = fields.CharField(null=True, max_length=16, description="视频点赞数")
    # Fix: update_douyin_aweme() writes a "collected_count" key; without this
    # field, QuerySet.update(**local_db_item) raises FieldError (unknown field).
    collected_count = fields.CharField(null=True, max_length=16, description="视频收藏数")
    comment_count = fields.CharField(null=True, max_length=16, description="视频评论数")
    share_count = fields.CharField(null=True, max_length=16, description="视频分享数")
    class Meta:
        table = "douyin_aweme"
        table_description = "抖音视频"
    def __str__(self):
        return f"{self.aweme_id} - {self.title}"
class DouyinAwemeComment(DouyinBaseModel):
    """A single comment on a Douyin video."""
    comment_id = fields.CharField(max_length=64, index=True, description="评论ID")
    aweme_id = fields.CharField(max_length=64, index=True, description="视频ID")
    content = fields.TextField(null=True, description="评论内容")
    # Fix: update_dy_aweme_comment() writes a "content_extra" key (JSON of
    # text_extra); without this field QuerySet.update(**local_db_item) raises
    # FieldError for the unknown column.
    content_extra = fields.TextField(null=True, description="评论额外内容(text_extra JSON)")
    create_time = fields.BigIntField(description="评论时间戳")
    sub_comment_count = fields.CharField(max_length=16, description="评论回复数")
    class Meta:
        table = "douyin_aweme_comment"
        table_description = "抖音视频评论"
    def __str__(self):
        return f"{self.comment_id} - {self.content}"
async def update_douyin_aweme(aweme_item: Dict):
    """Insert or update one Douyin video record.

    Flattens the raw API payload (top-level item + `author` + `statistics`)
    into a flat dict, then — only when config.IS_SAVED_DATABASED is on —
    upserts it keyed by `aweme_id`.

    Args:
        aweme_item: raw aweme dict as returned by the Douyin search API.
    """
    aweme_id = aweme_item.get("aweme_id")
    user_info = aweme_item.get("author", {})
    interact_info = aweme_item.get("statistics", {})
    local_db_item = {
        "aweme_id": aweme_id,
        "aweme_type": aweme_item.get("aweme_type"),
        "title": aweme_item.get("desc", ""),  # Douyin has no title field; reuse desc
        "desc": aweme_item.get("desc", ""),
        "create_time": aweme_item.get("create_time"),
        "user_id": user_info.get("uid"),
        "sec_uid": user_info.get("sec_uid"),
        "short_user_id": user_info.get("short_id"),
        "user_unique_id": user_info.get("unique_id"),
        "user_signature": user_info.get("signature"),
        "nickname": user_info.get("nickname"),
        "avatar": user_info.get("avatar_thumb", {}).get("url_list", [""])[0],
        "liked_count": interact_info.get("digg_count"),
        # NOTE(review): "collected_count" must exist on DouyinAweme or the
        # update() below raises FieldError — confirm the model defines it.
        "collected_count": interact_info.get("collect_count"),
        "comment_count": interact_info.get("comment_count"),
        "share_count": interact_info.get("share_count"),
        "ip_location": aweme_item.get("ip_label", ""),
        "last_modify_ts": utils.get_current_timestamp(),
    }
    # Use the project logger instead of bare print() for consistency.
    utils.logger.info(f"douyin aweme id:{aweme_id}, title:{local_db_item.get('title')}")
    if config.IS_SAVED_DATABASED:
        if not await DouyinAweme.filter(aweme_id=aweme_id).exists():
            # First sighting: stamp the insertion time as well.
            local_db_item["add_ts"] = utils.get_current_timestamp()
            await DouyinAweme.create(**local_db_item)
        else:
            await DouyinAweme.filter(aweme_id=aweme_id).update(**local_db_item)
async def batch_update_dy_aweme_comments(aweme_id: str, comments: List[Dict]):
    """Persist every comment of one video, delegating each to update_dy_aweme_comment."""
    for single_comment in comments or []:
        await update_dy_aweme_comment(aweme_id, single_comment)
async def update_dy_aweme_comment(aweme_id: str, comment_item: Dict):
    """Insert or update one Douyin comment record.

    Skips the comment if its embedded aweme_id does not match the expected
    video; otherwise flattens the payload and — when config.IS_SAVED_DATABASED
    is on — upserts it keyed by comment id (`cid`).

    Args:
        aweme_id: the video the comment is expected to belong to.
        comment_item: raw comment dict from the Douyin comment API.
    """
    comment_aweme_id = comment_item.get("aweme_id")
    if aweme_id != comment_aweme_id:
        # Use the project logger instead of bare print() for consistency.
        utils.logger.error(f"comment_aweme_id: {comment_aweme_id} != aweme_id: {aweme_id}")
        return
    user_info = comment_item.get("user", {})
    comment_id = comment_item.get("cid")
    # Pick the first avatar variant the API happened to return.
    avatar_info = user_info.get("avatar_medium", {}) or user_info.get("avatar_300x300", {}) or user_info.get(
        "avatar_168x168", {}) or user_info.get("avatar_thumb", {}) or {}
    local_db_item = {
        "comment_id": comment_id,
        "create_time": comment_item.get("create_time"),
        "ip_location": comment_item.get("ip_label", ""),
        "aweme_id": aweme_id,
        "content": comment_item.get("text"),
        # NOTE(review): "content_extra" must exist on DouyinAwemeComment or
        # the update() below raises FieldError — confirm the model defines it.
        "content_extra": json.dumps(comment_item.get("text_extra", [])),
        "user_id": user_info.get("uid"),
        "sec_uid": user_info.get("sec_uid"),
        "short_user_id": user_info.get("short_id"),
        "user_unique_id": user_info.get("unique_id"),
        "user_signature": user_info.get("signature"),
        "nickname": user_info.get("nickname"),
        "avatar": avatar_info.get("url_list", [""])[0],
        "sub_comment_count": comment_item.get("reply_comment_total", 0),
        "last_modify_ts": utils.get_current_timestamp(),
    }
    utils.logger.info(f"douyin aweme comment: {comment_id}, content: {local_db_item.get('content')}")
    if config.IS_SAVED_DATABASED:
        if not await DouyinAwemeComment.filter(comment_id=comment_id).exists():
            # First sighting: stamp the insertion time as well.
            local_db_item["add_ts"] = utils.get_current_timestamp()
            await DouyinAwemeComment.create(**local_db_item)
        else:
            await DouyinAwemeComment.filter(comment_id=comment_id).update(**local_db_item)

View File

@ -1 +0,0 @@
from .m_douyin import *

View File

@ -1,64 +0,0 @@
import json
from typing import Dict, List
from tools import utils
async def update_douyin_aweme(aweme_item: Dict):
aweme_id = aweme_item.get("aweme_id")
user_info = aweme_item.get("author", {})
local_db_item = {
"aweme_id": aweme_id,
"aweme_type": aweme_item.get("aweme_type"),
"title": aweme_item.get("desc", ""),
"desc": aweme_item.get("desc", ""),
"create_time": aweme_item.get("create_time"),
"user_id": user_info.get("uid"),
"sec_uid": user_info.get("sec_uid"),
"short_user_id": user_info.get("short_id"),
"user_unique_id": user_info.get("unique_id"),
"user_signature": user_info.get("signature"),
"nickname": user_info.get("nickname"),
"avatar": user_info.get("avatar_thumb", {}).get("url_list", [""])[0],
"ip_location": aweme_item.get("ip_label", ""),
"last_modify_ts": utils.get_current_timestamp(),
}
# do something ...
print(f"douyin aweme id:{aweme_id}, title:{local_db_item.get('title')}")
async def batch_update_dy_aweme_comments(aweme_id: str, comments: List[Dict]):
if not comments:
return
for comment_item in comments:
await update_dy_aweme_comment(aweme_id, comment_item)
async def update_dy_aweme_comment(aweme_id: str, comment_item: Dict):
comment_aweme_id = comment_item.get("aweme_id")
if aweme_id != comment_aweme_id:
print(f"comment_aweme_id: {comment_aweme_id} != aweme_id: {aweme_id}")
return
user_info = comment_item.get("user", {})
comment_id = comment_item.get("cid")
avatar_info = user_info.get("avatar_medium", {}) or user_info.get("avatar_300x300", {}) or user_info.get(
"avatar_168x168", {}) or user_info.get("avatar_thumb", {}) or {}
local_db_item = {
"comment_id": comment_id,
"create_time": comment_item.get("create_time"),
"ip_location": comment_item.get("ip_label", ""),
"aweme_id": aweme_id,
"content": comment_item.get("text"),
"content_extra": json.dumps(comment_item.get("text_extra", [])),
"user_id": user_info.get("uid"),
"sec_uid": user_info.get("sec_uid"),
"short_user_id": user_info.get("short_id"),
"user_unique_id": user_info.get("unique_id"),
"user_signature": user_info.get("signature"),
"nickname": user_info.get("nickname"),
"avatar": avatar_info.get("url_list", [""])[0],
"sub_comment_count": comment_item.get("reply_comment_total", 0),
"last_modify_ts": utils.get_current_timestamp(),
}
# do something ...
print(f"douyin aweme comment: {comment_id}, content: {local_db_item.get('content')}")

View File

@ -1 +0,0 @@
from .m_xhs import *

View File

@ -1,46 +0,0 @@
from typing import Dict, List
from tools import utils
async def update_xhs_note(note_item: Dict):
note_id = note_item.get("note_id")
user_info = note_item.get("user", {})
interact_info = note_item.get("interact_info")
image_list: List[Dict]= note_item.get("image_list", [])
local_db_item = {
"note_id": note_item.get("note_id"),
"type": note_item.get("type"),
"title": note_item.get("title"),
"desc": note_item.get("desc", ""),
"time": note_item.get("time"),
"last_update_time": note_item.get("last_update_time", 0),
"user_id": user_info.get("user_id"),
"nickname": user_info.get("nickname"),
"avatar": user_info.get("avatar"),
"ip_location": note_item.get("ip_location", ""),
"image_list": ','.join([img.get('url','') for img in image_list]),
"last_modify_ts": utils.get_current_timestamp(),
}
# do something ...
print("xhs note:", local_db_item)
async def update_xhs_note_comment(note_id: str, comment_item: Dict):
user_info = comment_item.get("user_info", {})
comment_id = comment_item.get("id")
local_db_item = {
"comment_id": comment_id,
"create_time": comment_item.get("create_time"),
"ip_location": comment_item.get("ip_location"),
"note_id": note_id,
"content": comment_item.get("content"),
"user_id": user_info.get("user_id"),
"nickname": user_info.get("nickname"),
"avatar": user_info.get("image"),
"sub_comment_count": comment_item.get("sub_comment_count"),
"last_modify_ts": utils.get_current_timestamp(),
}
# do something ...
print("xhs note comment:", local_db_item)

113
models/xiaohongshu.py Normal file
View File

@ -0,0 +1,113 @@
from typing import List, Dict
from tortoise.models import Model
from tortoise import fields
import config
from tools import utils
class XhsBaseModel(Model):
    """Abstract base model: Xiaohongshu author fields shared by notes and
    comments, plus add/last-modify bookkeeping timestamps."""
    id = fields.IntField(pk=True, autoincrement=True, description="自增ID")
    user_id = fields.CharField(max_length=64, description="用户ID")
    nickname = fields.CharField(null=True, max_length=64, description="用户昵称")
    avatar = fields.CharField(null=True, max_length=255, description="用户头像地址")
    ip_location = fields.CharField(null=True, max_length=255, description="评论时的IP地址")
    add_ts = fields.BigIntField(description="记录添加时间戳")
    last_modify_ts = fields.BigIntField(description="记录最后修改时间戳")
    class Meta:
        # Abstract: no table is created for this base model itself.
        abstract = True
class XHSNote(XhsBaseModel):
    """A Xiaohongshu note record, keyed for lookup by `note_id`."""
    note_id = fields.CharField(max_length=64, index=True, description="笔记ID")
    type = fields.CharField(null=True, max_length=16, description="笔记类型(normal | video)")
    title = fields.CharField(null=True, max_length=255, description="笔记标题")
    desc = fields.TextField(null=True, description="笔记描述")
    time = fields.BigIntField(description="笔记发布时间戳", index=True)
    last_update_time = fields.BigIntField(description="笔记最后更新时间戳")
    liked_count = fields.CharField(null=True, max_length=16, description="笔记点赞数")
    collected_count = fields.CharField(null=True, max_length=16, description="笔记收藏数")
    comment_count = fields.CharField(null=True, max_length=16, description="笔记评论数")
    share_count = fields.CharField(null=True, max_length=16, description="笔记分享数")
    image_list = fields.TextField(null=True, description="笔记封面图片列表")
    class Meta:
        table = "xhs_note"
        table_description = "小红书笔记"
    def __str__(self):
        return f"{self.note_id} - {self.title}"
class XHSNoteComment(XhsBaseModel):
    """A single comment on a Xiaohongshu note."""
    comment_id = fields.CharField(max_length=64, index=True, description="评论ID")
    create_time = fields.BigIntField(index=True, description="评论时间戳")
    note_id = fields.CharField(max_length=64, description="笔记ID")
    content = fields.TextField(description="评论内容")
    sub_comment_count = fields.IntField(description="子评论数量")
    class Meta:
        table = "xhs_note_comment"
        table_description = "小红书笔记评论"
    def __str__(self):
        return f"{self.comment_id} - {self.content}"
async def update_xhs_note(note_item: Dict):
    """Insert or update one Xiaohongshu note record.

    Flattens the raw note payload (`user`, `interact_info`, `image_list`) into
    a flat dict, then — only when config.IS_SAVED_DATABASED is on — upserts it
    keyed by `note_id`.

    Args:
        note_item: raw note dict from the Xiaohongshu API.
    """
    note_id = note_item.get("note_id")
    user_info = note_item.get("user", {})
    interact_info = note_item.get("interact_info", {})
    image_list: List[Dict] = note_item.get("image_list", [])
    local_db_item = {
        "note_id": note_item.get("note_id"),
        "type": note_item.get("type"),
        "title": note_item.get("title"),
        "desc": note_item.get("desc", ""),
        "time": note_item.get("time"),
        "last_update_time": note_item.get("last_update_time", 0),
        "user_id": user_info.get("user_id"),
        "nickname": user_info.get("nickname"),
        "avatar": user_info.get("avatar"),
        "liked_count": interact_info.get("liked_count"),
        "collected_count": interact_info.get("collected_count"),
        "comment_count": interact_info.get("comment_count"),
        "share_count": interact_info.get("share_count"),
        "ip_location": note_item.get("ip_location", ""),
        # Store cover image URLs as one comma-joined TEXT column.
        "image_list": ','.join([img.get('url', '') for img in image_list]),
        "last_modify_ts": utils.get_current_timestamp(),
    }
    # Use the project logger instead of bare print() for consistency.
    utils.logger.info(f"xhs note: {local_db_item}")
    if config.IS_SAVED_DATABASED:
        # exists() instead of first(): consistent with models/douyin.py and
        # avoids fetching the whole row just to test presence.
        if not await XHSNote.filter(note_id=note_id).exists():
            local_db_item["add_ts"] = utils.get_current_timestamp()
            await XHSNote.create(**local_db_item)
        else:
            await XHSNote.filter(note_id=note_id).update(**local_db_item)
async def update_xhs_note_comment(note_id: str, comment_item: Dict):
    """Insert or update one Xiaohongshu note comment.

    Flattens the raw comment payload, then — only when
    config.IS_SAVED_DATABASED is on — upserts it keyed by comment id.

    Args:
        note_id: the note the comment belongs to.
        comment_item: raw comment dict from the Xiaohongshu API.
    """
    user_info = comment_item.get("user_info", {})
    comment_id = comment_item.get("id")
    local_db_item = {
        "comment_id": comment_id,
        "create_time": comment_item.get("create_time"),
        "ip_location": comment_item.get("ip_location"),
        "note_id": note_id,
        "content": comment_item.get("content"),
        "user_id": user_info.get("user_id"),
        "nickname": user_info.get("nickname"),
        "avatar": user_info.get("image"),
        "sub_comment_count": comment_item.get("sub_comment_count"),
        "last_modify_ts": utils.get_current_timestamp(),
    }
    # Use the project logger instead of bare print() for consistency.
    utils.logger.info(f"xhs note comment: {local_db_item}")
    if config.IS_SAVED_DATABASED:
        # exists() instead of first(): consistent with models/douyin.py and
        # avoids fetching the whole row just to test presence.
        if not await XHSNoteComment.filter(comment_id=comment_id).exists():
            local_db_item["add_ts"] = utils.get_current_timestamp()
            await XHSNoteComment.create(**local_db_item)
        else:
            await XHSNoteComment.filter(comment_id=comment_id).update(**local_db_item)

View File

@ -6,3 +6,5 @@ tenacity==8.2.2
tornado==6.3.2
PyExecJS==1.5.1
opencv-python==4.7.0.72
tortoise-orm[asyncmy]==0.19.3
aerich==0.7.2

View File

@ -1,4 +1,5 @@
# -*- coding: utf-8 -*-
from tools import utils
@ -7,7 +8,3 @@ def test_convert_cookies():
cookie_dict = utils.convert_str_cookie_to_dict(xhs_cookies)
assert cookie_dict.get("webId") == "1190c4d3cxxxx125xxx"
assert cookie_dict.get("a1") == "x000101360"
if __name__ == '__main__':
test_convert_cookies()