feat: 增加配置项支持自由选择数据是否保存到关系型数据库中
This commit is contained in:
parent
745e59c875
commit
e75707443b
28
README.md
28
README.md
|
@ -21,10 +21,8 @@
|
|||
- [x] 抖音登录(二维码、手机号、cookies)
|
||||
- [x] 抖音滑块(模拟滑动实现,准确率不太OK)
|
||||
- [x] 支持登录成功后的上下文浏览器环境保留
|
||||
- [x] 数据持久化到硬盘(关系型数据库)
|
||||
|
||||
## 待实现
|
||||
|
||||
- [ ] 数据持久化到硬盘
|
||||
|
||||
## 使用方法
|
||||
|
||||
|
@ -32,9 +30,13 @@
|
|||
`pip install -r requirements.txt`
|
||||
2. 安装playwright浏览器驱动
|
||||
`playwright install`
|
||||
3. 运行爬虫程序
|
||||
3. 是否选择开启保存数据到DB中
|
||||
如果选择开启,则需要配置数据库连接信息,`config/db_config.py` 中的 `IS_SAVED_DATABASED`和`RELATION_DB_URL` 变量
|
||||
<br>再执行 `python db.py` 初始化数据库信息,生成相关的数据库表结构
|
||||
4. 运行爬虫程序
|
||||
`python main.py --platform xhs --lt qrcode`
|
||||
4. 打开小红书扫二维码登录
|
||||
5. 打开对应APP扫二维码登录
|
||||
|
||||
|
||||
## 项目代码结构
|
||||
|
||||
|
@ -67,24 +69,16 @@ MediaCrawler
|
|||
│ ├── help.py # 辅助函数
|
||||
│ └── login.py # 登录实现
|
||||
├── modles
|
||||
│ ├── douyin
|
||||
│ │ └── m_douyin.py
|
||||
│ └── xhs
|
||||
│ └── m_xhs.py
|
||||
│ ├── douyin.py # 抖音数据模型
|
||||
│ └── xiaohongshu.py # 小红书数据模型
|
||||
├── tools
|
||||
│ └── utils.py # 工具函数
|
||||
├── main.py # 程序入口
|
||||
└── recv_sms_notification.py # 短信转发器的HTTP SERVER接口
|
||||
```
|
||||
## 数据持久化
|
||||
|
||||
## 小红书运行截图
|
||||
|
||||
![小红书运行截图](https://s2.loli.net/2023/06/09/PVBe3X5vf4yncrd.gif)
|
||||
|
||||
## 抖音运行截图
|
||||
|
||||
- ![抖音运行截图](https://s2.loli.net/2023/06/25/GXfkeLhpTyNiAqH.gif)
|
||||
|
||||
![数据持久化](https://s2.loli.net/2023/07/24/ZTcGWz8jPAy7b5M.png)
|
||||
|
||||
## 支持一下
|
||||
|
||||
|
|
|
@ -1,2 +1,3 @@
|
|||
from .base_config import *
|
||||
from .account_config import *
|
||||
from .db_config import *
|
||||
|
|
|
@ -2,11 +2,7 @@
|
|||
PLATFORM = "xhs"
|
||||
KEYWORDS = "健身,旅游"
|
||||
LOGIN_TYPE = "qrcode" # qrcode or phone or cookies
|
||||
COOKIES = "web_session=xxxxcfed1566xxxxxxxxxxxxxxxxxxx;" # if platform is xhs, pleas set only web_session cookie attr
|
||||
|
||||
# redis config
|
||||
REDIS_DB_HOST = "redis://127.0.0.1" # your redis host
|
||||
REDIS_DB_PWD = "123456" # your redis password
|
||||
COOKIES = "web_session=xxxxcfed1566xxxxxxxxxxxxxxxxxxx;" # if platform is xhs, pleas set only web_session cookie attr
|
||||
|
||||
# enable ip proxy
|
||||
ENABLE_IP_PROXY = False
|
||||
|
@ -18,7 +14,7 @@ RETRY_INTERVAL = 60 * 30 # 30 minutes
|
|||
HEADLESS = True
|
||||
|
||||
# save login state
|
||||
SAVE_LOGIN_STATE = False
|
||||
SAVE_LOGIN_STATE = True
|
||||
|
||||
# save user data dir
|
||||
USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name
|
||||
|
|
|
@ -0,0 +1,9 @@
|
|||
# redis config
REDIS_DB_HOST = "redis://127.0.0.1" # your redis host
REDIS_DB_PWD = "123456" # your redis password

# mysql config
# Tortoise-ORM connection URL; consumed by db.py via Tortoise.init().
RELATION_DB_URL = "mysql://root:youdbpassword@localhost:3306/media_crawler"

# save data to database option
# When True, crawled notes/videos/comments are persisted through the models
# package; run `python db.py` once beforehand to create the table schemas.
IS_SAVED_DATABASED = True # if you want to save data to database, set True
|
|
@ -0,0 +1,24 @@
|
|||
from tortoise import Tortoise
|
||||
from tortoise import run_async
|
||||
|
||||
from config.db_config import *
|
||||
|
||||
from tools import utils
|
||||
|
||||
|
||||
async def init_db(create_db: bool = False) -> None:
    """Initialize the Tortoise ORM connection for the `models` package.

    Args:
        create_db: when True, ask Tortoise to create the database itself
            before connecting (first-time setup).
    """
    # `_create_db` is a Tortoise-internal flag that triggers CREATE DATABASE.
    init_kwargs = {
        "db_url": RELATION_DB_URL,
        "modules": {'models': ['models']},
        "_create_db": create_db,
    }
    await Tortoise.init(**init_kwargs)


async def init():
    """One-shot bootstrap: create the database and generate all table schemas."""
    await init_db(create_db=True)
    await Tortoise.generate_schemas()
    utils.logger.info("Init DB Success!")


if __name__ == '__main__':
    run_async(init())
|
Binary file not shown.
Before Width: | Height: | Size: 2.4 MiB |
Binary file not shown.
Before Width: | Height: | Size: 2.9 MiB |
6
main.py
6
main.py
|
@ -2,8 +2,8 @@ import sys
|
|||
import asyncio
|
||||
import argparse
|
||||
|
||||
import db
|
||||
import config
|
||||
from tools import utils
|
||||
from base import proxy_account_pool
|
||||
from media_platform.douyin import DouYinCrawler
|
||||
from media_platform.xhs import XiaoHongShuCrawler
|
||||
|
@ -29,6 +29,10 @@ async def main():
|
|||
# init account pool
|
||||
account_pool = proxy_account_pool.create_account_pool()
|
||||
|
||||
# init db
|
||||
if config.IS_SAVED_DATABASED:
|
||||
await db.init_db()
|
||||
|
||||
args = parser.parse_args()
|
||||
crawler = CrawlerFactory().create_crawler(platform=args.platform)
|
||||
crawler.init_config(
|
||||
|
|
|
@ -23,12 +23,12 @@ class DouYinCrawler(AbstractCrawler):
|
|||
dy_client: DOUYINClient
|
||||
|
||||
def __init__(self) -> None:
|
||||
self.browser_context: Optional[BrowserContext] = None # type: ignore
|
||||
self.context_page: Optional[Page] = None # type: ignore
|
||||
self.browser_context: Optional[BrowserContext] = None # type: ignore
|
||||
self.context_page: Optional[Page] = None # type: ignore
|
||||
self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36" # fixed
|
||||
self.index_url = "https://www.douyin.com"
|
||||
self.command_args: Optional[Namespace] = None # type: ignore
|
||||
self.account_pool: Optional[AccountPool] = None # type: ignore
|
||||
self.command_args: Optional[Namespace] = None # type: ignore
|
||||
self.account_pool: Optional[AccountPool] = None # type: ignore
|
||||
|
||||
def init_config(self, **kwargs):
|
||||
for key, value in kwargs.items():
|
||||
|
@ -53,7 +53,7 @@ class DouYinCrawler(AbstractCrawler):
|
|||
self.dy_client = await self.create_douyin_client(httpx_proxy)
|
||||
if not await self.dy_client.ping(browser_context=self.browser_context):
|
||||
login_obj = DouYinLogin(
|
||||
login_type=self.command_args.lt, # type: ignore
|
||||
login_type=self.command_args.lt, # type: ignore
|
||||
login_phone=account_phone,
|
||||
browser_context=self.browser_context,
|
||||
context_page=self.context_page,
|
||||
|
@ -88,27 +88,29 @@ class DouYinCrawler(AbstractCrawler):
|
|||
post_item.get("aweme_mix_info", {}).get("mix_items")[0]
|
||||
except TypeError:
|
||||
continue
|
||||
aweme_list.append(aweme_info.get("aweme_id",""))
|
||||
aweme_list.append(aweme_info.get("aweme_id", ""))
|
||||
await douyin.update_douyin_aweme(aweme_item=aweme_info)
|
||||
utils.logger.info(f"keyword:{keyword}, aweme_list:{aweme_list}")
|
||||
# await self.batch_get_note_comments(aweme_list)
|
||||
await self.batch_get_note_comments(aweme_list)
|
||||
|
||||
async def batch_get_note_comments(self, aweme_list: List[str]):
|
||||
task_list: List[Task] = []
|
||||
_semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||
for aweme_id in aweme_list:
|
||||
task = asyncio.create_task(self.get_comments(aweme_id), name=aweme_id)
|
||||
task = asyncio.create_task(self.get_comments(aweme_id, _semaphore), name=aweme_id)
|
||||
task_list.append(task)
|
||||
await asyncio.wait(task_list)
|
||||
|
||||
async def get_comments(self, aweme_id: str):
|
||||
try:
|
||||
await self.dy_client.get_aweme_all_comments(
|
||||
aweme_id=aweme_id,
|
||||
callback=douyin.batch_update_dy_aweme_comments
|
||||
)
|
||||
utils.logger.info(f"aweme_id: {aweme_id} comments have all been obtained completed ...")
|
||||
except DataFetchError as e:
|
||||
utils.logger.error(f"aweme_id: {aweme_id} get comments failed, error: {e}")
|
||||
async def get_comments(self, aweme_id: str, semaphore: "asyncio.Semaphore"):
|
||||
async with semaphore:
|
||||
try:
|
||||
await self.dy_client.get_aweme_all_comments(
|
||||
aweme_id=aweme_id,
|
||||
callback=douyin.batch_update_dy_aweme_comments
|
||||
)
|
||||
utils.logger.info(f"aweme_id: {aweme_id} comments have all been obtained completed ...")
|
||||
except DataFetchError as e:
|
||||
utils.logger.error(f"aweme_id: {aweme_id} get comments failed, error: {e}")
|
||||
|
||||
def create_proxy_info(self) -> Tuple[Optional[str], Optional[Dict], Optional[str]]:
|
||||
"""Create proxy info for playwright and httpx"""
|
||||
|
@ -116,7 +118,7 @@ class DouYinCrawler(AbstractCrawler):
|
|||
return None, None, None
|
||||
|
||||
# phone: 13012345671 ip_proxy: 111.122.xx.xx1:8888
|
||||
phone, ip_proxy = self.account_pool.get_account() # type: ignore
|
||||
phone, ip_proxy = self.account_pool.get_account() # type: ignore
|
||||
playwright_proxy = {
|
||||
"server": f"{config.IP_PROXY_PROTOCOL}{ip_proxy}",
|
||||
"username": config.IP_PROXY_USER,
|
||||
|
@ -127,7 +129,7 @@ class DouYinCrawler(AbstractCrawler):
|
|||
|
||||
async def create_douyin_client(self, httpx_proxy: Optional[str]) -> DOUYINClient:
|
||||
"""Create douyin client"""
|
||||
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies()) # type: ignore
|
||||
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies()) # type: ignore
|
||||
douyin_client = DOUYINClient(
|
||||
proxies=httpx_proxy,
|
||||
headers={
|
||||
|
@ -152,18 +154,19 @@ class DouYinCrawler(AbstractCrawler):
|
|||
) -> BrowserContext:
|
||||
"""Launch browser and create browser context"""
|
||||
if config.SAVE_LOGIN_STATE:
|
||||
user_data_dir = os.path.join(os.getcwd(), "browser_data", config.USER_DATA_DIR % self.command_args.platform) # type: ignore
|
||||
user_data_dir = os.path.join(os.getcwd(), "browser_data",
|
||||
config.USER_DATA_DIR % self.command_args.platform) # type: ignore
|
||||
browser_context = await chromium.launch_persistent_context(
|
||||
user_data_dir=user_data_dir,
|
||||
accept_downloads=True,
|
||||
headless=headless,
|
||||
proxy=playwright_proxy, # type: ignore
|
||||
proxy=playwright_proxy, # type: ignore
|
||||
viewport={"width": 1920, "height": 1080},
|
||||
user_agent=user_agent
|
||||
) # type: ignore
|
||||
) # type: ignore
|
||||
return browser_context
|
||||
else:
|
||||
browser = await chromium.launch(headless=headless, proxy=playwright_proxy) # type: ignore
|
||||
browser = await chromium.launch(headless=headless, proxy=playwright_proxy) # type: ignore
|
||||
browser_context = await browser.new_context(
|
||||
viewport={"width": 1920, "height": 1080},
|
||||
user_agent=user_agent
|
||||
|
|
|
@ -83,12 +83,15 @@ class XHSClient:
|
|||
async def ping(self) -> bool:
|
||||
"""get a note to check if login state is ok"""
|
||||
utils.logger.info("begin to ping xhs...")
|
||||
note_id = "5e5cb38a000000000100185e"
|
||||
ping_flag = False
|
||||
try:
|
||||
note_card: Dict = await self.get_note_by_id(note_id)
|
||||
return note_card.get("note_id") == note_id
|
||||
except Exception:
|
||||
return False
|
||||
note_card: Dict = await self.get_note_by_keyword(keyword="小红书")
|
||||
if note_card.get("items"):
|
||||
ping_flag = True
|
||||
except Exception as e:
|
||||
utils.logger.error(f"ping xhs failed: {e}")
|
||||
ping_flag = False
|
||||
return ping_flag
|
||||
|
||||
async def update_cookies(self, browser_context: BrowserContext):
|
||||
cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies())
|
||||
|
|
|
@ -15,7 +15,7 @@ from tools import utils
|
|||
from .exception import *
|
||||
from .login import XHSLogin
|
||||
from .client import XHSClient
|
||||
from models import xhs as xhs_model
|
||||
from models import xiaohongshu as xhs_model
|
||||
from base.base_crawler import AbstractCrawler
|
||||
from base.proxy_account_pool import AccountPool
|
||||
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
from .douyin import *
|
||||
from .xiaohongshu import *
|
||||
|
|
@ -0,0 +1,133 @@
|
|||
import json
|
||||
from typing import Dict, List
|
||||
|
||||
from tortoise.models import Model
|
||||
from tortoise import fields
|
||||
|
||||
import config
|
||||
from tools import utils
|
||||
|
||||
|
||||
class DouyinBaseModel(Model):
    """Abstract base for Douyin tables: shared author fields + audit timestamps."""

    id = fields.IntField(pk=True, autoincrement=True, description="自增ID")
    user_id = fields.CharField(max_length=64, null=True, description="用户ID")
    sec_uid = fields.CharField(max_length=128, null=True, description="用户sec_uid")
    short_user_id = fields.CharField(max_length=64, null=True, description="用户短ID")
    user_unique_id = fields.CharField(max_length=64, null=True, description="用户唯一ID")
    nickname = fields.CharField(max_length=64, null=True, description="用户昵称")
    avatar = fields.CharField(max_length=255, null=True, description="用户头像地址")
    user_signature = fields.CharField(max_length=500, null=True, description="用户签名")
    ip_location = fields.CharField(max_length=255, null=True, description="评论时的IP地址")
    add_ts = fields.BigIntField(description="记录添加时间戳")
    last_modify_ts = fields.BigIntField(description="记录最后修改时间戳")

    class Meta:
        # Abstract: no table is generated for this model itself.
        abstract = True
|
||||
|
||||
|
||||
class DouyinAweme(DouyinBaseModel):
    """A single Douyin short video (aweme) record."""

    aweme_id = fields.CharField(max_length=64, index=True, description="视频ID")
    aweme_type = fields.CharField(max_length=16, description="视频类型")
    title = fields.CharField(max_length=500, null=True, description="视频标题")
    desc = fields.TextField(null=True, description="视频描述")
    create_time = fields.BigIntField(index=True, description="视频发布时间戳")
    liked_count = fields.CharField(max_length=16, null=True, description="视频点赞数")
    comment_count = fields.CharField(max_length=16, null=True, description="视频评论数")
    share_count = fields.CharField(max_length=16, null=True, description="视频分享数")

    class Meta:
        table = "douyin_aweme"
        table_description = "抖音视频"

    def __str__(self) -> str:
        return f"{self.aweme_id} - {self.title}"
|
||||
|
||||
|
||||
class DouyinAwemeComment(DouyinBaseModel):
    """A comment attached to a Douyin video."""

    comment_id = fields.CharField(max_length=64, index=True, description="评论ID")
    aweme_id = fields.CharField(max_length=64, index=True, description="视频ID")
    content = fields.TextField(null=True, description="评论内容")
    create_time = fields.BigIntField(description="评论时间戳")
    sub_comment_count = fields.CharField(max_length=16, description="评论回复数")

    class Meta:
        table = "douyin_aweme_comment"
        table_description = "抖音视频评论"

    def __str__(self) -> str:
        return f"{self.comment_id} - {self.content}"
|
||||
|
||||
|
||||
async def update_douyin_aweme(aweme_item: Dict):
    """Insert or update one Douyin video (aweme) record.

    Args:
        aweme_item: raw aweme payload dict as returned by the Douyin search API.
    """
    aweme_id = aweme_item.get("aweme_id")
    user_info = aweme_item.get("author", {})
    interact_info = aweme_item.get("statistics", {})
    local_db_item = {
        "aweme_id": aweme_id,
        "aweme_type": aweme_item.get("aweme_type"),
        "title": aweme_item.get("desc", ""),  # Douyin has no separate title; reuse desc
        "desc": aweme_item.get("desc", ""),
        "create_time": aweme_item.get("create_time"),
        "user_id": user_info.get("uid"),
        "sec_uid": user_info.get("sec_uid"),
        "short_user_id": user_info.get("short_id"),
        "user_unique_id": user_info.get("unique_id"),
        "user_signature": user_info.get("signature"),
        "nickname": user_info.get("nickname"),
        "avatar": user_info.get("avatar_thumb", {}).get("url_list", [""])[0],
        "liked_count": interact_info.get("digg_count"),
        "collected_count": interact_info.get("collect_count"),
        "comment_count": interact_info.get("comment_count"),
        "share_count": interact_info.get("share_count"),
        "ip_location": aweme_item.get("ip_label", ""),
        "last_modify_ts": utils.get_current_timestamp(),
    }
    # Use the project logger instead of bare print, consistent with the rest of the file.
    utils.logger.info(f"douyin aweme id:{aweme_id}, title:{local_db_item.get('title')}")
    if config.IS_SAVED_DATABASED:
        # DouyinAweme declares no `collected_count` column, so strip it before
        # handing the dict to Tortoise — an unknown field makes create()/update() raise.
        db_item = {k: v for k, v in local_db_item.items() if k != "collected_count"}
        if not await DouyinAweme.filter(aweme_id=aweme_id).exists():
            db_item["add_ts"] = utils.get_current_timestamp()
            await DouyinAweme.create(**db_item)
        else:
            await DouyinAweme.filter(aweme_id=aweme_id).update(**db_item)
|
||||
|
||||
|
||||
async def batch_update_dy_aweme_comments(aweme_id: str, comments: List[Dict]):
    """Persist a batch of comments for one video, sequentially, one item at a time."""
    if not comments:
        return
    for comment in comments:
        await update_dy_aweme_comment(aweme_id, comment)
|
||||
|
||||
|
||||
async def update_dy_aweme_comment(aweme_id: str, comment_item: Dict):
    """Insert or update one Douyin comment record.

    Skips the item when its embedded aweme_id does not match the video the
    caller is processing (defensive consistency check).

    Args:
        aweme_id: id of the video being processed.
        comment_item: raw comment payload dict from the comment API.
    """
    comment_aweme_id = comment_item.get("aweme_id")
    if aweme_id != comment_aweme_id:
        # Mismatched payload — log and skip rather than persisting bad data.
        utils.logger.warning(f"comment_aweme_id: {comment_aweme_id} != aweme_id: {aweme_id}")
        return
    user_info = comment_item.get("user", {})
    comment_id = comment_item.get("cid")
    # Fall back through the avatar size variants until one is present.
    avatar_info = user_info.get("avatar_medium", {}) or user_info.get("avatar_300x300", {}) or user_info.get(
        "avatar_168x168", {}) or user_info.get("avatar_thumb", {}) or {}
    local_db_item = {
        "comment_id": comment_id,
        "create_time": comment_item.get("create_time"),
        "ip_location": comment_item.get("ip_label", ""),
        "aweme_id": aweme_id,
        "content": comment_item.get("text"),
        "content_extra": json.dumps(comment_item.get("text_extra", [])),
        "user_id": user_info.get("uid"),
        "sec_uid": user_info.get("sec_uid"),
        "short_user_id": user_info.get("short_id"),
        "user_unique_id": user_info.get("unique_id"),
        "user_signature": user_info.get("signature"),
        "nickname": user_info.get("nickname"),
        "avatar": avatar_info.get("url_list", [""])[0],
        "sub_comment_count": comment_item.get("reply_comment_total", 0),
        "last_modify_ts": utils.get_current_timestamp(),
    }
    # Use the project logger instead of bare print, consistent with the rest of the file.
    utils.logger.info(f"douyin aweme comment: {comment_id}, content: {local_db_item.get('content')}")
    if config.IS_SAVED_DATABASED:
        # DouyinAwemeComment declares no `content_extra` column; strip it before
        # persisting — an unknown field makes Tortoise's create()/update() raise.
        db_item = {k: v for k, v in local_db_item.items() if k != "content_extra"}
        if not await DouyinAwemeComment.filter(comment_id=comment_id).exists():
            db_item["add_ts"] = utils.get_current_timestamp()
            await DouyinAwemeComment.create(**db_item)
        else:
            await DouyinAwemeComment.filter(comment_id=comment_id).update(**db_item)
|
|
@ -1 +0,0 @@
|
|||
from .m_douyin import *
|
|
@ -1,64 +0,0 @@
|
|||
import json
|
||||
from typing import Dict, List
|
||||
|
||||
from tools import utils
|
||||
|
||||
|
||||
async def update_douyin_aweme(aweme_item: Dict):
|
||||
aweme_id = aweme_item.get("aweme_id")
|
||||
user_info = aweme_item.get("author", {})
|
||||
local_db_item = {
|
||||
"aweme_id": aweme_id,
|
||||
"aweme_type": aweme_item.get("aweme_type"),
|
||||
"title": aweme_item.get("desc", ""),
|
||||
"desc": aweme_item.get("desc", ""),
|
||||
"create_time": aweme_item.get("create_time"),
|
||||
"user_id": user_info.get("uid"),
|
||||
"sec_uid": user_info.get("sec_uid"),
|
||||
"short_user_id": user_info.get("short_id"),
|
||||
"user_unique_id": user_info.get("unique_id"),
|
||||
"user_signature": user_info.get("signature"),
|
||||
"nickname": user_info.get("nickname"),
|
||||
"avatar": user_info.get("avatar_thumb", {}).get("url_list", [""])[0],
|
||||
"ip_location": aweme_item.get("ip_label", ""),
|
||||
"last_modify_ts": utils.get_current_timestamp(),
|
||||
}
|
||||
# do something ...
|
||||
print(f"douyin aweme id:{aweme_id}, title:{local_db_item.get('title')}")
|
||||
|
||||
|
||||
async def batch_update_dy_aweme_comments(aweme_id: str, comments: List[Dict]):
|
||||
if not comments:
|
||||
return
|
||||
for comment_item in comments:
|
||||
await update_dy_aweme_comment(aweme_id, comment_item)
|
||||
|
||||
|
||||
async def update_dy_aweme_comment(aweme_id: str, comment_item: Dict):
|
||||
comment_aweme_id = comment_item.get("aweme_id")
|
||||
if aweme_id != comment_aweme_id:
|
||||
print(f"comment_aweme_id: {comment_aweme_id} != aweme_id: {aweme_id}")
|
||||
return
|
||||
user_info = comment_item.get("user", {})
|
||||
comment_id = comment_item.get("cid")
|
||||
avatar_info = user_info.get("avatar_medium", {}) or user_info.get("avatar_300x300", {}) or user_info.get(
|
||||
"avatar_168x168", {}) or user_info.get("avatar_thumb", {}) or {}
|
||||
local_db_item = {
|
||||
"comment_id": comment_id,
|
||||
"create_time": comment_item.get("create_time"),
|
||||
"ip_location": comment_item.get("ip_label", ""),
|
||||
"aweme_id": aweme_id,
|
||||
"content": comment_item.get("text"),
|
||||
"content_extra": json.dumps(comment_item.get("text_extra", [])),
|
||||
"user_id": user_info.get("uid"),
|
||||
"sec_uid": user_info.get("sec_uid"),
|
||||
"short_user_id": user_info.get("short_id"),
|
||||
"user_unique_id": user_info.get("unique_id"),
|
||||
"user_signature": user_info.get("signature"),
|
||||
"nickname": user_info.get("nickname"),
|
||||
"avatar": avatar_info.get("url_list", [""])[0],
|
||||
"sub_comment_count": comment_item.get("reply_comment_total", 0),
|
||||
"last_modify_ts": utils.get_current_timestamp(),
|
||||
}
|
||||
# do something ...
|
||||
print(f"douyin aweme comment: {comment_id}, content: {local_db_item.get('content')}")
|
|
@ -1 +0,0 @@
|
|||
from .m_xhs import *
|
|
@ -1,46 +0,0 @@
|
|||
from typing import Dict, List
|
||||
|
||||
from tools import utils
|
||||
|
||||
|
||||
async def update_xhs_note(note_item: Dict):
|
||||
note_id = note_item.get("note_id")
|
||||
user_info = note_item.get("user", {})
|
||||
interact_info = note_item.get("interact_info")
|
||||
image_list: List[Dict]= note_item.get("image_list", [])
|
||||
|
||||
local_db_item = {
|
||||
"note_id": note_item.get("note_id"),
|
||||
"type": note_item.get("type"),
|
||||
"title": note_item.get("title"),
|
||||
"desc": note_item.get("desc", ""),
|
||||
"time": note_item.get("time"),
|
||||
"last_update_time": note_item.get("last_update_time", 0),
|
||||
"user_id": user_info.get("user_id"),
|
||||
"nickname": user_info.get("nickname"),
|
||||
"avatar": user_info.get("avatar"),
|
||||
"ip_location": note_item.get("ip_location", ""),
|
||||
"image_list": ','.join([img.get('url','') for img in image_list]),
|
||||
"last_modify_ts": utils.get_current_timestamp(),
|
||||
}
|
||||
# do something ...
|
||||
print("xhs note:", local_db_item)
|
||||
|
||||
|
||||
async def update_xhs_note_comment(note_id: str, comment_item: Dict):
|
||||
user_info = comment_item.get("user_info", {})
|
||||
comment_id = comment_item.get("id")
|
||||
local_db_item = {
|
||||
"comment_id": comment_id,
|
||||
"create_time": comment_item.get("create_time"),
|
||||
"ip_location": comment_item.get("ip_location"),
|
||||
"note_id": note_id,
|
||||
"content": comment_item.get("content"),
|
||||
"user_id": user_info.get("user_id"),
|
||||
"nickname": user_info.get("nickname"),
|
||||
"avatar": user_info.get("image"),
|
||||
"sub_comment_count": comment_item.get("sub_comment_count"),
|
||||
"last_modify_ts": utils.get_current_timestamp(),
|
||||
}
|
||||
# do something ...
|
||||
print("xhs note comment:", local_db_item)
|
|
@ -0,0 +1,113 @@
|
|||
from typing import List, Dict
|
||||
|
||||
from tortoise.models import Model
|
||||
from tortoise import fields
|
||||
|
||||
import config
|
||||
from tools import utils
|
||||
|
||||
|
||||
class XhsBaseModel(Model):
    """Abstract base for XiaoHongShu tables: shared author fields + audit timestamps."""

    id = fields.IntField(pk=True, autoincrement=True, description="自增ID")
    user_id = fields.CharField(max_length=64, description="用户ID")
    nickname = fields.CharField(max_length=64, null=True, description="用户昵称")
    avatar = fields.CharField(max_length=255, null=True, description="用户头像地址")
    ip_location = fields.CharField(max_length=255, null=True, description="评论时的IP地址")
    add_ts = fields.BigIntField(description="记录添加时间戳")
    last_modify_ts = fields.BigIntField(description="记录最后修改时间戳")

    class Meta:
        # Abstract: no table is generated for this model itself.
        abstract = True
|
||||
|
||||
|
||||
class XHSNote(XhsBaseModel):
    """A single XiaoHongShu note (post) record."""

    note_id = fields.CharField(max_length=64, index=True, description="笔记ID")
    type = fields.CharField(max_length=16, null=True, description="笔记类型(normal | video)")
    title = fields.CharField(max_length=255, null=True, description="笔记标题")
    desc = fields.TextField(null=True, description="笔记描述")
    time = fields.BigIntField(index=True, description="笔记发布时间戳")
    last_update_time = fields.BigIntField(description="笔记最后更新时间戳")
    liked_count = fields.CharField(max_length=16, null=True, description="笔记点赞数")
    collected_count = fields.CharField(max_length=16, null=True, description="笔记收藏数")
    comment_count = fields.CharField(max_length=16, null=True, description="笔记评论数")
    share_count = fields.CharField(max_length=16, null=True, description="笔记分享数")
    image_list = fields.TextField(null=True, description="笔记封面图片列表")

    class Meta:
        table = "xhs_note"
        table_description = "小红书笔记"

    def __str__(self) -> str:
        return f"{self.note_id} - {self.title}"
|
||||
|
||||
|
||||
class XHSNoteComment(XhsBaseModel):
    """A comment attached to a XiaoHongShu note."""

    comment_id = fields.CharField(max_length=64, index=True, description="评论ID")
    create_time = fields.BigIntField(index=True, description="评论时间戳")
    note_id = fields.CharField(max_length=64, description="笔记ID")
    content = fields.TextField(description="评论内容")
    sub_comment_count = fields.IntField(description="子评论数量")

    class Meta:
        table = "xhs_note_comment"
        table_description = "小红书笔记评论"

    def __str__(self) -> str:
        return f"{self.comment_id} - {self.content}"
|
||||
|
||||
|
||||
async def update_xhs_note(note_item: Dict):
    """Insert or update one XiaoHongShu note record.

    Args:
        note_item: raw note payload dict from the XHS API.
    """
    note_id = note_item.get("note_id")
    user_info = note_item.get("user", {})
    interact_info = note_item.get("interact_info", {})
    image_list: List[Dict] = note_item.get("image_list", [])

    local_db_item = {
        "note_id": note_item.get("note_id"),
        "type": note_item.get("type"),
        "title": note_item.get("title"),
        "desc": note_item.get("desc", ""),
        "time": note_item.get("time"),
        "last_update_time": note_item.get("last_update_time", 0),
        "user_id": user_info.get("user_id"),
        "nickname": user_info.get("nickname"),
        "avatar": user_info.get("avatar"),
        "liked_count": interact_info.get("liked_count"),
        "collected_count": interact_info.get("collected_count"),
        "comment_count": interact_info.get("comment_count"),
        "share_count": interact_info.get("share_count"),
        "ip_location": note_item.get("ip_location", ""),
        # Cover images are flattened into a comma-separated URL string.
        "image_list": ','.join([img.get('url', '') for img in image_list]),
        "last_modify_ts": utils.get_current_timestamp(),
    }
    # Use the project logger instead of bare print, consistent with the rest of the project.
    utils.logger.info(f"xhs note: {local_db_item}")
    if config.IS_SAVED_DATABASED:
        # exists() avoids fetching the whole row just to test presence,
        # and matches how the Douyin model code performs the same check.
        if not await XHSNote.filter(note_id=note_id).exists():
            local_db_item["add_ts"] = utils.get_current_timestamp()
            await XHSNote.create(**local_db_item)
        else:
            await XHSNote.filter(note_id=note_id).update(**local_db_item)
|
||||
|
||||
|
||||
async def update_xhs_note_comment(note_id: str, comment_item: Dict):
    """Insert or update one XiaoHongShu note comment record.

    Args:
        note_id: id of the note the comment belongs to.
        comment_item: raw comment payload dict from the XHS API.
    """
    user_info = comment_item.get("user_info", {})
    comment_id = comment_item.get("id")
    local_db_item = {
        "comment_id": comment_id,
        "create_time": comment_item.get("create_time"),
        "ip_location": comment_item.get("ip_location"),
        "note_id": note_id,
        "content": comment_item.get("content"),
        "user_id": user_info.get("user_id"),
        "nickname": user_info.get("nickname"),
        "avatar": user_info.get("image"),
        "sub_comment_count": comment_item.get("sub_comment_count"),
        "last_modify_ts": utils.get_current_timestamp(),
    }
    # Use the project logger instead of bare print, consistent with the rest of the project.
    utils.logger.info(f"xhs note comment: {local_db_item}")
    if config.IS_SAVED_DATABASED:
        # exists() avoids fetching the whole row just to test presence,
        # and matches how the Douyin model code performs the same check.
        if not await XHSNoteComment.filter(comment_id=comment_id).exists():
            local_db_item["add_ts"] = utils.get_current_timestamp()
            await XHSNoteComment.create(**local_db_item)
        else:
            await XHSNoteComment.filter(comment_id=comment_id).update(**local_db_item)
|
|
@ -6,3 +6,5 @@ tenacity==8.2.2
|
|||
tornado==6.3.2
|
||||
PyExecJS==1.5.1
|
||||
opencv-python==4.7.0.72
|
||||
tortoise-orm[asyncmy]==0.19.3
|
||||
aerich==0.7.2
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
from tools import utils
|
||||
|
||||
|
||||
|
@ -7,7 +8,3 @@ def test_convert_cookies():
|
|||
cookie_dict = utils.convert_str_cookie_to_dict(xhs_cookies)
|
||||
assert cookie_dict.get("webId") == "1190c4d3cxxxx125xxx"
|
||||
assert cookie_dict.get("a1") == "x000101360"
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
test_convert_cookies()
|
Loading…
Reference in New Issue