feat: 增加配置项支持自由选择数据是否保存到关系型数据库中 (feat: add a config option so users can choose whether crawled data is saved to a relational database)

parent 745e59c875 · commit e75707443b

README.md (28 lines changed)
@@ -21,10 +21,8 @@
 - [x] 抖音登录(二维码、手机号、cookies)
 - [x] 抖音滑块(模拟滑动实现,准确率不太OK)
 - [x] 支持登录成功后的上下文浏览器环境保留
+- [x] 数据持久化到硬盘(关系型数据库)

-## 待实现
-
-- [ ] 数据持久化到硬盘

 ## 使用方法
@@ -32,9 +30,13 @@
    `pip install -r requirements.txt`
 2. 安装playwright浏览器驱动
    `playwright install`
-3. 运行爬虫程序
+3. 是否选择开启保存数据到DB中
+   如果选择开启,则需要配置数据库连接信息,`config/db_config.py` 中的 `IS_SAVED_DATABASED` 和 `RELATION_DB_URL` 变量
+   <br>再执行 `python db.py` 初始化数据库信息,生成相关的数据库表结构
+4. 运行爬虫程序
    `python main.py --platform xhs --lt qrcode`
-4. 打开小红书扫二维码登录
+5. 打开对应APP扫二维码登录

 ## 项目代码结构
@@ -67,24 +69,16 @@ MediaCrawler
 │   ├── help.py                  # 辅助函数
 │   └── login.py                 # 登录实现
 ├── modles
-│   ├── douyin
-│   │   └── m_douyin.py
-│   └── xhs
-│       └── m_xhs.py
+│   ├── douyin.py                # 抖音数据模型
+│   └── xiaohongshu.py           # 小红书数据模型
 ├── tools
 │   └── utils.py                 # 工具函数
 ├── main.py                      # 程序入口
 └── recv_sms_notification.py     # 短信转发器的HTTP SERVER接口
 ```

+## 数据持久化
+
+![数据持久化](https://s2.loli.net/2023/07/24/ZTcGWz8jPAy7b5M.png)
+
-## 小红书运行截图
-
-![小红书运行截图](https://s2.loli.net/2023/06/09/PVBe3X5vf4yncrd.gif)
-
-## 抖音运行截图
-
-- ![抖音运行截图](https://s2.loli.net/2023/06/25/GXfkeLhpTyNiAqH.gif)
-
 ## 支持一下
config/__init__.py

@@ -1,2 +1,3 @@
 from .base_config import *
 from .account_config import *
+from .db_config import *
config/base_config.py

@@ -2,11 +2,7 @@
 PLATFORM = "xhs"
 KEYWORDS = "健身,旅游"
 LOGIN_TYPE = "qrcode"  # qrcode or phone or cookies
 COOKIES = "web_session=xxxxcfed1566xxxxxxxxxxxxxxxxxxx;"  # if platform is xhs, pleas set only web_session cookie attr
-
-# redis config
-REDIS_DB_HOST = "redis://127.0.0.1"  # your redis host
-REDIS_DB_PWD = "123456"  # your redis password

 # enable ip proxy
 ENABLE_IP_PROXY = False

@@ -18,7 +14,7 @@ RETRY_INTERVAL = 60 * 30  # 30 minutes
 HEADLESS = True

 # save login state
-SAVE_LOGIN_STATE = False
+SAVE_LOGIN_STATE = True

 # save user data dir
 USER_DATA_DIR = "%s_user_data_dir"  # %s will be replaced by platform name
config/db_config.py (new file)

@@ -0,0 +1,9 @@
# redis config
REDIS_DB_HOST = "redis://127.0.0.1"  # your redis host
REDIS_DB_PWD = "123456"  # your redis password

# mysql config
RELATION_DB_URL = "mysql://root:youdbpassword@localhost:3306/media_crawler"

# save data to database option
IS_SAVED_DATABASED = True  # if you want to save data to database, set True
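Since `config/__init__.py` now re-exports `db_config` with a star import, these settings are available directly on the `config` package. A minimal sketch of how downstream code can read them (the connection string is the placeholder from above, not a real credential):

```python
import config

# Both names come from config/db_config.py via the star re-export in config/__init__.py.
if config.IS_SAVED_DATABASED:
    print(f"persisting crawled data to: {config.RELATION_DB_URL}")
else:
    print("database persistence disabled; data is only printed/logged")
```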
db.py (new file)

@@ -0,0 +1,24 @@
from tortoise import Tortoise
from tortoise import run_async

from config.db_config import *

from tools import utils


async def init_db(create_db: bool = False) -> None:
    await Tortoise.init(
        db_url=RELATION_DB_URL,
        modules={'models': ['models']},
        _create_db=create_db
    )


async def init():
    await init_db(create_db=True)
    await Tortoise.generate_schemas()
    utils.logger.info("Init DB Success!")


if __name__ == '__main__':
    run_async(init())
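After `python db.py` has created the schema, the same `init_db` helper can be reused to connect without re-creating the database. A rough, hypothetical smoke test (the query target `XHSNote` comes from the new models package introduced later in this commit):

```python
import asyncio

from tortoise import Tortoise

import db
from models import XHSNote  # re-exported by models/__init__.py


async def check() -> None:
    await db.init_db()                        # connect only; create_db defaults to False
    note_count = await XHSNote.all().count()  # trivial query to confirm the table exists
    print(f"xhs_note rows: {note_count}")
    await Tortoise.close_connections()        # release the connection pool cleanly


if __name__ == "__main__":
    asyncio.run(check())
```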
(Two binary image files deleted; 2.4 MiB and 2.9 MiB. Contents not shown.)
main.py (6 lines changed)

@@ -2,8 +2,8 @@ import sys
 import asyncio
 import argparse

+import db
 import config
-from tools import utils
 from base import proxy_account_pool
 from media_platform.douyin import DouYinCrawler
 from media_platform.xhs import XiaoHongShuCrawler

@@ -29,6 +29,10 @@ async def main():
     # init account pool
     account_pool = proxy_account_pool.create_account_pool()

+    # init db
+    if config.IS_SAVED_DATABASED:
+        await db.init_db()
+
     args = parser.parse_args()
     crawler = CrawlerFactory().create_crawler(platform=args.platform)
     crawler.init_config(
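The hunk above only opens the Tortoise connection pool; nothing in this commit closes it explicitly. A possible companion helper (not part of this commit) that could be awaited when `main()` finishes, reusing the same flag:

```python
from tortoise import Tortoise

import config


async def shutdown_db() -> None:
    # Close pooled connections opened by db.init_db(); safe to call once at exit.
    if config.IS_SAVED_DATABASED:
        await Tortoise.close_connections()
```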
DouYinCrawler (media_platform/douyin)

@@ -23,12 +23,12 @@ class DouYinCrawler(AbstractCrawler):
     dy_client: DOUYINClient

     def __init__(self) -> None:
         self.browser_context: Optional[BrowserContext] = None  # type: ignore
         self.context_page: Optional[Page] = None  # type: ignore
         self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"  # fixed
         self.index_url = "https://www.douyin.com"
         self.command_args: Optional[Namespace] = None  # type: ignore
         self.account_pool: Optional[AccountPool] = None  # type: ignore

     def init_config(self, **kwargs):
         for key, value in kwargs.items():

@@ -53,7 +53,7 @@ class DouYinCrawler(AbstractCrawler):
         self.dy_client = await self.create_douyin_client(httpx_proxy)
         if not await self.dy_client.ping(browser_context=self.browser_context):
             login_obj = DouYinLogin(
                 login_type=self.command_args.lt,  # type: ignore
                 login_phone=account_phone,
                 browser_context=self.browser_context,
                 context_page=self.context_page,
@@ -88,27 +88,29 @@ class DouYinCrawler(AbstractCrawler):
                     post_item.get("aweme_mix_info", {}).get("mix_items")[0]
                 except TypeError:
                     continue
-                aweme_list.append(aweme_info.get("aweme_id",""))
+                aweme_list.append(aweme_info.get("aweme_id", ""))
                 await douyin.update_douyin_aweme(aweme_item=aweme_info)
             utils.logger.info(f"keyword:{keyword}, aweme_list:{aweme_list}")
-            # await self.batch_get_note_comments(aweme_list)
+            await self.batch_get_note_comments(aweme_list)

     async def batch_get_note_comments(self, aweme_list: List[str]):
         task_list: List[Task] = []
+        _semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
         for aweme_id in aweme_list:
-            task = asyncio.create_task(self.get_comments(aweme_id), name=aweme_id)
+            task = asyncio.create_task(self.get_comments(aweme_id, _semaphore), name=aweme_id)
             task_list.append(task)
         await asyncio.wait(task_list)

-    async def get_comments(self, aweme_id: str):
-        try:
-            await self.dy_client.get_aweme_all_comments(
-                aweme_id=aweme_id,
-                callback=douyin.batch_update_dy_aweme_comments
-            )
-            utils.logger.info(f"aweme_id: {aweme_id} comments have all been obtained completed ...")
-        except DataFetchError as e:
-            utils.logger.error(f"aweme_id: {aweme_id} get comments failed, error: {e}")
+    async def get_comments(self, aweme_id: str, semaphore: "asyncio.Semaphore"):
+        async with semaphore:
+            try:
+                await self.dy_client.get_aweme_all_comments(
+                    aweme_id=aweme_id,
+                    callback=douyin.batch_update_dy_aweme_comments
+                )
+                utils.logger.info(f"aweme_id: {aweme_id} comments have all been obtained completed ...")
+            except DataFetchError as e:
+                utils.logger.error(f"aweme_id: {aweme_id} get comments failed, error: {e}")

     def create_proxy_info(self) -> Tuple[Optional[str], Optional[Dict], Optional[str]]:
         """Create proxy info for playwright and httpx"""
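The semaphore added above caps how many comment-fetch coroutines run at once. A standalone sketch of the same bounded-concurrency pattern, where `MAX_CONCURRENCY_NUM` and the sleep stand in for `config.MAX_CONCURRENCY_NUM` and the real client call:

```python
import asyncio
from typing import List

MAX_CONCURRENCY_NUM = 4  # assumed value; the project reads this from config


async def fetch_comments(aweme_id: str, semaphore: asyncio.Semaphore) -> None:
    async with semaphore:          # at most MAX_CONCURRENCY_NUM tasks pass this point together
        await asyncio.sleep(0.1)   # stand-in for dy_client.get_aweme_all_comments(...)
        print(f"done {aweme_id}")


async def run(aweme_ids: List[str]) -> None:
    semaphore = asyncio.Semaphore(MAX_CONCURRENCY_NUM)
    tasks = [asyncio.create_task(fetch_comments(aid, semaphore), name=aid) for aid in aweme_ids]
    await asyncio.wait(tasks)


asyncio.run(run([str(i) for i in range(10)]))
```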
@@ -116,7 +118,7 @@ class DouYinCrawler(AbstractCrawler):
             return None, None, None

         # phone: 13012345671 ip_proxy: 111.122.xx.xx1:8888
         phone, ip_proxy = self.account_pool.get_account()  # type: ignore
         playwright_proxy = {
             "server": f"{config.IP_PROXY_PROTOCOL}{ip_proxy}",
             "username": config.IP_PROXY_USER,

@@ -127,7 +129,7 @@ class DouYinCrawler(AbstractCrawler):
     async def create_douyin_client(self, httpx_proxy: Optional[str]) -> DOUYINClient:
         """Create douyin client"""
         cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())  # type: ignore
         douyin_client = DOUYINClient(
             proxies=httpx_proxy,
             headers={

@@ -152,18 +154,19 @@ class DouYinCrawler(AbstractCrawler):
     ) -> BrowserContext:
         """Launch browser and create browser context"""
         if config.SAVE_LOGIN_STATE:
-            user_data_dir = os.path.join(os.getcwd(), "browser_data", config.USER_DATA_DIR % self.command_args.platform)  # type: ignore
+            user_data_dir = os.path.join(os.getcwd(), "browser_data",
+                                         config.USER_DATA_DIR % self.command_args.platform)  # type: ignore
             browser_context = await chromium.launch_persistent_context(
                 user_data_dir=user_data_dir,
                 accept_downloads=True,
                 headless=headless,
                 proxy=playwright_proxy,  # type: ignore
                 viewport={"width": 1920, "height": 1080},
                 user_agent=user_agent
             )  # type: ignore
             return browser_context
         else:
             browser = await chromium.launch(headless=headless, proxy=playwright_proxy)  # type: ignore
             browser_context = await browser.new_context(
                 viewport={"width": 1920, "height": 1080},
                 user_agent=user_agent
XHSClient (media_platform/xhs)

@@ -83,12 +83,15 @@ class XHSClient:
     async def ping(self) -> bool:
         """get a note to check if login state is ok"""
         utils.logger.info("begin to ping xhs...")
-        note_id = "5e5cb38a000000000100185e"
+        ping_flag = False
         try:
-            note_card: Dict = await self.get_note_by_id(note_id)
-            return note_card.get("note_id") == note_id
-        except Exception:
-            return False
+            note_card: Dict = await self.get_note_by_keyword(keyword="小红书")
+            if note_card.get("items"):
+                ping_flag = True
+        except Exception as e:
+            utils.logger.error(f"ping xhs failed: {e}")
+            ping_flag = False
+        return ping_flag

     async def update_cookies(self, browser_context: BrowserContext):
         cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies())
XiaoHongShuCrawler (media_platform/xhs)

@@ -15,7 +15,7 @@ from tools import utils
 from .exception import *
 from .login import XHSLogin
 from .client import XHSClient
-from models import xhs as xhs_model
+from models import xiaohongshu as xhs_model
 from base.base_crawler import AbstractCrawler
 from base.proxy_account_pool import AccountPool
models/__init__.py (new file)

@@ -0,0 +1,3 @@
from .douyin import *
from .xiaohongshu import *
models/douyin.py (new file)

@@ -0,0 +1,133 @@
import json
from typing import Dict, List

from tortoise.models import Model
from tortoise import fields

import config
from tools import utils


class DouyinBaseModel(Model):
    id = fields.IntField(pk=True, autoincrement=True, description="自增ID")
    user_id = fields.CharField(null=True, max_length=64, description="用户ID")
    sec_uid = fields.CharField(null=True, max_length=128, description="用户sec_uid")
    short_user_id = fields.CharField(null=True, max_length=64, description="用户短ID")
    user_unique_id = fields.CharField(null=True, max_length=64, description="用户唯一ID")
    nickname = fields.CharField(null=True, max_length=64, description="用户昵称")
    avatar = fields.CharField(null=True, max_length=255, description="用户头像地址")
    user_signature = fields.CharField(null=True, max_length=500, description="用户签名")
    ip_location = fields.CharField(null=True, max_length=255, description="评论时的IP地址")
    add_ts = fields.BigIntField(description="记录添加时间戳")
    last_modify_ts = fields.BigIntField(description="记录最后修改时间戳")

    class Meta:
        abstract = True


class DouyinAweme(DouyinBaseModel):
    aweme_id = fields.CharField(max_length=64, index=True, description="视频ID")
    aweme_type = fields.CharField(max_length=16, description="视频类型")
    title = fields.CharField(null=True, max_length=500, description="视频标题")
    desc = fields.TextField(null=True, description="视频描述")
    create_time = fields.BigIntField(description="视频发布时间戳", index=True)
    liked_count = fields.CharField(null=True, max_length=16, description="视频点赞数")
    comment_count = fields.CharField(null=True, max_length=16, description="视频评论数")
    share_count = fields.CharField(null=True, max_length=16, description="视频分享数")

    class Meta:
        table = "douyin_aweme"
        table_description = "抖音视频"

    def __str__(self):
        return f"{self.aweme_id} - {self.title}"


class DouyinAwemeComment(DouyinBaseModel):
    comment_id = fields.CharField(max_length=64, index=True, description="评论ID")
    aweme_id = fields.CharField(max_length=64, index=True, description="视频ID")
    content = fields.TextField(null=True, description="评论内容")
    create_time = fields.BigIntField(description="评论时间戳")
    sub_comment_count = fields.CharField(max_length=16, description="评论回复数")

    class Meta:
        table = "douyin_aweme_comment"
        table_description = "抖音视频评论"

    def __str__(self):
        return f"{self.comment_id} - {self.content}"


async def update_douyin_aweme(aweme_item: Dict):
    aweme_id = aweme_item.get("aweme_id")
    user_info = aweme_item.get("author", {})
    interact_info = aweme_item.get("statistics", {})
    local_db_item = {
        "aweme_id": aweme_id,
        "aweme_type": aweme_item.get("aweme_type"),
        "title": aweme_item.get("desc", ""),
        "desc": aweme_item.get("desc", ""),
        "create_time": aweme_item.get("create_time"),
        "user_id": user_info.get("uid"),
        "sec_uid": user_info.get("sec_uid"),
        "short_user_id": user_info.get("short_id"),
        "user_unique_id": user_info.get("unique_id"),
        "user_signature": user_info.get("signature"),
        "nickname": user_info.get("nickname"),
        "avatar": user_info.get("avatar_thumb", {}).get("url_list", [""])[0],
        "liked_count": interact_info.get("digg_count"),
        "collected_count": interact_info.get("collect_count"),
        "comment_count": interact_info.get("comment_count"),
        "share_count": interact_info.get("share_count"),
        "ip_location": aweme_item.get("ip_label", ""),
        "last_modify_ts": utils.get_current_timestamp(),
    }
    print(f"douyin aweme id:{aweme_id}, title:{local_db_item.get('title')}")
    if config.IS_SAVED_DATABASED:
        if not await DouyinAweme.filter(aweme_id=aweme_id).exists():
            local_db_item["add_ts"] = utils.get_current_timestamp()
            await DouyinAweme.create(**local_db_item)
        else:
            await DouyinAweme.filter(aweme_id=aweme_id).update(**local_db_item)


async def batch_update_dy_aweme_comments(aweme_id: str, comments: List[Dict]):
    if not comments:
        return
    for comment_item in comments:
        await update_dy_aweme_comment(aweme_id, comment_item)


async def update_dy_aweme_comment(aweme_id: str, comment_item: Dict):
    comment_aweme_id = comment_item.get("aweme_id")
    if aweme_id != comment_aweme_id:
        print(f"comment_aweme_id: {comment_aweme_id} != aweme_id: {aweme_id}")
        return
    user_info = comment_item.get("user", {})
    comment_id = comment_item.get("cid")
    avatar_info = user_info.get("avatar_medium", {}) or user_info.get("avatar_300x300", {}) or user_info.get(
        "avatar_168x168", {}) or user_info.get("avatar_thumb", {}) or {}
    local_db_item = {
        "comment_id": comment_id,
        "create_time": comment_item.get("create_time"),
        "ip_location": comment_item.get("ip_label", ""),
        "aweme_id": aweme_id,
        "content": comment_item.get("text"),
        "content_extra": json.dumps(comment_item.get("text_extra", [])),
        "user_id": user_info.get("uid"),
        "sec_uid": user_info.get("sec_uid"),
        "short_user_id": user_info.get("short_id"),
        "user_unique_id": user_info.get("unique_id"),
        "user_signature": user_info.get("signature"),
        "nickname": user_info.get("nickname"),
        "avatar": avatar_info.get("url_list", [""])[0],
        "sub_comment_count": comment_item.get("reply_comment_total", 0),
        "last_modify_ts": utils.get_current_timestamp(),
    }
    print(f"douyin aweme comment: {comment_id}, content: {local_db_item.get('content')}")
    if config.IS_SAVED_DATABASED:
        if not await DouyinAwemeComment.filter(comment_id=comment_id).exists():
            local_db_item["add_ts"] = utils.get_current_timestamp()
            await DouyinAwemeComment.create(**local_db_item)
        else:
            await DouyinAwemeComment.filter(comment_id=comment_id).update(**local_db_item)
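With these models registered, saved rows can be read back with ordinary Tortoise ORM calls. A small, hypothetical example (the aweme_id is made up, and the connection is opened through the new `db.init_db` helper):

```python
import asyncio

import db
from models.douyin import DouyinAweme, DouyinAwemeComment


async def summarize_aweme(aweme_id: str) -> None:
    # Look up one crawled video and count the comments stored for it.
    aweme = await DouyinAweme.filter(aweme_id=aweme_id).first()
    comment_count = await DouyinAwemeComment.filter(aweme_id=aweme_id).count()
    if aweme:
        print(f"{aweme.title} ({aweme_id}): {comment_count} comments saved")
    else:
        print(f"aweme {aweme_id} not found in douyin_aweme")


async def main() -> None:
    await db.init_db()                             # connect to RELATION_DB_URL, no schema changes
    await summarize_aweme("7000000000000000000")   # hypothetical aweme_id


if __name__ == "__main__":
    asyncio.run(main())
```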
models/douyin/__init__.py (deleted)

@@ -1 +0,0 @@
-from .m_douyin import *
models/douyin/m_douyin.py (deleted)

@@ -1,64 +0,0 @@
import json
from typing import Dict, List

from tools import utils


async def update_douyin_aweme(aweme_item: Dict):
    aweme_id = aweme_item.get("aweme_id")
    user_info = aweme_item.get("author", {})
    local_db_item = {
        "aweme_id": aweme_id,
        "aweme_type": aweme_item.get("aweme_type"),
        "title": aweme_item.get("desc", ""),
        "desc": aweme_item.get("desc", ""),
        "create_time": aweme_item.get("create_time"),
        "user_id": user_info.get("uid"),
        "sec_uid": user_info.get("sec_uid"),
        "short_user_id": user_info.get("short_id"),
        "user_unique_id": user_info.get("unique_id"),
        "user_signature": user_info.get("signature"),
        "nickname": user_info.get("nickname"),
        "avatar": user_info.get("avatar_thumb", {}).get("url_list", [""])[0],
        "ip_location": aweme_item.get("ip_label", ""),
        "last_modify_ts": utils.get_current_timestamp(),
    }
    # do something ...
    print(f"douyin aweme id:{aweme_id}, title:{local_db_item.get('title')}")


async def batch_update_dy_aweme_comments(aweme_id: str, comments: List[Dict]):
    if not comments:
        return
    for comment_item in comments:
        await update_dy_aweme_comment(aweme_id, comment_item)


async def update_dy_aweme_comment(aweme_id: str, comment_item: Dict):
    comment_aweme_id = comment_item.get("aweme_id")
    if aweme_id != comment_aweme_id:
        print(f"comment_aweme_id: {comment_aweme_id} != aweme_id: {aweme_id}")
        return
    user_info = comment_item.get("user", {})
    comment_id = comment_item.get("cid")
    avatar_info = user_info.get("avatar_medium", {}) or user_info.get("avatar_300x300", {}) or user_info.get(
        "avatar_168x168", {}) or user_info.get("avatar_thumb", {}) or {}
    local_db_item = {
        "comment_id": comment_id,
        "create_time": comment_item.get("create_time"),
        "ip_location": comment_item.get("ip_label", ""),
        "aweme_id": aweme_id,
        "content": comment_item.get("text"),
        "content_extra": json.dumps(comment_item.get("text_extra", [])),
        "user_id": user_info.get("uid"),
        "sec_uid": user_info.get("sec_uid"),
        "short_user_id": user_info.get("short_id"),
        "user_unique_id": user_info.get("unique_id"),
        "user_signature": user_info.get("signature"),
        "nickname": user_info.get("nickname"),
        "avatar": avatar_info.get("url_list", [""])[0],
        "sub_comment_count": comment_item.get("reply_comment_total", 0),
        "last_modify_ts": utils.get_current_timestamp(),
    }
    # do something ...
    print(f"douyin aweme comment: {comment_id}, content: {local_db_item.get('content')}")
models/xhs/__init__.py (deleted)

@@ -1 +0,0 @@
-from .m_xhs import *
models/xhs/m_xhs.py (deleted)

@@ -1,46 +0,0 @@
from typing import Dict, List

from tools import utils


async def update_xhs_note(note_item: Dict):
    note_id = note_item.get("note_id")
    user_info = note_item.get("user", {})
    interact_info = note_item.get("interact_info")
    image_list: List[Dict] = note_item.get("image_list", [])

    local_db_item = {
        "note_id": note_item.get("note_id"),
        "type": note_item.get("type"),
        "title": note_item.get("title"),
        "desc": note_item.get("desc", ""),
        "time": note_item.get("time"),
        "last_update_time": note_item.get("last_update_time", 0),
        "user_id": user_info.get("user_id"),
        "nickname": user_info.get("nickname"),
        "avatar": user_info.get("avatar"),
        "ip_location": note_item.get("ip_location", ""),
        "image_list": ','.join([img.get('url', '') for img in image_list]),
        "last_modify_ts": utils.get_current_timestamp(),
    }
    # do something ...
    print("xhs note:", local_db_item)


async def update_xhs_note_comment(note_id: str, comment_item: Dict):
    user_info = comment_item.get("user_info", {})
    comment_id = comment_item.get("id")
    local_db_item = {
        "comment_id": comment_id,
        "create_time": comment_item.get("create_time"),
        "ip_location": comment_item.get("ip_location"),
        "note_id": note_id,
        "content": comment_item.get("content"),
        "user_id": user_info.get("user_id"),
        "nickname": user_info.get("nickname"),
        "avatar": user_info.get("image"),
        "sub_comment_count": comment_item.get("sub_comment_count"),
        "last_modify_ts": utils.get_current_timestamp(),
    }
    # do something ...
    print("xhs note comment:", local_db_item)
models/xiaohongshu.py (new file)

@@ -0,0 +1,113 @@
from typing import List, Dict

from tortoise.models import Model
from tortoise import fields

import config
from tools import utils


class XhsBaseModel(Model):
    id = fields.IntField(pk=True, autoincrement=True, description="自增ID")
    user_id = fields.CharField(max_length=64, description="用户ID")
    nickname = fields.CharField(null=True, max_length=64, description="用户昵称")
    avatar = fields.CharField(null=True, max_length=255, description="用户头像地址")
    ip_location = fields.CharField(null=True, max_length=255, description="评论时的IP地址")
    add_ts = fields.BigIntField(description="记录添加时间戳")
    last_modify_ts = fields.BigIntField(description="记录最后修改时间戳")

    class Meta:
        abstract = True


class XHSNote(XhsBaseModel):
    note_id = fields.CharField(max_length=64, index=True, description="笔记ID")
    type = fields.CharField(null=True, max_length=16, description="笔记类型(normal | video)")
    title = fields.CharField(null=True, max_length=255, description="笔记标题")
    desc = fields.TextField(null=True, description="笔记描述")
    time = fields.BigIntField(description="笔记发布时间戳", index=True)
    last_update_time = fields.BigIntField(description="笔记最后更新时间戳")
    liked_count = fields.CharField(null=True, max_length=16, description="笔记点赞数")
    collected_count = fields.CharField(null=True, max_length=16, description="笔记收藏数")
    comment_count = fields.CharField(null=True, max_length=16, description="笔记评论数")
    share_count = fields.CharField(null=True, max_length=16, description="笔记分享数")
    image_list = fields.TextField(null=True, description="笔记封面图片列表")

    class Meta:
        table = "xhs_note"
        table_description = "小红书笔记"

    def __str__(self):
        return f"{self.note_id} - {self.title}"


class XHSNoteComment(XhsBaseModel):
    comment_id = fields.CharField(max_length=64, index=True, description="评论ID")
    create_time = fields.BigIntField(index=True, description="评论时间戳")
    note_id = fields.CharField(max_length=64, description="笔记ID")
    content = fields.TextField(description="评论内容")
    sub_comment_count = fields.IntField(description="子评论数量")

    class Meta:
        table = "xhs_note_comment"
        table_description = "小红书笔记评论"

    def __str__(self):
        return f"{self.comment_id} - {self.content}"


async def update_xhs_note(note_item: Dict):
    note_id = note_item.get("note_id")
    user_info = note_item.get("user", {})
    interact_info = note_item.get("interact_info", {})
    image_list: List[Dict] = note_item.get("image_list", [])

    local_db_item = {
        "note_id": note_item.get("note_id"),
        "type": note_item.get("type"),
        "title": note_item.get("title"),
        "desc": note_item.get("desc", ""),
        "time": note_item.get("time"),
        "last_update_time": note_item.get("last_update_time", 0),
        "user_id": user_info.get("user_id"),
        "nickname": user_info.get("nickname"),
        "avatar": user_info.get("avatar"),
        "liked_count": interact_info.get("liked_count"),
        "collected_count": interact_info.get("collected_count"),
        "comment_count": interact_info.get("comment_count"),
        "share_count": interact_info.get("share_count"),
        "ip_location": note_item.get("ip_location", ""),
        "image_list": ','.join([img.get('url', '') for img in image_list]),
        "last_modify_ts": utils.get_current_timestamp(),
    }
    print("xhs note:", local_db_item)
    if config.IS_SAVED_DATABASED:
        if not await XHSNote.filter(note_id=note_id).first():
            local_db_item["add_ts"] = utils.get_current_timestamp()
            await XHSNote.create(**local_db_item)
        else:
            await XHSNote.filter(note_id=note_id).update(**local_db_item)


async def update_xhs_note_comment(note_id: str, comment_item: Dict):
    user_info = comment_item.get("user_info", {})
    comment_id = comment_item.get("id")
    local_db_item = {
        "comment_id": comment_id,
        "create_time": comment_item.get("create_time"),
        "ip_location": comment_item.get("ip_location"),
        "note_id": note_id,
        "content": comment_item.get("content"),
        "user_id": user_info.get("user_id"),
        "nickname": user_info.get("nickname"),
        "avatar": user_info.get("image"),
        "sub_comment_count": comment_item.get("sub_comment_count"),
        "last_modify_ts": utils.get_current_timestamp(),
    }
    print("xhs note comment:", local_db_item)
    if config.IS_SAVED_DATABASED:
        if not await XHSNoteComment.filter(comment_id=comment_id).first():
            local_db_item["add_ts"] = utils.get_current_timestamp()
            await XHSNoteComment.create(**local_db_item)
        else:
            await XHSNoteComment.filter(comment_id=comment_id).update(**local_db_item)
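A matching hypothetical read-side snippet for the xhs tables, assuming `db.init_db()` has already been awaited elsewhere, showing how stored notes can be listed newest-first:

```python
from models.xiaohongshu import XHSNote


async def latest_notes(limit: int = 10) -> None:
    # Most recently modified notes first; last_modify_ts is refreshed on every upsert above.
    notes = await XHSNote.all().order_by("-last_modify_ts").limit(limit)
    for note in notes:
        print(note.note_id, note.title, note.liked_count)
```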
requirements.txt

@@ -6,3 +6,5 @@ tenacity==8.2.2
 tornado==6.3.2
 PyExecJS==1.5.1
 opencv-python==4.7.0.72
+tortoise-orm[asyncmy]==0.19.3
+aerich==0.7.2
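The `asyncmy` extra pulls in the async MySQL driver that the `mysql://` URL in `config/db_config.py` relies on. A throwaway check that the new dependencies are importable after `pip install -r requirements.txt` (nothing project-specific here):

```python
# Quick import smoke test for the new dependencies.
import asyncmy   # MySQL driver installed via the tortoise-orm[asyncmy] extra
import tortoise
import aerich

print("tortoise-orm, asyncmy and aerich are importable")
```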
test file (test_convert_cookies)

@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+
 from tools import utils

@@ -7,7 +8,3 @@ def test_convert_cookies():
     cookie_dict = utils.convert_str_cookie_to_dict(xhs_cookies)
     assert cookie_dict.get("webId") == "1190c4d3cxxxx125xxx"
     assert cookie_dict.get("a1") == "x000101360"
-
-
-if __name__ == '__main__':
-    test_convert_cookies()