MediaCrawler/models/xiaohongshu.py

import csv
import pathlib
from typing import Dict, List

from tortoise import fields
from tortoise.contrib.pydantic import pydantic_model_creator
from tortoise.models import Model

import config
from tools import utils
from var import crawler_type_var


class XhsBaseModel(Model):
    """Abstract base model with the columns shared by the Xiaohongshu tables.

    Marked abstract through ``Meta.abstract``; ``XHSNote`` and
    ``XHSNoteComment`` in this module inherit every field declared here.
    """

    # Auto-increment primary key.
    id = fields.IntField(pk=True, autoincrement=True, description="自增ID")
    # ID of the user who authored the note / comment.
    user_id = fields.CharField(max_length=64, description="用户ID")
    # User nickname (nullable).
    nickname = fields.CharField(null=True, max_length=64, description="用户昵称")
    # URL of the user's avatar image (nullable).
    avatar = fields.CharField(null=True, max_length=255, description="用户头像地址")
    # IP location string (nullable). The description says "IP address at comment
    # time", but update_xhs_note fills this column for notes as well.
    ip_location = fields.CharField(null=True, max_length=255, description="评论时的IP地址")
    # Timestamp of when the record was first added.
    add_ts = fields.BigIntField(description="记录添加时间戳")
    # Timestamp of the most recent modification of the record.
    last_modify_ts = fields.BigIntField(description="记录最后修改时间戳")

    class Meta:
        # Abstract: used only as a base class for the concrete models.
        abstract = True


class XHSNote(XhsBaseModel):
    """ORM model for a Xiaohongshu note, stored in the ``xhs_note`` table."""

    # Note ID (indexed); update_xhs_note uses it to find an existing row.
    note_id = fields.CharField(max_length=64, index=True, description="笔记ID")
    # Note type; the description lists "normal" and "video".
    type = fields.CharField(null=True, max_length=16, description="笔记类型(normal | video)")
    # Note title, at most 255 characters.
    title = fields.CharField(null=True, max_length=255, description="笔记标题")
    # Note description / body text.
    desc = fields.TextField(null=True, description="笔记描述")
    # Publish timestamp of the note (indexed).
    time = fields.BigIntField(description="笔记发布时间戳", index=True)
    # Timestamp of the note's last update.
    last_update_time = fields.BigIntField(description="笔记最后更新时间戳")
    # Interaction counters are stored as short strings, not integers.
    liked_count = fields.CharField(null=True, max_length=16, description="笔记点赞数")
    collected_count = fields.CharField(null=True, max_length=16, description="笔记收藏数")
    comment_count = fields.CharField(null=True, max_length=16, description="笔记评论数")
    share_count = fields.CharField(null=True, max_length=16, description="笔记分享数")
    # Image URLs; update_xhs_note writes them as one comma-joined string.
    image_list = fields.TextField(null=True, description="笔记封面图片列表")
    # URL of the note detail page.
    note_url = fields.CharField(null=True, max_length=255, description="笔记详情页的URL")

    class Meta:
        table = "xhs_note"
        table_description = "小红书笔记"

    def __str__(self):
        """Return ``"<note_id> - <title>"``."""
        return f"{self.note_id} - {self.title}"


class XHSNoteComment(XhsBaseModel):
    """ORM model for a comment on a Xiaohongshu note (``xhs_note_comment`` table)."""

    # Comment ID (indexed); update_xhs_note_comment uses it to find an existing row.
    comment_id = fields.CharField(max_length=64, index=True, description="评论ID")
    # Timestamp of when the comment was posted (indexed).
    create_time = fields.BigIntField(index=True, description="评论时间戳")
    # ID of the note the comment belongs to (plain column, no foreign key).
    note_id = fields.CharField(max_length=64, description="笔记ID")
    # Comment text.
    content = fields.TextField(description="评论内容")
    # Number of sub-comments; non-nullable.
    sub_comment_count = fields.IntField(description="子评论数量")

    class Meta:
        table = "xhs_note_comment"
        table_description = "小红书笔记评论"

    def __str__(self):
        """Return ``"<comment_id> - <content>"``."""
        return f"{self.comment_id} - {self.content}"


async def update_xhs_note(note_item: Dict):
    """Persist one crawled Xiaohongshu note to the database or to a CSV file.

    The note is flattened into a single row. When ``config.IS_SAVED_DATABASED``
    is truthy, the row is passed through a Pydantic model generated from
    ``XHSNote`` and then inserted (no row with this ``note_id`` yet) or used
    to update the existing row. Otherwise the row is appended to
    ``data/xhs/<crawler type>_notes_<current date>.csv``.

    Args:
        note_item: Raw note dict. Keys read here: ``note_id``, ``type``,
            ``title``, ``desc``, ``time``, ``last_update_time``,
            ``ip_location``, ``user`` (dict), ``interact_info`` (dict) and
            ``image_list`` (list of dicts carrying a ``url`` key).
    """
    note_id = note_item.get("note_id")
    # ``or`` fallbacks also cover keys that are present but explicitly None,
    # which ``dict.get(key, default)`` alone does not.
    user_info = note_item.get("user") or {}
    interact_info = note_item.get("interact_info") or {}
    image_list: List[Dict] = note_item.get("image_list") or []
    desc = note_item.get("desc", "")

    # Key order is significant: it defines the CSV column order.
    local_db_item = {
        "note_id": note_id,
        "type": note_item.get("type"),
        # Fall back to the description when there is no title, and truncate
        # whichever text is chosen: the title column holds at most 255 chars.
        "title": (note_item.get("title") or desc or "")[:255],
        "desc": desc,
        "time": note_item.get("time"),
        "last_update_time": note_item.get("last_update_time", 0),
        "user_id": user_info.get("user_id"),
        "nickname": user_info.get("nickname"),
        "avatar": user_info.get("avatar"),
        "liked_count": interact_info.get("liked_count"),
        "collected_count": interact_info.get("collected_count"),
        "comment_count": interact_info.get("comment_count"),
        "share_count": interact_info.get("share_count"),
        "ip_location": note_item.get("ip_location", ""),
        # Image URLs are stored as one comma-separated string; a missing or
        # None URL contributes an empty entry instead of breaking the join.
        "image_list": ",".join(img.get("url") or "" for img in image_list),
        "last_modify_ts": utils.get_current_timestamp(),
        "note_url": f"https://www.xiaohongshu.com/explore/{note_id}"
    }
    print("xhs note:", local_db_item)
    if config.IS_SAVED_DATABASED:
        if not await XHSNote.filter(note_id=note_id).first():
            # First time this note is seen: stamp the creation time and insert.
            local_db_item["add_ts"] = utils.get_current_timestamp()
            note_pydantic = pydantic_model_creator(XHSNote, name="XHSPydanticCreate", exclude=('id',))
            note_data = note_pydantic(**local_db_item)
            note_pydantic.validate(note_data)
            await XHSNote.create(**note_data.dict())
        else:
            # Row already exists: update it, leaving ``id`` and ``add_ts`` alone.
            note_pydantic = pydantic_model_creator(XHSNote, name="XHSPydanticUpdate", exclude=('id', 'add_ts'))
            note_data = note_pydantic(**local_db_item)
            note_pydantic.validate(note_data)
            await XHSNote.filter(note_id=note_id).update(**note_data.dict())
    else:
        # Below is a simple way to save it in CSV format.
        pathlib.Path("data/xhs").mkdir(parents=True, exist_ok=True)
        save_file_name = f"data/xhs/{crawler_type_var.get()}_notes_{utils.get_current_date()}.csv"
        with open(save_file_name, mode='a+', encoding="utf-8-sig", newline="") as f:
            writer = csv.writer(f)
            # Position 0 right after opening in append mode means the file is
            # empty, so the header row has not been written yet.
            if f.tell() == 0:
                writer.writerow(local_db_item.keys())
            writer.writerow(local_db_item.values())


async def update_xhs_note_comment(note_id: str, comment_item: Dict):
    """Persist one crawled note comment to the database or to a CSV file.

    The comment is flattened into a single row. When
    ``config.IS_SAVED_DATABASED`` is truthy, the row is passed through a
    Pydantic model generated from ``XHSNoteComment`` and then inserted (no row
    with this ``comment_id`` yet) or used to update the existing row.
    Otherwise the row is appended to
    ``data/xhs/<crawler type>_comment_<current date>.csv``.

    Args:
        note_id: ID of the note the comment belongs to.
        comment_item: Raw comment dict. Keys read here: ``id``,
            ``create_time``, ``ip_location``, ``content``,
            ``sub_comment_count`` and ``user_info`` (dict with ``user_id``,
            ``nickname`` and ``image``).
    """
    # ``or {}`` also covers a ``user_info`` key that is present but None,
    # which ``dict.get(key, {})`` alone does not.
    user_info = comment_item.get("user_info") or {}
    comment_id = comment_item.get("id")
    # Key order is significant: it defines the CSV column order.
    local_db_item = {
        "comment_id": comment_id,
        "create_time": comment_item.get("create_time"),
        "ip_location": comment_item.get("ip_location"),
        "note_id": note_id,
        "content": comment_item.get("content"),
        "user_id": user_info.get("user_id"),
        "nickname": user_info.get("nickname"),
        # The avatar URL is read from the ``image`` key of the user info.
        "avatar": user_info.get("image"),
        # NOTE(review): the model column is a non-nullable IntField, so a
        # missing value here is passed on as None in database mode.
        "sub_comment_count": comment_item.get("sub_comment_count"),
        "last_modify_ts": utils.get_current_timestamp(),
    }
    print("xhs note comment:", local_db_item)
    if config.IS_SAVED_DATABASED:
        if not await XHSNoteComment.filter(comment_id=comment_id).first():
            # First time this comment is seen: stamp the creation time and insert.
            local_db_item["add_ts"] = utils.get_current_timestamp()
            comment_pydantic = pydantic_model_creator(XHSNoteComment, name="CommentPydanticCreate", exclude=('id',))
            comment_data = comment_pydantic(**local_db_item)
            comment_pydantic.validate(comment_data)
            await XHSNoteComment.create(**comment_data.dict())
        else:
            # Row already exists: update it, leaving ``id`` and ``add_ts`` alone.
            comment_pydantic = pydantic_model_creator(XHSNoteComment, name="CommentPydanticUpdate",
                                                      exclude=('id', 'add_ts',))
            comment_data = comment_pydantic(**local_db_item)
            comment_pydantic.validate(comment_data)
            await XHSNoteComment.filter(comment_id=comment_id).update(**comment_data.dict())
    else:
        # Below is a simple way to save it in CSV format.
        pathlib.Path("data/xhs").mkdir(parents=True, exist_ok=True)
        save_file_name = f"data/xhs/{crawler_type_var.get()}_comment_{utils.get_current_date()}.csv"
        with open(save_file_name, mode='a+', encoding="utf-8-sig", newline="") as f:
            writer = csv.writer(f)
            # Position 0 right after opening in append mode means the file is
            # empty, so the header row has not been written yet.
            if f.tell() == 0:
                writer.writerow(local_db_item.keys())
            writer.writerow(local_db_item.values())
feat: 支持数据保存到CSV中 2023-08-16 11:49:41 +00:00			`import csv`
			`import pathlib`
refactor: 优化代码 2023-07-29 07:35:40 +00:00			`from typing import Dict, List`
feat: 增加配置项支持自由选择数据是否保存到关系型数据库中 2023-07-24 12:59:43 +00:00
			`from tortoise import fields`
✨ feat: orm添加Pydantic数据验证 2023-08-13 14:39:50 +00:00			`from tortoise.contrib.pydantic import pydantic_model_creator`
refactor: 优化代码 2023-07-29 07:35:40 +00:00			`from tortoise.models import Model`
feat: 增加配置项支持自由选择数据是否保存到关系型数据库中 2023-07-24 12:59:43 +00:00
			`import config`
			`from tools import utils`
feat: 小红书增加指定帖子爬取功能 fix: 修复程序一些异常 bug refactor: 优化部分代码逻辑 2023-11-18 05:38:11 +00:00			`from var import crawler_type_var`
feat: 增加配置项支持自由选择数据是否保存到关系型数据库中 2023-07-24 12:59:43 +00:00

			`class XhsBaseModel(Model):`
			`id = fields.IntField(pk=True, autoincrement=True, description="自增ID")`
			`user_id = fields.CharField(max_length=64, description="用户ID")`
			`nickname = fields.CharField(null=True, max_length=64, description="用户昵称")`
			`avatar = fields.CharField(null=True, max_length=255, description="用户头像地址")`
			`ip_location = fields.CharField(null=True, max_length=255, description="评论时的IP地址")`
			`add_ts = fields.BigIntField(description="记录添加时间戳")`
			`last_modify_ts = fields.BigIntField(description="记录最后修改时间戳")`

			`class Meta:`
			`abstract = True`


			`class XHSNote(XhsBaseModel):`
			`note_id = fields.CharField(max_length=64, index=True, description="笔记ID")`
			`type = fields.CharField(null=True, max_length=16, description="笔记类型(normal \| video)")`
			`title = fields.CharField(null=True, max_length=255, description="笔记标题")`
			`desc = fields.TextField(null=True, description="笔记描述")`
			`time = fields.BigIntField(description="笔记发布时间戳", index=True)`
			`last_update_time = fields.BigIntField(description="笔记最后更新时间戳")`
			`liked_count = fields.CharField(null=True, max_length=16, description="笔记点赞数")`
			`collected_count = fields.CharField(null=True, max_length=16, description="笔记收藏数")`
			`comment_count = fields.CharField(null=True, max_length=16, description="笔记评论数")`
			`share_count = fields.CharField(null=True, max_length=16, description="笔记分享数")`
			`image_list = fields.TextField(null=True, description="笔记封面图片列表")`
feat: add article url for issue #63 2023-11-05 07:27:18 +00:00			`note_url = fields.CharField(null=True, max_length=255, description="笔记详情页的URL")`
feat: 增加配置项支持自由选择数据是否保存到关系型数据库中 2023-07-24 12:59:43 +00:00
			`class Meta:`
			`table = "xhs_note"`
			`table_description = "小红书笔记"`

			`def __str__(self):`
			`return f"{self.note_id} - {self.title}"`


			`class XHSNoteComment(XhsBaseModel):`
			`comment_id = fields.CharField(max_length=64, index=True, description="评论ID")`
			`create_time = fields.BigIntField(index=True, description="评论时间戳")`
			`note_id = fields.CharField(max_length=64, description="笔记ID")`
			`content = fields.TextField(description="评论内容")`
			`sub_comment_count = fields.IntField(description="子评论数量")`

			`class Meta:`
			`table = "xhs_note_comment"`
			`table_description = "小红书笔记评论"`

			`def __str__(self):`
			`return f"{self.comment_id} - {self.content}"`


			`async def update_xhs_note(note_item: Dict):`
			`note_id = note_item.get("note_id")`
			`user_info = note_item.get("user", {})`
			`interact_info = note_item.get("interact_info", {})`
			`image_list: List[Dict] = note_item.get("image_list", [])`

			`local_db_item = {`
			`"note_id": note_item.get("note_id"),`
			`"type": note_item.get("type"),`
fix: issue #23 2023-08-03 14:26:31 +00:00			`"title": note_item.get("title") or note_item.get("desc", "")[:255],`
feat: 增加配置项支持自由选择数据是否保存到关系型数据库中 2023-07-24 12:59:43 +00:00			`"desc": note_item.get("desc", ""),`
			`"time": note_item.get("time"),`
			`"last_update_time": note_item.get("last_update_time", 0),`
			`"user_id": user_info.get("user_id"),`
			`"nickname": user_info.get("nickname"),`
			`"avatar": user_info.get("avatar"),`
			`"liked_count": interact_info.get("liked_count"),`
			`"collected_count": interact_info.get("collected_count"),`
			`"comment_count": interact_info.get("comment_count"),`
			`"share_count": interact_info.get("share_count"),`
			`"ip_location": note_item.get("ip_location", ""),`
			`"image_list": ','.join([img.get('url', '') for img in image_list]),`
			`"last_modify_ts": utils.get_current_timestamp(),`
feat: add article url for issue #63 2023-11-05 07:27:18 +00:00			`"note_url": f"https://www.xiaohongshu.com/explore/{note_id}"`
feat: 增加配置项支持自由选择数据是否保存到关系型数据库中 2023-07-24 12:59:43 +00:00			`}`
			`print("xhs note:", local_db_item)`
			`if config.IS_SAVED_DATABASED:`
			`if not await XHSNote.filter(note_id=note_id).first():`
			`local_db_item["add_ts"] = utils.get_current_timestamp()`
fix: issue #32 2023-08-16 05:58:44 +00:00			`note_pydantic = pydantic_model_creator(XHSNote, name="XHSPydanticCreate", exclude=('id',))`
✨ feat: orm添加Pydantic数据验证 2023-08-13 14:39:50 +00:00			`note_data = note_pydantic(**local_db_item)`
			`note_pydantic.validate(note_data)`
			`await XHSNote.create(**note_data.dict())`
feat: 增加配置项支持自由选择数据是否保存到关系型数据库中 2023-07-24 12:59:43 +00:00			`else:`
fix: issue #32 2023-08-16 05:58:44 +00:00			`note_pydantic = pydantic_model_creator(XHSNote, name="XHSPydanticUpdate", exclude=('id', 'add_ts'))`
✨ feat: orm添加Pydantic数据验证 2023-08-13 14:39:50 +00:00			`note_data = note_pydantic(**local_db_item)`
			`note_pydantic.validate(note_data)`
			`await XHSNote.filter(note_id=note_id).update(**note_data.dict())`
feat: 支持数据保存到CSV中 2023-08-16 11:49:41 +00:00			`else:`
			`# Below is a simple way to save it in CSV format.`
			`pathlib.Path(f"data/xhs").mkdir(parents=True, exist_ok=True)`
feat: 小红书增加指定帖子爬取功能 fix: 修复程序一些异常 bug refactor: 优化部分代码逻辑 2023-11-18 05:38:11 +00:00			`save_file_name = f"data/xhs/{crawler_type_var.get()}_notes_{utils.get_current_date()}.csv"`
			`with open(save_file_name, mode='a+', encoding="utf-8-sig", newline="") as f:`
feat: 支持数据保存到CSV中 2023-08-16 11:49:41 +00:00			`writer = csv.writer(f)`
			`if f.tell() == 0:`
			`writer.writerow(local_db_item.keys())`
			`writer.writerow(local_db_item.values())`
feat: 增加配置项支持自由选择数据是否保存到关系型数据库中 2023-07-24 12:59:43 +00:00

			`async def update_xhs_note_comment(note_id: str, comment_item: Dict):`
			`user_info = comment_item.get("user_info", {})`
			`comment_id = comment_item.get("id")`
			`local_db_item = {`
			`"comment_id": comment_id,`
			`"create_time": comment_item.get("create_time"),`
			`"ip_location": comment_item.get("ip_location"),`
			`"note_id": note_id,`
			`"content": comment_item.get("content"),`
			`"user_id": user_info.get("user_id"),`
			`"nickname": user_info.get("nickname"),`
			`"avatar": user_info.get("image"),`
			`"sub_comment_count": comment_item.get("sub_comment_count"),`
			`"last_modify_ts": utils.get_current_timestamp(),`
			`}`
			`print("xhs note comment:", local_db_item)`
			`if config.IS_SAVED_DATABASED:`
			`if not await XHSNoteComment.filter(comment_id=comment_id).first():`
			`local_db_item["add_ts"] = utils.get_current_timestamp()`
fix: issue #32 2023-08-16 05:58:44 +00:00			`comment_pydantic = pydantic_model_creator(XHSNoteComment, name="CommentPydanticCreate", exclude=('id',))`
✨ feat: orm添加Pydantic数据验证 2023-08-13 14:39:50 +00:00			`comment_data = comment_pydantic(**local_db_item)`
			`comment_pydantic.validate(comment_data)`
			`await XHSNoteComment.create(**comment_data.dict())`
feat: 增加配置项支持自由选择数据是否保存到关系型数据库中 2023-07-24 12:59:43 +00:00			`else:`
fix: issue #32 2023-08-16 05:58:44 +00:00			`comment_pydantic = pydantic_model_creator(XHSNoteComment, name="CommentPydanticUpdate",`
			`exclude=('id', 'add_ts',))`
✨ feat: orm添加Pydantic数据验证 2023-08-13 14:39:50 +00:00			`comment_data = comment_pydantic(**local_db_item)`
			`comment_pydantic.validate(comment_data)`
			`await XHSNoteComment.filter(comment_id=comment_id).update(**comment_data.dict())`
feat: 支持数据保存到CSV中 2023-08-16 11:49:41 +00:00			`else:`
			`# Below is a simple way to save it in CSV format.`
			`pathlib.Path(f"data/xhs").mkdir(parents=True, exist_ok=True)`
feat: 小红书增加指定帖子爬取功能 fix: 修复程序一些异常 bug refactor: 优化部分代码逻辑 2023-11-18 05:38:11 +00:00			`save_file_name = f"data/xhs/{crawler_type_var.get()}_comment_{utils.get_current_date()}.csv"`
			`with open(save_file_name, mode='a+', encoding="utf-8-sig", newline="") as f:`
feat: 支持数据保存到CSV中 2023-08-16 11:49:41 +00:00			`writer = csv.writer(f)`
			`if f.tell() == 0:`
			`writer.writerow(local_db_item.keys())`
			`writer.writerow(local_db_item.values())`