Merge pull request #374 from NanmiCoder/feature/baidu_tieba_20240805

feat: MediaCrawler支持百度贴吧
This commit is contained in:
程序员阿江-Relakkes 2024-08-08 14:24:58 +08:00 committed by GitHub
commit a10cdcf474
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
30 changed files with 7527 additions and 268 deletions

View File

@ -7,7 +7,7 @@
> 点击查看更为详细的免责声明。[点击跳转](#disclaimer)
# 仓库描述
**小红书爬虫****抖音爬虫** **快手爬虫** **B站爬虫** **微博爬虫**...。
**小红书爬虫**,**抖音爬虫**, **快手爬虫**, **B站爬虫**, **微博爬虫**,**百度贴吧**...。
目前能抓取小红书、抖音、快手、B站、微博、百度贴吧的视频、图片、帖子、评论、点赞、转发等信息。
原理:利用[playwright](https://playwright.dev/)搭桥,保留登录成功后的上下文浏览器环境,通过执行JS表达式获取一些加密参数。
@ -22,6 +22,7 @@
| 快手 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| B 站 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| 微博 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ |
| 贴吧 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ |
## 使用方法
@ -99,14 +100,51 @@
- [ MediaCrawler-基于抽象类设计重构项目缓存](https://articles.zsxq.com/id_4ju73oxewt9j.html)
- [ 手把手带你撸一个自己的IP代理池](https://articles.zsxq.com/id_38fza371ladm.html)
## 感谢下列Sponsors对本仓库赞助
- <a href="https://sider.ai/ad-land-redirect?source=github&p1=mi&p2=kk">通过注册安装这款免费的Sider ChatGPT插件,帮我获得一定奖励💰。这个插件我用了大半年,作为谷歌上最火的一款插件,体验非常不错。</a>
> 安装并注册该浏览器插件之后保留一天即可,我就可以获得3元的推广奖励,谢谢大家,支持我继续开源项目。
成为赞助者展示你的产品在这里联系作者wxyzglan
## 打赏
如果觉得项目不错的话可以打赏哦。您的支持就是我最大的动力!
打赏时您可以备注名称,我会将您添加至打赏列表中。
<p>
<img alt="打赏-微信" src="static/images/wechat_pay.jpeg" style="width: 200px;margin-right: 140px;" />
<img alt="打赏-支付宝" src="static/images/zfb_pay.png" style="width: 200px" />
</p>
## 捐赠信息
PS:如果打赏时请备注捐赠者,如有遗漏请联系我添加(有时候消息多可能会漏掉,十分抱歉)。
| 捐赠者 | 捐赠金额 | 捐赠日期 |
|-------------|-------|------------|
| *皓 | 50 元 | 2024-03-18 |
| *刚 | 50 元 | 2024-03-18 |
| *乐 | 20 元 | 2024-03-17 |
| *木 | 20 元 | 2024-03-17 |
| *诚 | 20 元 | 2024-03-17 |
| Strem Gamer | 20 元 | 2024-03-16 |
| *鑫 | 20 元 | 2024-03-14 |
| Yuzu | 20 元 | 2024-03-07 |
| **宁 | 100 元 | 2024-03-03 |
| **媛 | 20 元 | 2024-03-03 |
| Scarlett | 20 元 | 2024-02-16 |
| Asun | 20 元 | 2024-01-30 |
| 何* | 100 元 | 2024-01-21 |
| allen | 20 元 | 2024-01-10 |
| llllll | 20 元 | 2024-01-07 |
| 邝*元 | 20 元 | 2023-12-29 |
| 50chen | 50 元 | 2023-12-22 |
| xiongot | 20 元 | 2023-12-17 |
| atom.hu | 20 元 | 2023-12-16 |
| 一呆 | 20 元 | 2023-12-01 |
| 坠落 | 50 元 | 2023-11-08 |
## MediaCrawler爬虫项目交流群
> 扫描下方我的个人微信,备注:github,拉你进MediaCrawler项目交流群(请一定备注:github,会有wx小助手自动拉群)

View File

@ -7,8 +7,8 @@ from tools.utils import str2bool
async def parse_cmd():
# 读取command arg
parser = argparse.ArgumentParser(description='Media crawler program.')
parser.add_argument('--platform', type=str, help='Media platform select (xhs | dy | ks | bili | wb)',
choices=["xhs", "dy", "ks", "bili", "wb"], default=config.PLATFORM)
parser.add_argument('--platform', type=str, help='Media platform select (xhs | dy | ks | bili | wb | tieba)',
choices=["xhs", "dy", "ks", "bili", "wb", "tieba"], default=config.PLATFORM)
parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)',
choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE)
parser.add_argument('--type', type=str, help='crawler type (search | detail | creator)',

View File

@ -28,7 +28,7 @@ HEADLESS = False
SAVE_LOGIN_STATE = True
# 数据保存类型选项配置,支持三种类型csv、db、json
SAVE_DATA_OPTION = "json" # csv or db or json
SAVE_DATA_OPTION = "csv" # csv or db or json
# 用户浏览器缓存的浏览器文件配置
USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name
@ -37,7 +37,7 @@ USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name
START_PAGE = 1
# 爬取视频/帖子的数量控制
CRAWLER_MAX_NOTES_COUNT = 20
CRAWLER_MAX_NOTES_COUNT = 100
# 并发爬虫数量控制
MAX_CONCURRENCY_NUM = 1
@ -57,7 +57,7 @@ XHS_SPECIFIED_ID_LIST = [
"6422c2750000000027000d88",
"64ca1b73000000000b028dd2",
"630d5b85000000001203ab41",
"668fe13000000000030241fa", # 图文混合
"668fe13000000000030241fa", # 图文混合
# ........................
]
@ -88,6 +88,16 @@ WEIBO_SPECIFIED_ID_LIST = [
# ........................
]
# 指定贴吧需要爬取的帖子列表
TIEBA_SPECIFIED_ID_LIST = [
]
# 指定贴吧名称列表,爬取该贴吧下的帖子
TIEBA_NAME_LIST = [
# "盗墓笔记"
]
# 指定小红书创作者ID列表
XHS_CREATOR_ID_LIST = [
"63e36c9a000000002703502b",
@ -112,19 +122,18 @@ KS_CREATOR_ID_LIST = [
# ........................
]
#词云相关
#是否开启生成评论词云图
# 词云相关
# 是否开启生成评论词云图
ENABLE_GET_WORDCLOUD = False
# 自定义词语及其分组
#添加规则xx:yy 其中xx为自定义添加的词组yy为将xx该词组分到的组名。
# 添加规则xx:yy 其中xx为自定义添加的词组yy为将xx该词组分到的组名。
CUSTOM_WORDS = {
'零几': '年份', # 将“零几”识别为一个整体
'高频词': '专业术语' # 示例自定义词
}
#停用(禁用)词文件路径
# 停用(禁用)词文件路径
STOP_WORDS_FILE = "./docs/hit_stopwords.txt"
#中文字体文件路径
FONT_PATH= "./docs/STZHONGS.TTF"
# 中文字体文件路径
FONT_PATH = "./docs/STZHONGS.TTF"

1
constant/__init__.py Normal file
View File

@ -0,0 +1 @@
# -*- coding: utf-8 -*-

3
constant/baidu_tieba.py Normal file
View File

@ -0,0 +1,3 @@
# -*- coding: utf-8 -*-
# Base URL of the Baidu Tieba site; the page extractor prepends it to the relative links it scrapes.
TIEBA_URL = 'https://tieba.baidu.com'

View File

@ -8,6 +8,7 @@ from base.base_crawler import AbstractCrawler
from media_platform.bilibili import BilibiliCrawler
from media_platform.douyin import DouYinCrawler
from media_platform.kuaishou import KuaishouCrawler
from media_platform.tieba import TieBaCrawler
from media_platform.weibo import WeiboCrawler
from media_platform.xhs import XiaoHongShuCrawler
@ -18,7 +19,8 @@ class CrawlerFactory:
"dy": DouYinCrawler,
"ks": KuaishouCrawler,
"bili": BilibiliCrawler,
"wb": WeiboCrawler
"wb": WeiboCrawler,
"tieba": TieBaCrawler
}
@staticmethod
@ -28,6 +30,7 @@ class CrawlerFactory:
raise ValueError("Invalid Media Platform Currently only supported xhs or dy or ks or bili ...")
return crawler_class()
async def main():
# parse cmd
await cmd_arg.parse_cmd()
@ -38,7 +41,7 @@ async def main():
crawler = CrawlerFactory.create_crawler(platform=config.PLATFORM)
await crawler.start()
if config.SAVE_DATA_OPTION == "db":
await db.close()

View File

@ -0,0 +1,2 @@
# -*- coding: utf-8 -*-
from .core import TieBaCrawler

View File

@ -0,0 +1,289 @@
import asyncio
import json
from typing import Any, Callable, Dict, List, Optional, Union
from urllib.parse import urlencode
import httpx
from playwright.async_api import BrowserContext
from tenacity import RetryError, retry, stop_after_attempt, wait_fixed
import config
from base.base_crawler import AbstractApiClient
from model.m_baidu_tieba import TiebaComment, TiebaNote
from proxy.proxy_ip_pool import ProxyIpPool
from tools import utils
from .field import SearchNoteType, SearchSortType
from .help import TieBaExtractor
class BaiduTieBaClient(AbstractApiClient):
def __init__(
        self,
        timeout=10,
        ip_pool=None,
        default_ip_proxy=None,
):
    """
    Build the Tieba HTTP client.

    Args:
        timeout: timeout value passed to every httpx request
        ip_pool: optional proxy pool consulted by ``get`` once the retries are used up
        default_ip_proxy: proxy configuration used when a request does not bring its own
    """
    default_headers = {
        "User-Agent": utils.get_user_agent(),
        "Cookies": "",
    }
    self._host = "https://tieba.baidu.com"
    self._page_extractor = TieBaExtractor()
    self.timeout = timeout
    self.ip_pool: Optional[ProxyIpPool] = ip_pool
    self.default_ip_proxy = default_ip_proxy
    # These headers are sent unchanged with every request.
    self.headers = default_headers
@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
async def request(self, method, url, return_ori_content=False, proxies=None, **kwargs) -> Union[str, Any]:
    """
    Shared httpx request wrapper that validates the response before handing it back.

    Args:
        method: HTTP method
        url: absolute request URL
        return_ori_content: return the raw response text instead of the decoded JSON body
        proxies: proxy configuration for this call; ``self.default_ip_proxy`` is used when falsy
        **kwargs: extra arguments forwarded to ``client.request`` (body, params, ...)

    Returns:
        The response text when ``return_ori_content`` is true, otherwise ``response.json()``.

    Raises:
        Exception: on a non-200 status code or an empty / "blocked" body. Any exception makes
            the ``@retry`` decorator run the call again (three attempts, one second apart).
    """
    async with httpx.AsyncClient(proxies=proxies or self.default_ip_proxy) as client:
        response = await client.request(
            method, url, timeout=self.timeout,
            headers=self.headers, **kwargs
        )
        if response.status_code != 200:
            failure = f"Request failed, method: {method}, url: {url}, status code: {response.status_code}"
            utils.logger.error(failure)
            utils.logger.error(f"Request failed, response: {response.text}")
            raise Exception(failure)

        body = response.text
        if body in ("", "blocked"):
            utils.logger.error(f"request params incrr, response.text: {body}")
            raise Exception("account blocked")

        return body if return_ori_content else response.json()
async def get(self, uri: str, params=None, return_ori_content=False, **kwargs) -> Any:
    """
    Send a GET request, falling back to a fresh proxy from the pool when retries are exhausted.

    Args:
        uri: request path, appended to ``self._host``
        params: optional dict of query parameters; url-encoded and appended to ``uri``
        return_ori_content: return the raw response text instead of decoded JSON
        **kwargs: forwarded to ``self.request``

    Returns:
        Whatever ``self.request`` returns (text or decoded JSON).

    Raises:
        Exception: when the request still fails after the retries (and, if a proxy pool is
            configured, after one more round through a newly fetched proxy). The underlying
            ``RetryError`` is attached as ``__cause__``.
    """
    final_uri = f"{uri}?{urlencode(params)}" if isinstance(params, dict) else uri
    url = f"{self._host}{final_uri}"
    try:
        return await self.request(method="GET", url=url,
                                  return_ori_content=return_ori_content,
                                  **kwargs)
    except RetryError as e:
        last_error = e
        if self.ip_pool:
            proxie_model = await self.ip_pool.get_proxy()
            _, proxies = utils.format_proxy_info(proxie_model)
            try:
                res = await self.request(method="GET", url=url,
                                         return_ori_content=return_ori_content,
                                         proxies=proxies,
                                         **kwargs)
            except RetryError as retry_err:
                # The replacement proxy failed as well: report it through the same
                # log-and-wrap path instead of leaking a bare RetryError.
                last_error = retry_err
            else:
                # Only remember the new proxy once it has proven to work.
                self.default_ip_proxy = proxies
                return res

        message = f"[BaiduTieBaClient.get] 达到了最大重试次数IP已经被Block请尝试更换新的IP代理: {last_error}"
        utils.logger.error(message)
        raise Exception(message) from last_error
async def post(self, uri: str, data: dict, **kwargs) -> Dict:
    """
    Send a POST request whose body is ``data`` serialised as compact JSON.

    Args:
        uri: request path, appended to ``self._host``
        data: request body; dumped without whitespace and without ASCII escaping
        **kwargs: forwarded to ``self.request``

    Returns:
        The value returned by ``self.request``.
    """
    payload = json.dumps(data, ensure_ascii=False, separators=(',', ':'))
    target_url = f"{self._host}{uri}"
    return await self.request(method="POST", url=target_url, data=payload, **kwargs)
async def pong(self) -> bool:
    """
    Probe ``/mo/q/sync`` to check whether the login state is still valid.

    Returns:
        True when the response carries ``no == 0``; False otherwise, including when the
        request raises.
    """
    utils.logger.info("[BaiduTieBaClient.pong] Begin to pong tieba...")
    try:
        res: Dict = await self.get("/mo/q/sync")
        utils.logger.info(f"[BaiduTieBaClient.pong] res: {res}")
        if res and res.get("no") == 0:
            return True
        utils.logger.info(f"[BaiduTieBaClient.pong] user not login, will try to login again...")
    except Exception as e:
        utils.logger.error(f"[BaiduTieBaClient.pong] Ping tieba failed: {e}, and try to login again...")
    return False
async def update_cookies(self, browser_context: BrowserContext):
    """
    Cookie-refresh hook of the API client interface; this client implements it as a no-op.

    Args:
        browser_context: browser context object (unused here)

    Returns:
        None
    """
    return None
async def get_notes_by_keyword(
        self, keyword: str,
        page: int = 1,
        page_size: int = 10,
        sort: SearchSortType = SearchSortType.TIME_DESC,
        note_type: SearchNoteType = SearchNoteType.FIXED_THREAD,
) -> List[TiebaNote]:
    """
    Search Tieba posts by keyword.

    Args:
        keyword: search keyword
        page: page number to fetch
        page_size: number of results per page
        sort: result ordering
        note_type: which kind of posts to search (threads only, or threads mixed with replies)

    Returns:
        The notes extracted from the search result page.
    """
    query = dict(
        isnew=1,
        qw=keyword,
        rn=page_size,
        pn=page,
        sm=sort.value,
        only_thread=note_type.value,
    )
    html_text = await self.get("/f/search/res", params=query, return_ori_content=True)
    return self._page_extractor.extract_search_note_list(html_text)
async def get_note_by_id(self, note_id: str) -> TiebaNote:
    """
    Fetch the ``/p/<note_id>`` page and extract the post detail from it.

    Args:
        note_id: post id

    Returns:
        The extracted post detail.
    """
    html_text = await self.get(f"/p/{note_id}", return_ori_content=True)
    return self._page_extractor.extract_note_detail(html_text)
async def get_note_all_comments(self, note_detail: TiebaNote, crawl_interval: float = 1.0,
                                callback: Optional[Callable] = None) -> List[TiebaComment]:
    """
    Walk every reply page of a post and collect its first-level comments.

    Sub-comments are fetched for each page as well (and passed to ``callback``), but they are
    not included in the returned list.

    Args:
        note_detail: post detail object; ``total_replay_page`` bounds the paging
        crawl_interval: delay in seconds between two page fetches
        callback: optional coroutine called with ``(note_id, comments)`` after each page

    Returns:
        All first-level comments collected across the visited pages.
    """
    uri = f"/p/{note_detail.note_id}"
    collected: List[TiebaComment] = []
    page_no = 1
    while page_no <= note_detail.total_replay_page:
        html_text = await self.get(uri, params={"pn": page_no}, return_ori_content=True)
        page_comments = self._page_extractor.extract_tieba_note_parment_comments(
            html_text, note_id=note_detail.note_id
        )
        if not page_comments:
            # An empty page means there is nothing further to read.
            break
        if callback:
            await callback(note_detail.note_id, page_comments)
        collected.extend(page_comments)
        # Fetch the sub-comments that hang below this page's comments.
        await self.get_comments_all_sub_comments(page_comments, crawl_interval=crawl_interval, callback=callback)
        await asyncio.sleep(crawl_interval)
        page_no += 1
    return collected
async def get_comments_all_sub_comments(self, comments: List[TiebaComment], crawl_interval: float = 1.0,
                                        callback: Optional[Callable] = None) -> List[TiebaComment]:
    """
    Fetch every sub-comment below the given first-level comments.

    Args:
        comments: first-level comments
        crawl_interval: delay in seconds between two page fetches
        callback: optional coroutine called with ``(note_id, sub_comments)`` after each page

    Returns:
        All collected sub-comments; an empty list when ``config.ENABLE_GET_SUB_COMMENTS`` is off.
    """
    if not config.ENABLE_GET_SUB_COMMENTS:
        return []

    uri = "/p/comment"
    # Page size the paging arithmetic is based on (the original code divided by 10).
    sub_page_size = 10
    # NOTE(review): a login-state check used to be sketched here but is disabled;
    # presumably sub-comments may need a logged-in session - confirm.

    all_sub_comments: List[TiebaComment] = []
    for parment_comment in comments:
        sub_comment_count = parment_comment.sub_comment_count
        if not sub_comment_count:
            # Covers 0 as well as a missing (None) count.
            continue

        # Ceiling division: exactly 10 sub-comments fit on one page, so no extra
        # request is made for an empty trailing page.
        max_sub_page_num = (sub_comment_count + sub_page_size - 1) // sub_page_size
        for current_page in range(1, max_sub_page_num + 1):
            params = {
                "tid": parment_comment.note_id,  # post id
                "pid": parment_comment.comment_id,  # parent comment id
                "fid": parment_comment.tieba_id,  # forum id
                "pn": current_page  # page number
            }
            page_content = await self.get(uri, params=params, return_ori_content=True)
            sub_comments = self._page_extractor.extract_tieba_note_sub_comments(page_content,
                                                                                parent_comment=parment_comment)
            if not sub_comments:
                break
            if callback:
                await callback(parment_comment.note_id, sub_comments)
            all_sub_comments.extend(sub_comments)
            await asyncio.sleep(crawl_interval)
    return all_sub_comments
async def get_notes_by_tieba_name(self, tieba_name: str, page_num: int) -> List[TiebaNote]:
    """
    Fetch one listing page of a forum and extract its posts.

    Args:
        tieba_name: forum name; url-encoded before it is sent, so names containing
            ``&``, ``#``, spaces or non-ASCII characters are transmitted intact
        page_num: value sent as the ``pn`` query parameter
            (NOTE(review): the crawler passes a post offset here, not a page index - confirm)

    Returns:
        The posts extracted from the listing page.
    """
    # Let ``get`` build the query string so the forum name is escaped properly.
    params = {"kw": tieba_name, "pn": page_num}
    page_content = await self.get("/f", params=params, return_ori_content=True)
    return self._page_extractor.extract_tieba_note_list(page_content)

View File

@ -0,0 +1,265 @@
import asyncio
import os
import random
from asyncio import Task
from typing import Dict, List, Optional, Tuple
from playwright.async_api import (BrowserContext, BrowserType, Page,
async_playwright)
import config
from base.base_crawler import AbstractCrawler
from model.m_baidu_tieba import TiebaNote
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
from store import tieba as tieba_store
from tools import utils
from tools.crawler_util import format_proxy_info
from var import crawler_type_var
from .client import BaiduTieBaClient
from .field import SearchNoteType, SearchSortType
from .login import BaiduTieBaLogin
class TieBaCrawler(AbstractCrawler):
context_page: Page
tieba_client: BaiduTieBaClient
browser_context: BrowserContext
def __init__(self) -> None:
    """Record the site entry URL and pick the user agent for this crawler."""
    site_root = "https://tieba.baidu.com"
    self.index_url = site_root
    self.user_agent = utils.get_user_agent()
async def start(self) -> None:
    """
    Entry point of the crawler: build the API client, then run the configured crawl mode.

    In "search" mode both the keyword search and the per-forum crawl are executed;
    in "detail" mode only the configured post ids are crawled.

    Returns:
        None
    """
    proxy_pool = None
    httpx_proxy = None
    if config.ENABLE_IP_PROXY:
        utils.logger.info("[BaiduTieBaCrawler.start] Begin create ip proxy pool ...")
        proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
        first_proxy: IpInfoModel = await proxy_pool.get_proxy()
        _, httpx_proxy = format_proxy_info(first_proxy)
        utils.logger.info(f"[BaiduTieBaCrawler.start] Init default ip proxy, value: {httpx_proxy}")

    # Client used for every request against the Tieba site.
    self.tieba_client = BaiduTieBaClient(ip_pool=proxy_pool, default_ip_proxy=httpx_proxy)

    crawler_type_var.set(config.CRAWLER_TYPE)
    if config.CRAWLER_TYPE == "search":
        # Keyword search first, then the posts of the configured forums.
        await self.search()
        await self.get_specified_tieba_notes()
    elif config.CRAWLER_TYPE == "detail":
        # Only the explicitly configured posts and their comments.
        await self.get_specified_notes()

    utils.logger.info("[BaiduTieBaCrawler.start] Tieba Crawler finished ...")
async def search(self) -> None:
    """
    Search posts for every configured keyword and crawl the posts (and comments) found.

    ``config.CRAWLER_MAX_NOTES_COUNT`` is raised to the page size when it is smaller.
    Pages before ``config.START_PAGE`` are skipped; paging for a keyword stops on an
    empty result page or on the first error.

    Returns:
        None
    """
    utils.logger.info("[BaiduTieBaCrawler.search] Begin search baidu tieba keywords")
    page_size = 10  # fixed number of results per search page
    config.CRAWLER_MAX_NOTES_COUNT = max(config.CRAWLER_MAX_NOTES_COUNT, page_size)
    first_page = config.START_PAGE

    for keyword in config.KEYWORDS.split(","):
        utils.logger.info(f"[BaiduTieBaCrawler.search] Current search keyword: {keyword}")
        page = 1
        while (page - first_page + 1) * page_size <= config.CRAWLER_MAX_NOTES_COUNT:
            if page < first_page:
                utils.logger.info(f"[BaiduTieBaCrawler.search] Skip page {page}")
                page += 1
                continue

            try:
                utils.logger.info(f"[BaiduTieBaCrawler.search] search tieba keyword: {keyword}, page: {page}")
                found_notes: List[TiebaNote] = await self.tieba_client.get_notes_by_keyword(
                    keyword=keyword,
                    page=page,
                    page_size=page_size,
                    sort=SearchSortType.TIME_DESC,
                    note_type=SearchNoteType.FIXED_THREAD
                )
                if not found_notes:
                    utils.logger.info(f"[BaiduTieBaCrawler.search] Search note list is empty")
                    break
                utils.logger.info(f"[BaiduTieBaCrawler.search] Note list len: {len(found_notes)}")
                found_ids = [found.note_id for found in found_notes]
                await self.get_specified_notes(note_id_list=found_ids)
                page += 1
            except Exception as ex:
                utils.logger.error(
                    f"[BaiduTieBaCrawler.search] Search keywords error, current page: {page}, current keyword: {keyword}, err: {ex}")
                break
async def get_specified_tieba_notes(self):
    """
    Crawl the posts (and their comments) of every forum named in ``config.TIEBA_NAME_LIST``.

    The listing is paged by post offset in steps of 50. Paging for a forum stops at the
    first empty page or once ``config.CRAWLER_MAX_NOTES_COUNT`` posts have been requested;
    that setting is raised to 50 when it is smaller.

    Returns:
        None
    """
    tieba_limit_count = 50
    if config.CRAWLER_MAX_NOTES_COUNT < tieba_limit_count:
        config.CRAWLER_MAX_NOTES_COUNT = tieba_limit_count
    for tieba_name in config.TIEBA_NAME_LIST:
        utils.logger.info(
            f"[BaiduTieBaCrawler.get_specified_tieba_notes] Begin get tieba name: {tieba_name}")
        page_number = 0
        # Strict "<": offset N starts at post N + 1, so an offset equal to the maximum
        # would fetch a whole page beyond the configured limit.
        while page_number < config.CRAWLER_MAX_NOTES_COUNT:
            note_list: List[TiebaNote] = await self.tieba_client.get_notes_by_tieba_name(
                tieba_name=tieba_name,
                page_num=page_number
            )
            if not note_list:
                utils.logger.info(
                    f"[BaiduTieBaCrawler.get_specified_tieba_notes] Get note list is empty")
                break

            utils.logger.info(
                f"[BaiduTieBaCrawler.get_specified_tieba_notes] tieba name: {tieba_name} note list len: {len(note_list)}")
            await self.get_specified_notes([note.note_id for note in note_list])
            page_number += tieba_limit_count
async def get_specified_notes(self, note_id_list: Optional[List[str]] = None):
    """
    Fetch the detail of each given post, store it, then crawl the comments of those found.

    Args:
        note_id_list: post ids to crawl. When omitted (None) the current value of
            ``config.TIEBA_SPECIFIED_ID_LIST`` is read at call time, so configuration
            changed after import is honoured.

    Returns:
        None
    """
    if note_id_list is None:
        note_id_list = config.TIEBA_SPECIFIED_ID_LIST

    semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
    task_list = [
        self.get_note_detail_async_task(note_id=note_id, semaphore=semaphore) for note_id in note_id_list
    ]
    note_details = await asyncio.gather(*task_list)

    note_details_model: List[TiebaNote] = []
    for note_detail in note_details:
        # Failed lookups come back as None and are dropped.
        if note_detail is not None:
            note_details_model.append(note_detail)
            await tieba_store.update_tieba_note(note_detail)
    await self.batch_get_note_comments(note_details_model)
async def get_note_detail_async_task(self, note_id: str, semaphore: asyncio.Semaphore) -> Optional[TiebaNote]:
    """
    Fetch one post detail while holding the concurrency semaphore.

    Args:
        note_id: baidu tieba note id
        semaphore: asyncio semaphore limiting concurrent fetches

    Returns:
        The post detail, or None when it could not be fetched (every error is logged
        and swallowed so one bad post does not abort the batch).
    """
    async with semaphore:
        try:
            utils.logger.info(f"[BaiduTieBaCrawler.get_note_detail] Begin get note detail, note_id: {note_id}")
            note_detail: TiebaNote = await self.tieba_client.get_note_by_id(note_id)
            if not note_detail:
                utils.logger.error(
                    f"[BaiduTieBaCrawler.get_note_detail] Get note detail error, note_id: {note_id}")
                return None
            return note_detail
        except KeyError as ex:
            # Must come before the generic handler below, otherwise it can never run.
            utils.logger.error(
                f"[BaiduTieBaCrawler.get_note_detail] have not fund note detail note_id:{note_id}, err: {ex}")
            return None
        except Exception as ex:
            utils.logger.error(f"[BaiduTieBaCrawler.get_note_detail] Get note detail error: {ex}")
            return None
async def batch_get_note_comments(self, note_detail_list: List[TiebaNote]):
    """
    Crawl the comments of several posts concurrently, one task per post.

    Does nothing when ``config.ENABLE_GET_COMMENTS`` is off.

    Args:
        note_detail_list: posts whose comments should be fetched

    Returns:
        None
    """
    if not config.ENABLE_GET_COMMENTS:
        return

    limiter = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
    # Each task is named after its post id.
    pending: List[Task] = [
        asyncio.create_task(self.get_comments_async_task(detail, limiter), name=detail.note_id)
        for detail in note_detail_list
    ]
    await asyncio.gather(*pending)
async def get_comments_async_task(self, note_detail: TiebaNote, semaphore: asyncio.Semaphore):
    """
    Crawl all comments of one post while holding the concurrency semaphore.

    Args:
        note_detail: post whose comments are fetched
        semaphore: asyncio semaphore limiting concurrent crawls

    Returns:
        None
    """
    async with semaphore:
        utils.logger.info(f"[BaiduTieBaCrawler.get_comments] Begin get note id comments {note_detail.note_id}")
        # A random delay below one second is used between page fetches.
        pause = random.random()
        await self.tieba_client.get_note_all_comments(
            note_detail=note_detail,
            crawl_interval=pause,
            callback=tieba_store.batch_update_tieba_note_comments,
        )
async def launch_browser(
        self,
        chromium: BrowserType,
        playwright_proxy: Optional[Dict],
        user_agent: Optional[str],
        headless: bool = True
) -> BrowserContext:
    """
    Launch a browser and return a 1920x1080 browser context.

    With ``config.SAVE_LOGIN_STATE`` enabled a persistent context backed by a directory
    under ``./browser_data`` is used, so the login state is kept between runs; otherwise
    a fresh browser and context are created.

    Args:
        chromium: playwright browser type used to launch
        playwright_proxy: proxy settings handed to playwright, or None
        user_agent: user agent string for the context
        headless: whether to run without a visible window

    Returns:
        The created browser context.
    """
    utils.logger.info("[BaiduTieBaCrawler.launch_browser] Begin create browser context ...")
    window_size = {"width": 1920, "height": 1080}

    if not config.SAVE_LOGIN_STATE:
        browser = await chromium.launch(headless=headless, proxy=playwright_proxy)  # type: ignore
        return await browser.new_context(
            viewport=window_size,
            user_agent=user_agent
        )

    # feat issue #14: keep the login state on disk to avoid logging in every time
    profile_dir = os.path.join(os.getcwd(), "browser_data",
                               config.USER_DATA_DIR % config.PLATFORM)  # type: ignore
    return await chromium.launch_persistent_context(
        user_data_dir=profile_dir,
        accept_downloads=True,
        headless=headless,
        proxy=playwright_proxy,  # type: ignore
        viewport=window_size,
        user_agent=user_agent
    )
async def close(self):
    """
    Close ``self.browser_context`` and log that it was closed.

    Returns:
        None
    """
    context = self.browser_context
    await context.close()
    utils.logger.info("[BaiduTieBaCrawler.close] Browser context closed ...")

View File

@ -0,0 +1,18 @@
from enum import Enum
class SearchSortType(Enum):
    """search sort type"""
    TIME_DESC = "1"  # newest first
    TIME_ASC = "0"  # oldest first
    RELEVANCE_ORDER = "2"  # ordered by relevance
class SearchNoteType(Enum):
    MAIN_THREAD = "1"  # main threads only
    FIXED_THREAD = "0"  # mixed mode (threads + replies)

View File

@ -0,0 +1,301 @@
# -*- coding: utf-8 -*-
import html
import json
import re
from typing import Dict, List, Tuple
from parsel import Selector
from constant import baidu_tieba as const
from model.m_baidu_tieba import TiebaComment, TiebaNote
from tools import utils
class TieBaExtractor:
def __init__(self):
    """The extractor keeps no state; there is nothing to initialise."""
    return None
@staticmethod
def extract_search_note_list(page_content: str) -> List[TiebaNote]:
    """
    Extract the posts listed on a keyword-search result page.

    The search page carries no reply count / reply page count, so those fields are not set.

    Args:
        page_content: HTML text of the search result page

    Returns:
        One TiebaNote per ``s_post`` block on the page.
    """

    def first(node, query: str) -> str:
        # First xpath match as a string, '' when nothing matches.
        return node.xpath(query).get(default='')

    title_anchor = ".//span[@class='p_title']/a"
    author_anchor = ".//a[starts-with(@href, '/home/main')]"
    forum_anchor = ".//a[@class='p_forum']"

    notes: List[TiebaNote] = []
    for post in Selector(text=page_content).xpath("//div[@class='s_post']"):
        notes.append(TiebaNote(
            note_id=first(post, f"{title_anchor}/@data-tid").strip(),
            title=first(post, f"{title_anchor}/text()").strip(),
            desc=first(post, ".//div[@class='p_content']/text()").strip(),
            note_url=const.TIEBA_URL + first(post, f"{title_anchor}/@href"),
            user_nickname=first(post, f"{author_anchor}/font/text()").strip(),
            user_link=const.TIEBA_URL + first(post, f"{author_anchor}/@href"),
            tieba_name=first(post, f"{forum_anchor}/font/text()").strip(),
            tieba_link=const.TIEBA_URL + first(post, f"{forum_anchor}/@href"),
            publish_time=first(post, ".//font[@class='p_green p_date']/text()").strip(),
        ))
    return notes
def extract_tieba_note_list(self, page_content: str) -> List[TiebaNote]:
    """
    Extract the posts listed on a forum page.

    Args:
        page_content: HTML text of the forum listing page

    Returns:
        One TiebaNote per ``thread_list`` entry that carries a non-empty ``data-field``.
    """
    # Every "<!--" opener is removed before parsing, so markup that sits inside
    # HTML comments becomes visible to the selector.
    page_content = page_content.replace('<!--', "")
    content_selector = Selector(text=page_content)

    # Forum name/link belong to the page, not to a row: look them up once.
    tieba_name = content_selector.xpath("//a[@class='card_title_fname']/text()").get(default='').strip()
    tieba_link = const.TIEBA_URL + content_selector.xpath("//a[@class='card_title_fname']/@href").get(
        default='')

    result: List[TiebaNote] = []
    for post_selector in content_selector.xpath("//ul[@id='thread_list']/li"):
        post_field_value: Dict = self.extract_data_field_value(post_selector)
        if not post_field_value:
            continue
        note_id = str(post_field_value.get("id"))
        # "authoer_nickname" is the spelling the original code looked up; the correctly
        # spelled key is tried first and the old one is kept as a fallback.
        user_nickname = (post_field_value.get("author_nickname")
                         or post_field_value.get("authoer_nickname")
                         or post_field_value.get("author_name"))
        tieba_note = TiebaNote(
            note_id=note_id,
            title=post_selector.xpath(".//a[@class='j_th_tit ']/text()").get(default='').strip(),
            desc=post_selector.xpath(".//div[@class='threadlist_abs threadlist_abs_onlyline ']/text()").get(
                default='').strip(),
            note_url=const.TIEBA_URL + f"/p/{note_id}",
            user_link=const.TIEBA_URL + post_selector.xpath(
                ".//a[@class='frs-author-name j_user_card ']/@href").get(default='').strip(),
            user_nickname=user_nickname,
            tieba_name=tieba_name,
            tieba_link=tieba_link,
            total_replay_num=post_field_value.get("reply_num", 0)
        )
        result.append(tieba_note)
    return result
def extract_note_detail(self, page_content: str) -> TiebaNote:
    """
    Extract the post detail from a post page.

    Args:
        page_content: HTML text of the post page

    Returns:
        The TiebaNote built from the page; the forum-name suffix is removed from its title.
    """
    page = Selector(text=page_content)

    def text_of(node, query: str) -> str:
        # First xpath match, stripped; '' when nothing matches.
        return node.xpath(query).get(default='').strip()

    first_floor = page.xpath("//div[@class='p_postlist'][1]")
    # The note id is the last path segment of the "only view author" link.
    lz_only_href = text_of(page, "//*[@id='lzonly_cntn']/@href")
    note_id = lz_only_href.split("?")[0].split("/")[-1]
    # Two counters: reply count first, reply page count second.
    reply_counters = page.xpath(
        "//div[@id='thread_theme_5']//li[@class='l_reply_num']//span[@class='red']"
    )
    # IP location and publish time are parsed out of the post tail markup.
    tail_html = page.xpath(".//div[@class='post-tail-wrap']").get(default="").strip()
    ip_location, publish_time = self.extract_ip_and_pub_time(tail_html)

    face_anchor = ".//a[@class='p_author_face ']"
    note = TiebaNote(
        note_id=note_id,
        title=text_of(page, "//title/text()"),
        desc=text_of(page, "//meta[@name='description']/@content"),
        note_url=const.TIEBA_URL + f"/p/{note_id}",
        user_link=const.TIEBA_URL + text_of(first_floor, f"{face_anchor}/@href"),
        user_nickname=text_of(first_floor, ".//a[@class='p_author_name j_user_card']/text()"),
        user_avatar=text_of(first_floor, f"{face_anchor}/img/@src"),
        tieba_name=text_of(page, "//a[@class='card_title_fname']/text()"),
        tieba_link=const.TIEBA_URL + page.xpath("//a[@class='card_title_fname']/@href").get(default=''),
        ip_location=ip_location,
        publish_time=publish_time,
        total_replay_num=text_of(reply_counters[0], "./text()"),
        total_replay_page=text_of(reply_counters[1], "./text()"),
    )
    title_suffix = f"{note.tieba_name}】_百度贴吧"
    note.title = note.title.replace(title_suffix, "")
    return note
def extract_tieba_note_parment_comments(self, page_content: str, note_id: str) -> List[TiebaComment]:
    """
    Extract the first-level comments of a post page.

    Args:
        page_content: HTML text of one reply page of the post
        note_id: id of the post the page belongs to

    Returns:
        One TiebaComment per comment block whose ``data-field`` holds a ``content`` object;
        blocks without usable data are skipped.
    """
    page_selector = Selector(text=page_content)
    # The forum name is page-level information, so it is read once instead of per comment.
    tieba_name = page_selector.xpath("//a[@class='card_title_fname']/text()").get(default='').strip()
    note_url = const.TIEBA_URL + f"/p/{note_id}"

    result: List[TiebaComment] = []
    for comment_selector in page_selector.xpath("//div[@class='l_post l_post_bright j_l_post clearfix ']"):
        comment_field_value: Dict = self.extract_data_field_value(comment_selector)
        content_info = comment_field_value.get("content") if comment_field_value else None
        if not isinstance(content_info, dict):
            # Without the "content" object there is no id or text to store.
            continue

        other_info_content = comment_selector.xpath(".//div[@class='post-tail-wrap']").get(default="").strip()
        ip_location, publish_time = self.extract_ip_and_pub_time(other_info_content)
        tieba_comment = TiebaComment(
            comment_id=str(content_info.get("post_id")),
            sub_comment_count=content_info.get("comment_num"),
            content=utils.extract_text_from_html(content_info.get("content")),
            note_url=note_url,
            user_link=const.TIEBA_URL + comment_selector.xpath(".//a[@class='p_author_face ']/@href").get(
                default='').strip(),
            user_nickname=comment_selector.xpath(".//a[@class='p_author_name j_user_card']/text()").get(
                default='').strip(),
            user_avatar=comment_selector.xpath(".//a[@class='p_author_face ']/img/@src").get(
                default='').strip(),
            tieba_id=str(content_info.get("forum_id", "")),
            tieba_name=tieba_name,
            tieba_link=f"https://tieba.baidu.com/f?kw={tieba_name}",
            ip_location=ip_location,
            publish_time=publish_time,
            note_id=note_id,
        )
        result.append(tieba_comment)
    return result
def extract_tieba_note_sub_comments(self, page_content: str, parent_comment: TiebaComment) -> List[TiebaComment]:
    """
    Extract the second-level comments (replies to a comment) from a sub-comment page.

    Args:
        page_content: HTML text of the sub-comment page
        parent_comment: first-level comment the replies belong to; its note and forum
            fields are copied onto every reply

    Returns:
        One TiebaComment per reply element that carries a non-empty ``data-field``.
    """
    selector = Selector(page_content)
    comments = []
    comment_ele_list = selector.xpath("//li[@class='lzl_single_post j_lzl_s_p first_no_border']")
    comment_ele_list.extend(selector.xpath("//li[@class='lzl_single_post j_lzl_s_p ']"))
    for comment_ele in comment_ele_list:
        comment_value = self.extract_data_field_value(comment_ele)
        if not comment_value:
            continue

        # Keep at most the first user anchor. Slicing (rather than [0]) leaves an empty
        # selection when the anchor is missing, so link and avatar fall back to "".
        user_anchor = comment_ele.xpath("./a[@class='j_user_card lzl_p_p']")[:1]
        user_link = user_anchor.xpath("./@href").get(default="")
        if user_link.startswith("/") and not user_link.startswith("//"):
            # Site-relative link: make it absolute like the other extractors do.
            user_link = const.TIEBA_URL + user_link

        content = utils.extract_text_from_html(
            comment_ele.xpath(".//span[@class='lzl_content_main']").get(default=""))
        comment = TiebaComment(
            comment_id=str(comment_value.get("spid")),
            content=content,
            user_link=user_link,
            user_nickname=comment_value.get("showname"),
            user_avatar=user_anchor.xpath("./img/@src").get(default=""),
            publish_time=comment_ele.xpath(".//span[@class='lzl_time']/text()").get(default="").strip(),
            parent_comment_id=parent_comment.comment_id,
            note_id=parent_comment.note_id,
            note_url=parent_comment.note_url,
            tieba_id=parent_comment.tieba_id,
            tieba_name=parent_comment.tieba_name,
            tieba_link=parent_comment.tieba_link
        )
        comments.append(comment)

    return comments
@staticmethod
def extract_ip_and_pub_time(html_content: str) -> Tuple[str, str]:
    """
    Pull the IP location and the publish time out of a post-tail HTML fragment.

    Args:
        html_content: HTML text of the post tail

    Returns:
        ``(ip_location, publish_time)``; each element is '' when it is not found.
    """
    # The location may not contain "<" or whitespace, so the match can never run
    # across a closing tag into following markup.
    ip_match = re.search(r'IP属地:([^<\s]+)</span>', html_content)
    time_match = re.search(
        r'<span class="tail-info">(\d{4}-\d{2}-\d{2} \d{2}:\d{2})</span>', html_content
    )
    ip = ip_match.group(1) if ip_match else ""
    pub_time = time_match.group(1) if time_match else ""
    return ip, pub_time
@staticmethod
def extract_data_field_value(selector: Selector) -> Dict:
    """
    Read the element's ``data-field`` attribute and decode it as JSON.

    Args:
        selector: selector positioned on the element that carries ``data-field``

    Returns:
        The decoded dict; ``{}`` when the attribute is missing, empty, not valid JSON,
        or decodes to something other than a dict.
    """
    data_field_value = selector.xpath("./@data-field").get(default='').strip()
    if not data_field_value or data_field_value == "{}":
        return {}
    try:
        # Undo HTML entity escaping first, then parse the JSON text.
        data_field_dict_value = json.loads(html.unescape(data_field_value))
    except ValueError as ex:
        # json.JSONDecodeError is a ValueError. Report through the project logger,
        # as the rest of the crawler does, instead of printing to stdout.
        utils.logger.error(f"extract_data_field_value错误信息{ex}, 尝试使用其他方式解析")
        return {}
    # Callers use dict methods on the result, so anything else counts as "no data".
    return data_field_dict_value if isinstance(data_field_dict_value, dict) else {}
def test_extract_search_note_list():
    """Manual check: parse the saved search-result page and print the extracted notes."""
    with open("test_data/search_keyword_notes.html", "r", encoding="utf-8") as fixture:
        html_text = fixture.read()
    print(TieBaExtractor().extract_search_note_list(html_text))
def test_extract_note_detail():
    """Manual check: parse the saved post page and print the extracted detail as a dict."""
    with open("test_data/note_detail.html", "r", encoding="utf-8") as fixture:
        html_text = fixture.read()
    note = TieBaExtractor().extract_note_detail(html_text)
    print(note.model_dump())
def test_extract_tieba_note_parment_comments():
    """Manual check: parse the saved comment page and print its first-level comments."""
    with open("test_data/note_comments.html", "r", encoding="utf-8") as fixture:
        html_text = fixture.read()
    print(TieBaExtractor().extract_tieba_note_parment_comments(html_text, "123456"))
def test_extract_tieba_note_sub_comments():
    """Manual check: parse the saved sub-comment page against a placeholder parent comment."""
    with open("test_data/note_sub_comments.html", "r", encoding="utf-8") as fixture:
        html_text = fixture.read()

    # Placeholder parent; its note/forum fields are copied onto each extracted reply.
    parent_fields = dict(
        comment_id="123456",
        content="content",
        user_link="user_link",
        user_nickname="user_nickname",
        user_avatar="user_avatar",
        publish_time="publish_time",
        parent_comment_id="parent_comment_id",
        note_id="note_id",
        note_url="note_url",
        tieba_id="tieba_id",
        tieba_name="tieba_name",
    )
    fake_parent = TiebaComment(**parent_fields)
    print(TieBaExtractor().extract_tieba_note_sub_comments(html_text, fake_parent))
def test_extract_tieba_note_list():
    """Manual check: parse the saved forum listing page and print the extracted notes."""
    with open("test_data/tieba_note_list.html", "r", encoding="utf-8") as fixture:
        html_text = fixture.read()
    print(TieBaExtractor().extract_tieba_note_list(html_text))
if __name__ == '__main__':
    # Manual smoke run: enable the check you want to look at.
    # test_extract_search_note_list()
    # test_extract_note_detail()
    # test_extract_tieba_note_parment_comments()
    test_extract_tieba_note_list()

View File

@ -0,0 +1,112 @@
import asyncio
import functools
import sys
from typing import Optional
from playwright.async_api import BrowserContext, Page
from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
wait_fixed)
import config
from base.base_crawler import AbstractLogin
from tools import utils
class BaiduTieBaLogin(AbstractLogin):
    """Login flow for Baidu Tieba driven through a Playwright browser context.

    Supports QR-code login and cookie injection; phone login is a no-op
    placeholder. The login type passed to the constructor is written to
    ``config.LOGIN_TYPE`` and later read back by :meth:`begin`.
    """

    # Polling budget for check_login_state. The same values feed the retry
    # decorator and the "remaining time" log message so they cannot drift apart.
    _LOGIN_CHECK_ATTEMPTS = 600
    _LOGIN_CHECK_INTERVAL_SECONDS = 1

    def __init__(self,
                 login_type: str,
                 browser_context: BrowserContext,
                 context_page: Page,
                 login_phone: Optional[str] = "",
                 cookie_str: str = ""
                 ):
        """Store the browser handles and credentials used by the login methods.

        Args:
            login_type: One of "qrcode", "phone" or "cookie"; stored globally
                in ``config.LOGIN_TYPE``.
            browser_context: Playwright browser context whose cookies are read
                and written.
            context_page: Playwright page on which the QR code is looked up.
            login_phone: Phone number kept for phone login (currently unused
                by this class).
            cookie_str: Cookie string used by :meth:`login_by_cookies`.
        """
        config.LOGIN_TYPE = login_type
        self.browser_context = browser_context
        self.context_page = context_page
        self.login_phone = login_phone
        self.cookie_str = cookie_str

    @retry(stop=stop_after_attempt(_LOGIN_CHECK_ATTEMPTS),
           wait=wait_fixed(_LOGIN_CHECK_INTERVAL_SECONDS),
           retry=retry_if_result(lambda value: value is False))
    async def check_login_state(self) -> bool:
        """Poll the browser cookies until a login token shows up.

        The retry decorator re-invokes this method while it returns False,
        waiting a fixed interval between attempts; once the attempt budget is
        exhausted tenacity raises ``RetryError``.

        Returns:
            True when the ``STOKEN`` or ``PTOKEN`` cookie is present and
            non-empty, otherwise False.
        """
        current_cookie = await self.browser_context.cookies()
        _, cookie_dict = utils.convert_cookies(current_cookie)
        stoken = cookie_dict.get("STOKEN")
        ptoken = cookie_dict.get("PTOKEN")
        if stoken or ptoken:
            return True
        return False

    async def begin(self):
        """Start the login flow selected by ``config.LOGIN_TYPE``.

        Raises:
            ValueError: If the login type is not "qrcode", "phone" or "cookie".
        """
        utils.logger.info("[BaiduTieBaLogin.begin] Begin login baidutieba ...")
        if config.LOGIN_TYPE == "qrcode":
            await self.login_by_qrcode()
        elif config.LOGIN_TYPE == "phone":
            await self.login_by_mobile()
        elif config.LOGIN_TYPE == "cookie":
            await self.login_by_cookies()
        else:
            raise ValueError("[BaiduTieBaLogin.begin]Invalid Login Type Currently only supported qrcode or phone or cookies ...")

    async def login_by_mobile(self):
        """Login baidutieba by mobile (not implemented; does nothing)."""
        pass

    async def login_by_qrcode(self):
        """Log in by QR code and keep the logged-in state in the browser context.

        Looks for the QR-code image on the page; if it is missing, clicks the
        login button once and looks again. The QR code is then displayed from
        an executor thread while the cookies are polled for a login token.
        The process exits via ``sys.exit()`` when no QR code can be found or
        when polling runs out of attempts.
        """
        utils.logger.info("[BaiduTieBaLogin.login_by_qrcode] Begin login baidutieba by qrcode ...")
        qrcode_img_selector = "xpath=//img[@class='tang-pass-qrcode-img']"
        # find login qrcode
        base64_qrcode_img = await utils.find_login_qrcode(
            self.context_page,
            selector=qrcode_img_selector
        )
        if not base64_qrcode_img:
            # Not a failure yet: the login dialog may simply not have popped up
            # on its own, so open it manually and look for the QR code again.
            utils.logger.info("[BaiduTieBaLogin.login_by_qrcode] qrcode not found, try to click the login button and search again ...")
            await asyncio.sleep(0.5)
            login_button_ele = self.context_page.locator("xpath=//li[@class='u_login']")
            await login_button_ele.click()
            base64_qrcode_img = await utils.find_login_qrcode(
                self.context_page,
                selector=qrcode_img_selector
            )
            if not base64_qrcode_img:
                utils.logger.info("[BaiduTieBaLogin.login_by_qrcode] login failed , have not found qrcode please check ....")
                sys.exit()

        # show login qrcode
        # fix issue #12
        # we need to use partial function to call show_qrcode function and run in executor
        # then current asyncio event loop will not be blocked
        partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
        asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)

        # Report the wait that check_login_state really allows instead of a
        # hard-coded figure that disagrees with the retry settings.
        max_wait_seconds = self._LOGIN_CHECK_ATTEMPTS * self._LOGIN_CHECK_INTERVAL_SECONDS
        utils.logger.info(f"[BaiduTieBaLogin.login_by_qrcode] waiting for scan code login, remaining time is {max_wait_seconds}s")
        try:
            await self.check_login_state()
        except RetryError:
            utils.logger.info("[BaiduTieBaLogin.login_by_qrcode] Login baidutieba failed by qrcode login method ...")
            sys.exit()

        wait_redirect_seconds = 5
        utils.logger.info(f"[BaiduTieBaLogin.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
        await asyncio.sleep(wait_redirect_seconds)

    async def login_by_cookies(self):
        """Log in by injecting the cookies from ``self.cookie_str``.

        Each key/value pair parsed from the cookie string is added to the
        browser context for domain ``.baidu.com`` and path ``/``.
        """
        utils.logger.info("[BaiduTieBaLogin.login_by_cookies] Begin login baidutieba by cookie ...")
        for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
            await self.browser_context.add_cookies([{
                'name': key,
                'value': value,
                'domain': ".baidu.com",
                'path': "/"
            }])

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,189 @@
<li class="lzl_single_post j_lzl_s_p first_no_border" data-field='{&quot;spid&quot;:150726504693,&quot;showname&quot;:&quot;heinzfrentzen&quot;,&quot;user_name&quot;:&quot;heinzfrentzen&quot;,&quot;portrait&quot;:&quot;tb.1.b08d8f12.IR-tbLlZ2GkD6ARA-mfGOA&quot;}'>
<a rel="noopener" name="150726504693"></a>
<a rel="noopener" data-field='{&quot;un&quot;:&quot;heinzfrentzen&quot;,&quot;id&quot;:&quot;tb.1.b08d8f12.IR-tbLlZ2GkD6ARA-mfGOA&quot;}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.b08d8f12.IR-tbLlZ2GkD6ARA-mfGOA&fr=pb" username="heinzfrentzen">
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.b08d8f12.IR-tbLlZ2GkD6ARA-mfGOA"/>
</a>
<div class="lzl_cnt" data-field='{&quot;iconArr&quot;:[],&quot;free_flag&quot;:null}'>
<a rel="noopener" class="at j_user_card " data-field='{&quot;un&quot;:&quot;heinzfrentzen&quot;,&quot;id&quot;:&quot;tb.1.b08d8f12.IR-tbLlZ2GkD6ARA-mfGOA&quot;}' href="/home/main?id=tb.1.b08d8f12.IR-tbLlZ2GkD6ARA-mfGOA&ie=utf-8&fr=pb" target="_blank" username="heinzfrentzen">heinzfrentzen</a>
:
<span class="lzl_content_main" data-username="">
<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon25.png">
<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon25.png">
</span>
<div class="lzl_content_reply">
<span class="lzl_jb" style="display:none;"></span>
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
<span class="lzl_time">2024-8-6 22:11</span>
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
</div>
</div>
</li>
<li class="lzl_single_post j_lzl_s_p " data-field='{&quot;spid&quot;:150726506822,&quot;showname&quot;:&quot;\u53ef\u7231\u7684\u642c\u8fd0\u5de594&quot;,&quot;user_name&quot;:&quot;\u53ef\u7231\u7684\u642c\u8fd0\u5de594&quot;,&quot;portrait&quot;:&quot;tb.1.f1b47a84.Rixjf6fMP-PfH8fnS1CgRA&quot;}'>
<a rel="noopener" name="150726506822"></a>
<a rel="noopener" data-field='{&quot;un&quot;:&quot;\u53ef\u7231\u7684\u642c\u8fd0\u5de594&quot;,&quot;id&quot;:&quot;tb.1.f1b47a84.Rixjf6fMP-PfH8fnS1CgRA&quot;}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.f1b47a84.Rixjf6fMP-PfH8fnS1CgRA&fr=pb" username="可爱的搬运工94">
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.f1b47a84.Rixjf6fMP-PfH8fnS1CgRA"/>
</a>
<div class="lzl_cnt" data-field='{&quot;iconArr&quot;:[],&quot;free_flag&quot;:null}'>
<a rel="noopener" class="at j_user_card " data-field='{&quot;un&quot;:&quot;\u53ef\u7231\u7684\u642c\u8fd0\u5de594&quot;,&quot;id&quot;:&quot;tb.1.f1b47a84.Rixjf6fMP-PfH8fnS1CgRA&quot;}' href="/home/main?id=tb.1.f1b47a84.Rixjf6fMP-PfH8fnS1CgRA&ie=utf-8&fr=pb" target="_blank" username="可爱的搬运工94">可爱的搬运工94</a>
:<span class="lzl_content_main" data-username="">陈芋汐水花也不小 </span>
<div class="lzl_content_reply">
<span class="lzl_jb" style="display:none;"></span>
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
<span class="lzl_time">2024-8-6 22:12</span>
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
</div>
</div>
</li>
<li class="lzl_single_post j_lzl_s_p " data-field='{&quot;spid&quot;:150726508024,&quot;showname&quot;:&quot;\u56fd\u9645\u4f53\u575b\u5de8\u661f\u9752\u6912\u8089\u4e1d&quot;,&quot;user_name&quot;:&quot;\u8682\u8681\u96c5\u864e\u54c8\u54c8&quot;,&quot;portrait&quot;:&quot;tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg&quot;}'>
<a rel="noopener" name="150726508024"></a>
<a rel="noopener" data-field='{&quot;un&quot;:&quot;\u8682\u8681\u96c5\u864e\u54c8\u54c8&quot;,&quot;id&quot;:&quot;tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg&quot;}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg&fr=pb" username="蚂蚁雅虎哈哈">
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg"/>
</a>
<div class="lzl_cnt" data-field='{&quot;iconArr&quot;:[],&quot;free_flag&quot;:null}'>
<a rel="noopener" class="at j_user_card " data-field='{&quot;un&quot;:&quot;\u8682\u8681\u96c5\u864e\u54c8\u54c8&quot;,&quot;id&quot;:&quot;tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg&quot;}' href="/home/main?id=tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg&ie=utf-8&fr=pb" target="_blank" username="蚂蚁雅虎哈哈">国际体坛巨星青椒肉丝</a>
:<span class="lzl_content_main" data-username="">你怀孕了吗 老是呕吐 </span>
<div class="lzl_content_reply">
<span class="lzl_jb" style="display:none;"></span>
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
<span class="lzl_time">2024-8-6 22:12</span>
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
</div>
</div>
</li>
<li class="lzl_single_post j_lzl_s_p " data-field='{&quot;spid&quot;:150726509762,&quot;showname&quot;:&quot;\u8317\u82b1\u5c11\u5e05&quot;,&quot;user_name&quot;:&quot;\u8317\u82b1\u5c11\u5e05&quot;,&quot;portrait&quot;:&quot;tb.1.a0b6ca3c.54TCKizU2c9oSYWqNF7NqA&quot;}'>
<a rel="noopener" name="150726509762"></a>
<a rel="noopener" data-field='{&quot;un&quot;:&quot;\u8317\u82b1\u5c11\u5e05&quot;,&quot;id&quot;:&quot;tb.1.a0b6ca3c.54TCKizU2c9oSYWqNF7NqA&quot;}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.a0b6ca3c.54TCKizU2c9oSYWqNF7NqA&fr=pb" username="茗花少帅">
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.a0b6ca3c.54TCKizU2c9oSYWqNF7NqA"/>
</a>
<div class="lzl_cnt" data-field='{&quot;iconArr&quot;:{&quot;all_level&quot;:{&quot;2&quot;:{&quot;end_time&quot;:&quot;1421248220&quot;,&quot;level&quot;:2,&quot;pic_url&quot;:&quot;http:\/\/imgsrc.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg&quot;,&quot;score_limit&quot;:8000}},&quot;level&quot;:{&quot;end_time&quot;:&quot;1421248220&quot;,&quot;pic_url&quot;:&quot;http:\/\/imgsrc.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg&quot;,&quot;props_id&quot;:2}},&quot;free_flag&quot;:null}'>
<a rel="noopener" class="at j_user_card " data-field='{&quot;un&quot;:&quot;\u8317\u82b1\u5c11\u5e05&quot;,&quot;id&quot;:&quot;tb.1.a0b6ca3c.54TCKizU2c9oSYWqNF7NqA&quot;}' href="/home/main?id=tb.1.a0b6ca3c.54TCKizU2c9oSYWqNF7NqA&ie=utf-8&fr=pb" target="_blank" username="茗花少帅">茗花少帅</a>
:<span class="lzl_content_main" data-username="">你就只看水花不看空中姿态吗 </span>
<div class="lzl_content_reply">
<span class="lzl_jb" style="display:none;"></span>
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
<span class="lzl_time">2024-8-6 22:12</span>
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
</div>
</div>
</li>
<li class="lzl_single_post j_lzl_s_p " data-field='{&quot;spid&quot;:150726510645,&quot;showname&quot;:&quot;\u4e1c\u534e\u6b66\u5170&quot;,&quot;user_name&quot;:&quot;\u897f\u5b89\u4ea4\u5927\u524d\u4e00\u767e&quot;,&quot;portrait&quot;:&quot;tb.1.774316af.RqsfwTN2w3AJQFmXAO_MHw&quot;}'>
<a rel="noopener" name="150726510645"></a>
<a rel="noopener" data-field='{&quot;un&quot;:&quot;\u897f\u5b89\u4ea4\u5927\u524d\u4e00\u767e&quot;,&quot;id&quot;:&quot;tb.1.774316af.RqsfwTN2w3AJQFmXAO_MHw&quot;}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.774316af.RqsfwTN2w3AJQFmXAO_MHw&fr=pb" username="西安交大前一百">
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.774316af.RqsfwTN2w3AJQFmXAO_MHw"/>
</a>
<div class="lzl_cnt" data-field='{&quot;iconArr&quot;:{&quot;all_level&quot;:{&quot;2&quot;:{&quot;end_time&quot;:&quot;1644033630&quot;,&quot;level&quot;:2,&quot;pic_url&quot;:&quot;http:\/\/imgsrc.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg&quot;,&quot;score_limit&quot;:8000}},&quot;level&quot;:{&quot;end_time&quot;:&quot;1644033630&quot;,&quot;pic_url&quot;:&quot;http:\/\/imgsrc.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg&quot;,&quot;props_id&quot;:2}},&quot;free_flag&quot;:null}'>
<a rel="noopener" class="at j_user_card " data-field='{&quot;un&quot;:&quot;\u897f\u5b89\u4ea4\u5927\u524d\u4e00\u767e&quot;,&quot;id&quot;:&quot;tb.1.774316af.RqsfwTN2w3AJQFmXAO_MHw&quot;}' href="/home/main?id=tb.1.774316af.RqsfwTN2w3AJQFmXAO_MHw&ie=utf-8&fr=pb" target="_blank" username="西安交大前一百">东华武兰</a>
:<span class="lzl_content_main" data-username="">经典只看水花 </span>
<div class="lzl_content_reply">
<span class="lzl_jb" style="display:none;"></span>
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
<span class="lzl_time">2024-8-6 22:12</span>
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
</div>
</div>
</li>
<li class="lzl_single_post j_lzl_s_p " data-field='{&quot;spid&quot;:150726514057,&quot;showname&quot;:&quot;\u4e0a\u4e0b\u73ed\u8981\u6ce8\u610f&quot;,&quot;user_name&quot;:&quot;\u4e0a\u4e0b\u73ed\u8981\u6ce8\u610f&quot;,&quot;portrait&quot;:&quot;tb.1.bcab9641.aHxSViAprkm6E0KQWrw3pg&quot;}'>
<a rel="noopener" name="150726514057"></a>
<a rel="noopener" data-field='{&quot;un&quot;:&quot;\u4e0a\u4e0b\u73ed\u8981\u6ce8\u610f&quot;,&quot;id&quot;:&quot;tb.1.bcab9641.aHxSViAprkm6E0KQWrw3pg&quot;}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.bcab9641.aHxSViAprkm6E0KQWrw3pg&fr=pb" username="上下班要注意">
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.bcab9641.aHxSViAprkm6E0KQWrw3pg"/>
</a>
<div class="lzl_cnt" data-field='{&quot;iconArr&quot;:[],&quot;free_flag&quot;:null}'>
<a rel="noopener" class="at j_user_card " data-field='{&quot;un&quot;:&quot;\u4e0a\u4e0b\u73ed\u8981\u6ce8\u610f&quot;,&quot;id&quot;:&quot;tb.1.bcab9641.aHxSViAprkm6E0KQWrw3pg&quot;}' href="/home/main?id=tb.1.bcab9641.aHxSViAprkm6E0KQWrw3pg&ie=utf-8&fr=pb" target="_blank" username="上下班要注意">上下班要注意</a>
:<span class="lzl_content_main" data-username="">分数正常吧 </span>
<div class="lzl_content_reply">
<span class="lzl_jb" style="display:none;"></span>
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
<span class="lzl_time">2024-8-6 22:13</span>
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
</div>
</div>
</li>
<li class="lzl_single_post j_lzl_s_p " data-field='{&quot;spid&quot;:150726520372,&quot;showname&quot;:&quot;\u9759\u770b\u8682\u8681\u4e0a\u6811&quot;,&quot;user_name&quot;:&quot;\u9759\u770b\u8682\u8681\u4e0a\u6811&quot;,&quot;portrait&quot;:&quot;tb.1.7ea539b2.dHz6uxKdbItmtGkwZeV6oQ&quot;}'>
<a rel="noopener" name="150726520372"></a>
<a rel="noopener" data-field='{&quot;un&quot;:&quot;\u9759\u770b\u8682\u8681\u4e0a\u6811&quot;,&quot;id&quot;:&quot;tb.1.7ea539b2.dHz6uxKdbItmtGkwZeV6oQ&quot;}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.7ea539b2.dHz6uxKdbItmtGkwZeV6oQ&fr=pb" username="静看蚂蚁上树">
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.7ea539b2.dHz6uxKdbItmtGkwZeV6oQ"/>
</a>
<div class="lzl_cnt" data-field='{&quot;iconArr&quot;:[],&quot;free_flag&quot;:null}'>
<a rel="noopener" class="at j_user_card " data-field='{&quot;un&quot;:&quot;\u9759\u770b\u8682\u8681\u4e0a\u6811&quot;,&quot;id&quot;:&quot;tb.1.7ea539b2.dHz6uxKdbItmtGkwZeV6oQ&quot;}' href="/home/main?id=tb.1.7ea539b2.dHz6uxKdbItmtGkwZeV6oQ&ie=utf-8&fr=pb" target="_blank" username="静看蚂蚁上树">静看蚂蚁上树</a>
:
<span class="lzl_content_main" data-username="">
回复 <a href="http://tieba.baidu.com/i/sys/jump?un= " onclick="Stats.sendRequest('fr=tb0_forum&st_mod=pb&st_value=atlink');" onmouseover="showattip(this)" onmouseout="hideattip(this)" username=" " portrait="tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg" target="_blank" class="at">国际体坛巨星青椒肉丝</a>
:吃酸黄瓜吃多了<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon22.png">
<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon22.png">
<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon22.png">
</span>
<div class="lzl_content_reply">
<span class="lzl_jb" style="display:none;"></span>
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
<span class="lzl_time">2024-8-6 22:14</span>
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
</div>
</div>
</li>
<li class="lzl_single_post j_lzl_s_p " data-field='{&quot;spid&quot;:150726524963,&quot;showname&quot;:&quot;\u4e0d\u61c2\u53d6\u5565\u540d\u5b57\ud83d\ude1c&quot;,&quot;user_name&quot;:&quot;\u9ec4\u5c0f\u6e2forz&quot;,&quot;portrait&quot;:&quot;tb.1.e74fa44d.lLp46IIhj8NhhHk12z_qRA&quot;}'>
<a rel="noopener" name="150726524963"></a>
<a rel="noopener" data-field='{&quot;un&quot;:&quot;\u9ec4\u5c0f\u6e2forz&quot;,&quot;id&quot;:&quot;tb.1.e74fa44d.lLp46IIhj8NhhHk12z_qRA&quot;}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.e74fa44d.lLp46IIhj8NhhHk12z_qRA&fr=pb" username="黄小港orz">
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.e74fa44d.lLp46IIhj8NhhHk12z_qRA"/>
</a>
<div class="lzl_cnt" data-field='{&quot;iconArr&quot;:[],&quot;free_flag&quot;:null}'>
<a rel="noopener" class="at j_user_card " data-field='{&quot;un&quot;:&quot;\u9ec4\u5c0f\u6e2forz&quot;,&quot;id&quot;:&quot;tb.1.e74fa44d.lLp46IIhj8NhhHk12z_qRA&quot;}' href="/home/main?id=tb.1.e74fa44d.lLp46IIhj8NhhHk12z_qRA&ie=utf-8&fr=pb" target="_blank" username="黄小港orz">不懂取啥名字😜</a>
:
<span class="lzl_content_main" data-username="">
请你去跟国际泳联投诉<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon22.png">
</span>
<div class="lzl_content_reply">
<span class="lzl_jb" style="display:none;"></span>
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
<span class="lzl_time">2024-8-6 22:15</span>
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
</div>
</div>
</li>
<li class="lzl_single_post j_lzl_s_p " data-field='{&quot;spid&quot;:150726535666,&quot;showname&quot;:&quot;\ud83d\udcab\u6cfd\u8d6b\u62c9\ud83d\udcaf&quot;,&quot;user_name&quot;:&quot;\u5feb\u770b\u5361\u5361\u5361\u5361&quot;,&quot;portrait&quot;:&quot;tb.1.5f510507.B4GLS91flqmWc5QXoaRCoQ&quot;}'>
<a rel="noopener" name="150726535666"></a>
<a rel="noopener" data-field='{&quot;un&quot;:&quot;\u5feb\u770b\u5361\u5361\u5361\u5361&quot;,&quot;id&quot;:&quot;tb.1.5f510507.B4GLS91flqmWc5QXoaRCoQ&quot;}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.5f510507.B4GLS91flqmWc5QXoaRCoQ&fr=pb" username="快看卡卡卡卡">
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.5f510507.B4GLS91flqmWc5QXoaRCoQ"/>
</a>
<div class="lzl_cnt" data-field='{&quot;iconArr&quot;:{&quot;all_level&quot;:{&quot;2&quot;:{&quot;end_time&quot;:&quot;1539783937&quot;,&quot;level&quot;:2,&quot;pic_url&quot;:&quot;http:\/\/imgsrc.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg&quot;,&quot;score_limit&quot;:8000}},&quot;level&quot;:{&quot;end_time&quot;:&quot;1539783937&quot;,&quot;pic_url&quot;:&quot;http:\/\/imgsrc.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg&quot;,&quot;props_id&quot;:2}},&quot;free_flag&quot;:null}'>
<a rel="noopener" class="at j_user_card " data-field='{&quot;un&quot;:&quot;\u5feb\u770b\u5361\u5361\u5361\u5361&quot;,&quot;id&quot;:&quot;tb.1.5f510507.B4GLS91flqmWc5QXoaRCoQ&quot;}' href="/home/main?id=tb.1.5f510507.B4GLS91flqmWc5QXoaRCoQ&ie=utf-8&fr=pb" target="_blank" username="快看卡卡卡卡">💫泽赫拉💯</a>
:<span class="lzl_content_main" data-username="">第五跳陈空中分腿了空中姿态明显全红婵更好 </span>
<div class="lzl_content_reply">
<span class="lzl_jb" style="display:none;"></span>
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
<span class="lzl_time">2024-8-6 22:17</span>
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
</div>
</div>
</li>
<li class="lzl_single_post j_lzl_s_p " data-field='{&quot;spid&quot;:150726536076,&quot;showname&quot;:&quot;\u55ef\u55ef\u54e6\u54e6\u554a\u554a\ud83d\udc36&quot;,&quot;user_name&quot;:&quot;\u55ef\u55ef\u54e6\u54e6\u554a\u554a\u54fc&quot;,&quot;portrait&quot;:&quot;tb.1.ba071e03._M1o8S5FX4p57pZBJa91CQ&quot;}'>
<a rel="noopener" name="150726536076"></a>
<a rel="noopener" data-field='{&quot;un&quot;:&quot;\u55ef\u55ef\u54e6\u54e6\u554a\u554a\u54fc&quot;,&quot;id&quot;:&quot;tb.1.ba071e03._M1o8S5FX4p57pZBJa91CQ&quot;}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.ba071e03._M1o8S5FX4p57pZBJa91CQ&fr=pb" username="嗯嗯哦哦啊啊哼">
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.ba071e03._M1o8S5FX4p57pZBJa91CQ"/>
</a>
<div class="lzl_cnt" data-field='{&quot;iconArr&quot;:null,&quot;free_flag&quot;:null}'>
<a rel="noopener" class="at j_user_card " data-field='{&quot;un&quot;:&quot;\u55ef\u55ef\u54e6\u54e6\u554a\u554a\u54fc&quot;,&quot;id&quot;:&quot;tb.1.ba071e03._M1o8S5FX4p57pZBJa91CQ&quot;}' href="/home/main?id=tb.1.ba071e03._M1o8S5FX4p57pZBJa91CQ&ie=utf-8&fr=pb" target="_blank" username="嗯嗯哦哦啊啊哼">嗯嗯哦哦啊啊🐶</a>
:
<span class="lzl_content_main" data-username="">
回复 <a href="http://tieba.baidu.com/i/sys/jump?un= " onclick="Stats.sendRequest('fr=tb0_forum&st_mod=pb&st_value=atlink');" onmouseover="showattip(this)" onmouseout="hideattip(this)" username=" " portrait="tb.1.84497425.b5GLK5lGm90mTB2BhjrgpA" target="_blank" class="at">美味蟹黄堡💞</a>
:你不会看起跳高度和空中姿态
</span>
<div class="lzl_content_reply">
<span class="lzl_jb" style="display:none;"></span>
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
<span class="lzl_time">2024-8-6 22:17</span>
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
</div>
</div>
</li>
<li class="lzl_li_pager j_lzl_l_p lzl_li_pager_s" data-field='{&quot;total_num&quot;:16,&quot;total_page&quot;:2}'>
<a rel="noopener" class="j_lzl_p btn-sub btn-small pull-right" href="##">
<i class="icon-reply"></i>
我也说一句
</a>
<p class="j_pager l_pager pager_theme_2">
<span class="tP">1</span>
<a href="#2">2</a>
<a href="#2">下一页</a>
<a href="#2">尾页</a>
</p>
</li>

View File

@ -0,0 +1,96 @@
<div class="s_post_list">
<div class="s_post"><span class="p_title"><a data-tid="9117888152" data-fid="26976424" class="bluelink"
href="/p/9117888152?pid=150718967291&amp;cid=0#150718967291"
target="_blank">武汉交互空间科技富士康10亿加码中国大陆印度为何逐渐失宠</a></span>
<div class="p_content">
全球知名的电子制造服务巨头富士康的母公司鸿海精密工业股份有限公司正式对外发布了一则重大投资公告富士康将在郑州投资建设新事业总部大楼承载新事业总部功能这一战略举措不仅彰显了富士康对中国市场持续深化的承诺与信心也预示着该集团业务版图的新一轮扩张与升级
项目一期选址位于郑东新区建筑面积约700公亩总投资约10亿元人民币主要建设总部管理中心研发中心和工程中心战略产业发展中心战略产业金融平台
</div>
贴吧<a data-fid="26976424" class="p_forum" href="/f?kw=%CE%E4%BA%BA%BD%BB%BB%A5%BF%D5%BC%E4"
target="_blank"><font class="p_violet">武汉交互空间</font></a>作者<a
href="/home/main?un=VR%D0%E9%C4%E2%B4%EF%C8%CB" target="_blank"><font class="p_violet">VR虚拟达人</font></a>
<font class="p_green p_date">2024-08-05 16:45</font></div>
<div class="s_post"><span class="p_title"><a data-tid="9114743782" data-fid="90367" class="bluelink"
href="/p/9114743782?pid=150705176739&amp;cid=0#150705176739"
target="_blank">请各位急用玛尼的小心骗子最多</a></span>
<div class="p_content">
这里面到处是骗子大家小心特别那些叫出村背货的基本是卖园区天下没有那么好的事就是有这好事我们在边境上的人比你们最清楚轮不到你们边境上比你们胆子大的人大把你一不熟悉小路为什么叫你带货东南亚带货的集结地一般在南宁防城港昆明西双版纳临沧然后师机接了走小路出去南宁防城港坐船出去好多都是二十几手的中介之前卖园区一个三十万现在不知道行情但好多园区不收
</div>
贴吧<a data-fid="90367" class="p_forum" href="/f?kw=%B1%B3%B0%FC%BF%CD" target="_blank"><font class="p_violet">背包客</font></a>作者<a
href="/home/main?un=%CC%F9%B0%C9%D3%C3%BB%A7_GC64AUS" target="_blank"><font class="p_violet">贴吧用户_GC64AUS</font></a>
<font class="p_green p_date">2024-08-03 07:35</font></div>
<div class="s_post"><span class="p_title"><a data-tid="9095684158" data-fid="1388265" class="bluelink"
href="/p/9095684158?pid=150616716870&amp;cid=0#150616716870"
target="_blank">*2025泰国冷链制冷运输展*东南亚外贸出口</a></span>
<div class="p_content">**2025泰国曼谷国际冷库空调制冷仓储暨冷链运输展 *2025泰国冷链制冷运输展*东南亚外贸出口-观展游览考察
展出时间2025-7具体时间待定 展出地点泰国曼谷会展中心 展会周期一年一届 组展单位北京励航国际商务会展有限公司
人员跟团观展补贴为您节省成本寻找适合您的市场
本公司为您提供观展考察机会让您在大型展会上获得世界同行**科技的资料同时感受异域文化气息展会现场走展考察当地游览当地相关市
</div>
贴吧<a data-fid="1388265" class="p_forum" href="/f?kw=%B9%FA%BC%CA%D5%B9%BB%E1" target="_blank"><font
class="p_violet">国际展会</font></a>作者<a href="/home/main?un=zhaot_188" target="_blank"><font
class="p_violet">zhaot_188</font></a> <font class="p_green p_date">2024-07-19 15:44</font></div>
<div class="s_post"><span class="p_title"><a data-tid="9093564752" data-fid="27984246" class="bluelink"
href="/p/9093564752?pid=150606964195&amp;cid=0#150606964195"
target="_blank">京湘楼创始人肖鑫创立于北京植根长沙百年美食传承</a></span>
<div class="p_content">来源标题京湘楼创始人肖鑫创立于北京植根长沙百年美食传承 京湘楼KING HERO品牌创始人肖鑫
京湘楼KING
HERO集酱板鸭肥肠鸭头鸭脖鸭肠小龙虾牛蛙捆鸡鸡爪鱼嘴巴鱼尾鱿鱼牛肉猪头肉等特色食品卤制加工包装与生产经营2022年3月在北京朝阳区双井开设了第一家京湘楼·鲜卤集市卤味熟食快餐店2023年5月在湖南省长沙市开福区注册成立了长沙京湘楼品牌管理有限公司京湘楼作为品
</div>
贴吧<a data-fid="27984246" class="p_forum" href="/f?kw=%BE%A9%CF%E6%C2%A5" target="_blank"><font
class="p_violet">京湘楼</font></a>作者<a href="/home/main?un=%CC%EC%C9%F1%B6%C9%B3%BE" target="_blank"><font
class="p_violet">天神渡尘</font></a> <font class="p_green p_date">2024-07-17 23:43</font></div>
<div class="s_post"><span class="p_title"><a data-tid="9088419293" data-fid="310" class="bluelink"
href="/p/9088419293?pid=150582471307&amp;cid=0#150582471307"
target="_blank">广州能争取到迪士尼与环球落户吗</a></span>
<div class="p_content">
不是二选一而是全都要上一组数据上海迪士尼2016年开业就接待游客超过1.2亿人次香港迪士尼2023全年游客人数才640万人次约等于无这么低的入园人次已经引来迪士尼方面的不悦
美国有两个迪士尼说实话迪士尼的门票并不高普通人都去的起中国完全有能力建两到三个迪士尼欧洲只有第一个迪士尼因为它的人口只有中国的一半假设中国人一年吃一包盐一年就是14包那么欧洲就是七亿包盐盐再便宜欧洲人也不可能一人吃
</div>
贴吧<a data-fid="310" class="p_forum" href="/f?kw=%B5%D8%C0%ED" target="_blank"><font
class="p_violet">地理</font></a>作者<a href="/home/main?un=SeaRoutes" target="_blank"><font
class="p_violet">SeaRoutes</font></a> <font class="p_green p_date">2024-07-13 20:17</font></div>
<div class="s_post"><span class="p_title"><a data-tid="9088416365" data-fid="7561034" class="bluelink"
href="/p/9088416365?pid=150582456551&amp;cid=0#150582456551"
target="_blank">#城市GDP#广州应该全力去争取迪士尼和环球影城</a></span>
<div class="p_content">
不是二选一而是全都要上一组数据上海迪士尼2016年开业就接待游客超过1.2亿人次香港迪士尼2023全年游客人数才640万人次约等于无这么低的入园人次已经引来迪士尼方面的不悦
美国有两个迪士尼说实话迪士尼的门票并不高普通人都去的起中国完全有能力建两到三个迪士尼欧洲只有第一个迪士尼因为它的人口只有中国的一半假设中国人一年吃一包盐一年就是14包那么欧洲就是七亿包盐盐再便宜欧洲人也不可能一人吃
</div>
贴吧<a data-fid="7561034" class="p_forum" href="/f?kw=%B3%C7%CA%D0gdp" target="_blank"><font class="p_violet">城市gdp</font></a>作者<a
href="/home/main?un=SeaRoutes" target="_blank"><font class="p_violet">SeaRoutes</font></a> <font
class="p_green p_date">2024-07-13 20:14</font></div>
<div class="s_post"><span class="p_title"><a data-tid="9087419039" data-fid="46374" class="bluelink"
href="/p/9087419039?pid=150577861626&amp;cid=0#150577861626"
target="_blank">云南省首批云南日报昆明新闻头条聚焦阳宗海省级物流枢纽建设</a></span>
<div class="p_content">
7月11日云南日报昆明新闻头条刊发文章阳宗海风景名胜区立足衔接西部陆海新通道与中老铁路优势加速28个物流枢纽设施建设聚焦昆明阳宗海风景名胜区系统推进省级物流枢纽建设和功能提升深挖比较优势壮大物流产业据云南省发展和改革委员会在昆明召开的新闻发布会上公布今年全省共有5地纳入云南省第一批省级物流枢纽和省级骨干冷链物流基地建设名单其中昆明市有两家获批阳宗海物流枢纽上榜一起来看近日云南省
</div>
贴吧<a data-fid="46374" class="p_forum" href="/f?kw=%C0%A5%C3%F7" target="_blank"><font
class="p_violet">昆明</font></a>作者<a href="/home/main?un=%8F%EC" target="_blank"><font
class="p_violet"></font></a> <font class="p_green p_date">2024-07-12 23:04</font></div>
<div class="s_post"><span class="p_title"><a data-tid="9085102046" data-fid="348713" class="bluelink"
href="/p/9085102046?pid=150567555367&amp;cid=0#150567555367"
target="_blank">寻找弟弟很久没跟家里联系</a></span>
<div class="p_content">Kk四期世纪园区寻找弟弟外号大佐F3 2公司cj集团</div>
贴吧<a data-fid="348713" class="p_forum" href="/f?kw=%B6%AB%C4%CF%D1%C7" target="_blank"><font
class="p_violet">东南亚</font></a>作者<a href="/home/main?un=%CC%F9%B0%C9%D3%C3%BB%A7_GC2CtRa"
target="_blank"><font class="p_violet">贴吧用户_GC2CtRa</font></a>
<font class="p_green p_date">2024-07-11 07:53</font></div>
<div class="s_post"><span class="p_title"><a data-tid="9083888071" data-fid="30" class="bluelink"
href="/p/9083888071?pid=150562129935&amp;cid=0#150562129935"
target="_blank">拉美 非洲 东南亚 南亚等发展中国家不太可能普及八小时双休吧</a></span>
<div class="p_content">拉美 东南亚的泰国 之类的连毒枭和黑色产业都管不好感觉普及八小时双休不太可能 缅甸和非洲军阀林立
跟军阀谈八小时双休那么不开玩笑缅北诈骗园区就能看出来
</div>
贴吧<a data-fid="30" class="p_forum" href="/f?kw=%C0%FA%CA%B7" target="_blank"><font
class="p_violet">历史</font></a>作者<a href="/home/main?un=yoursagain" target="_blank"><font
class="p_violet">yoursagain</font></a> <font class="p_green p_date">2024-07-10 09:00</font></div>
<div class="s_post"><span class="p_title"><a data-tid="9071937582" data-fid="8103241" class="bluelink"
href="/p/9071937582?pid=150510120873&amp;cid=0#150510120873"
target="_blank">东南亚园区 </a></span>
<div class="p_content"></div>
贴吧<a data-fid="8103241" class="p_forum" href="/f?kw=%D4%B0%C7%F8%D5%D0%C9%CC" target="_blank"><font
class="p_violet">园区招商</font></a>作者<a href="/home/main?un=QQ59052966" target="_blank"><font
class="p_violet">QQ59052966</font></a> <font class="p_green p_date">2024-06-30 12:09</font></div>
</div>

File diff suppressed because one or more lines are too long

1
model/__init__.py Normal file
View File

@ -0,0 +1 @@
# -*- coding: utf-8 -*-

45
model/m_baidu_tieba.py Normal file
View File

@ -0,0 +1,45 @@
# -*- coding: utf-8 -*-
from typing import Optional
from pydantic import BaseModel, Field
class TiebaNote(BaseModel):
    """
    A Baidu Tieba post (note).

    Fields declared with ``Field(...)`` are required; all others fall back to
    the default given in their ``Field`` call.
    """
    # Identity and content of the post.
    note_id: str = Field(..., description="帖子ID")
    title: str = Field(..., description="帖子标题")
    desc: str = Field(default="", description="帖子描述")
    note_url: str = Field(..., description="帖子链接")
    publish_time: str = Field(default="", description="发布时间")
    # Author information.
    user_link: str = Field(default="", description="用户主页链接")
    user_nickname: str = Field(default="", description="用户昵称")
    user_avatar: str = Field(default="", description="用户头像地址")
    # Forum (tieba) the post belongs to.
    tieba_name: str = Field(..., description="贴吧名称")
    tieba_link: str = Field(..., description="贴吧链接")
    # NOTE(review): "replay" in these names appears to mean "reply" (the
    # descriptions read "total replies" / "total reply pages").
    total_replay_num: int = Field(default=0, description="回复总数")
    total_replay_page: int = Field(default=0, description="回复总页数")
    ip_location: Optional[str] = Field(default="", description="IP地理位置")
class TiebaComment(BaseModel):
    """
    A comment on a Baidu Tieba post.

    Fields declared with ``Field(...)`` are required; all others fall back to
    the default given in their ``Field`` call.
    """
    # Identity and content of the comment.
    comment_id: str = Field(..., description="评论ID")
    parent_comment_id: str = Field(default="", description="父评论ID")
    content: str = Field(..., description="评论内容")
    # Author information.
    user_link: str = Field(default="", description="用户主页链接")
    user_nickname: str = Field(default="", description="用户昵称")
    user_avatar: str = Field(default="", description="用户头像地址")
    publish_time: str = Field(default="", description="发布时间")
    ip_location: Optional[str] = Field(default="", description="IP地理位置")
    sub_comment_count: int = Field(default=0, description="子评论数")
    # Post the comment belongs to.
    note_id: str = Field(..., description="帖子ID")
    note_url: str = Field(..., description="帖子链接")
    # Forum (tieba) the comment belongs to; all three are required.
    tieba_id: str = Field(..., description="所属的贴吧ID")
    tieba_name: str = Field(..., description="所属的贴吧名称")
    tieba_link: str = Field(..., description="贴吧链接")

1
model/m_douyin.py Normal file
View File

@ -0,0 +1 @@
# -*- coding: utf-8 -*-

1
model/m_kuaishou.py Normal file
View File

@ -0,0 +1 @@
# -*- coding: utf-8 -*-

1
model/m_weibo.py Normal file
View File

@ -0,0 +1 @@
# -*- coding: utf-8 -*-

1
model/m_xiaohongshu.py Normal file
View File

@ -0,0 +1 @@
# -*- coding: utf-8 -*-

View File

@ -13,4 +13,4 @@ python-dotenv==1.0.1
jieba==0.42.1
wordcloud==1.9.3
matplotlib==3.9.0
requests==2.32.3
requests==2.32.3

View File

@ -2,192 +2,200 @@
-- Table structure for bilibili_video
-- ----------------------------
DROP TABLE IF EXISTS `bilibili_video`;
CREATE TABLE `bilibili_video` (
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
`user_id` varchar(64) DEFAULT NULL COMMENT '用户ID',
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
`avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址',
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
`video_id` varchar(64) NOT NULL COMMENT '视频ID',
`video_type` varchar(16) NOT NULL COMMENT '视频类型',
`title` varchar(500) DEFAULT NULL COMMENT '视频标题',
`desc` longtext COMMENT '视频描述',
`create_time` bigint NOT NULL COMMENT '视频发布时间戳',
`liked_count` varchar(16) DEFAULT NULL COMMENT '视频点赞数',
`video_play_count` varchar(16) DEFAULT NULL COMMENT '视频播放数量',
`video_danmaku` varchar(16) DEFAULT NULL COMMENT '视频弹幕数量',
`video_comment` varchar(16) DEFAULT NULL COMMENT '视频评论数量',
`video_url` varchar(512) DEFAULT NULL COMMENT '视频详情URL',
`video_cover_url` varchar(512) DEFAULT NULL COMMENT '视频封面图 URL',
PRIMARY KEY (`id`),
KEY `idx_bilibili_vi_video_i_31c36e` (`video_id`),
KEY `idx_bilibili_vi_create__73e0ec` (`create_time`)
CREATE TABLE `bilibili_video`
(
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
`user_id` varchar(64) DEFAULT NULL COMMENT '用户ID',
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
`avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址',
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
`video_id` varchar(64) NOT NULL COMMENT '视频ID',
`video_type` varchar(16) NOT NULL COMMENT '视频类型',
`title` varchar(500) DEFAULT NULL COMMENT '视频标题',
`desc` longtext COMMENT '视频描述',
`create_time` bigint NOT NULL COMMENT '视频发布时间戳',
`liked_count` varchar(16) DEFAULT NULL COMMENT '视频点赞数',
`video_play_count` varchar(16) DEFAULT NULL COMMENT '视频播放数量',
`video_danmaku` varchar(16) DEFAULT NULL COMMENT '视频弹幕数量',
`video_comment` varchar(16) DEFAULT NULL COMMENT '视频评论数量',
`video_url` varchar(512) DEFAULT NULL COMMENT '视频详情URL',
`video_cover_url` varchar(512) DEFAULT NULL COMMENT '视频封面图 URL',
PRIMARY KEY (`id`),
KEY `idx_bilibili_vi_video_i_31c36e` (`video_id`),
KEY `idx_bilibili_vi_create__73e0ec` (`create_time`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='B站视频';
-- ----------------------------
-- Table structure for bilibili_video_comment
-- ----------------------------
DROP TABLE IF EXISTS `bilibili_video_comment`;
CREATE TABLE `bilibili_video_comment` (
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
`user_id` varchar(64) DEFAULT NULL COMMENT '用户ID',
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
`avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址',
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
`comment_id` varchar(64) NOT NULL COMMENT '评论ID',
`video_id` varchar(64) NOT NULL COMMENT '视频ID',
`content` longtext COMMENT '评论内容',
`create_time` bigint NOT NULL COMMENT '评论时间戳',
`sub_comment_count` varchar(16) NOT NULL COMMENT '评论回复数',
PRIMARY KEY (`id`),
KEY `idx_bilibili_vi_comment_41c34e` (`comment_id`),
KEY `idx_bilibili_vi_video_i_f22873` (`video_id`)
CREATE TABLE `bilibili_video_comment`
(
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
`user_id` varchar(64) DEFAULT NULL COMMENT '用户ID',
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
`avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址',
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
`comment_id` varchar(64) NOT NULL COMMENT '评论ID',
`video_id` varchar(64) NOT NULL COMMENT '视频ID',
`content` longtext COMMENT '评论内容',
`create_time` bigint NOT NULL COMMENT '评论时间戳',
`sub_comment_count` varchar(16) NOT NULL COMMENT '评论回复数',
PRIMARY KEY (`id`),
KEY `idx_bilibili_vi_comment_41c34e` (`comment_id`),
KEY `idx_bilibili_vi_video_i_f22873` (`video_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='B 站视频评论';
-- ----------------------------
-- Table structure for bilibili_up_info
-- ----------------------------
DROP TABLE IF EXISTS `bilibili_up_info`;
CREATE TABLE `bilibili_up_info` (
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
`user_id` varchar(64) DEFAULT NULL COMMENT '用户ID',
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
`avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址',
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
`total_fans` bigint DEFAULT NULL COMMENT '粉丝数',
`total_liked` bigint DEFAULT NULL COMMENT '总获赞数',
`user_rank` int DEFAULT NULL COMMENT '用户等级',
`is_official` int DEFAULT NULL COMMENT '是否官号',
PRIMARY KEY (`id`),
KEY `idx_bilibili_vi_user_123456` (`user_id`)
CREATE TABLE `bilibili_up_info`
(
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
`user_id` varchar(64) DEFAULT NULL COMMENT '用户ID',
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
`avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址',
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
`total_fans` bigint DEFAULT NULL COMMENT '粉丝数',
`total_liked` bigint DEFAULT NULL COMMENT '总获赞数',
`user_rank` int DEFAULT NULL COMMENT '用户等级',
`is_official` int DEFAULT NULL COMMENT '是否官号',
PRIMARY KEY (`id`),
KEY `idx_bilibili_vi_user_123456` (`user_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='B 站UP主信息';
-- ----------------------------
-- Table structure for douyin_aweme
-- ----------------------------
DROP TABLE IF EXISTS `douyin_aweme`;
CREATE TABLE `douyin_aweme` (
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
`user_id` varchar(64) DEFAULT NULL COMMENT '用户ID',
`sec_uid` varchar(128) DEFAULT NULL COMMENT '用户sec_uid',
`short_user_id` varchar(64) DEFAULT NULL COMMENT '用户短ID',
`user_unique_id` varchar(64) DEFAULT NULL COMMENT '用户唯一ID',
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
`avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址',
`user_signature` varchar(500) DEFAULT NULL COMMENT '用户签名',
`ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址',
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
`aweme_id` varchar(64) NOT NULL COMMENT '视频ID',
`aweme_type` varchar(16) NOT NULL COMMENT '视频类型',
`title` varchar(500) DEFAULT NULL COMMENT '视频标题',
`desc` longtext COMMENT '视频描述',
`create_time` bigint NOT NULL COMMENT '视频发布时间戳',
`liked_count` varchar(16) DEFAULT NULL COMMENT '视频点赞数',
`comment_count` varchar(16) DEFAULT NULL COMMENT '视频评论数',
`share_count` varchar(16) DEFAULT NULL COMMENT '视频分享数',
`collected_count` varchar(16) DEFAULT NULL COMMENT '视频收藏数',
`aweme_url` varchar(255) DEFAULT NULL COMMENT '视频详情页URL',
PRIMARY KEY (`id`),
KEY `idx_douyin_awem_aweme_i_6f7bc6` (`aweme_id`),
KEY `idx_douyin_awem_create__299dfe` (`create_time`)
CREATE TABLE `douyin_aweme`
(
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
`user_id` varchar(64) DEFAULT NULL COMMENT '用户ID',
`sec_uid` varchar(128) DEFAULT NULL COMMENT '用户sec_uid',
`short_user_id` varchar(64) DEFAULT NULL COMMENT '用户短ID',
`user_unique_id` varchar(64) DEFAULT NULL COMMENT '用户唯一ID',
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
`avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址',
`user_signature` varchar(500) DEFAULT NULL COMMENT '用户签名',
`ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址',
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
`aweme_id` varchar(64) NOT NULL COMMENT '视频ID',
`aweme_type` varchar(16) NOT NULL COMMENT '视频类型',
`title` varchar(500) DEFAULT NULL COMMENT '视频标题',
`desc` longtext COMMENT '视频描述',
`create_time` bigint NOT NULL COMMENT '视频发布时间戳',
`liked_count` varchar(16) DEFAULT NULL COMMENT '视频点赞数',
`comment_count` varchar(16) DEFAULT NULL COMMENT '视频评论数',
`share_count` varchar(16) DEFAULT NULL COMMENT '视频分享数',
`collected_count` varchar(16) DEFAULT NULL COMMENT '视频收藏数',
`aweme_url` varchar(255) DEFAULT NULL COMMENT '视频详情页URL',
PRIMARY KEY (`id`),
KEY `idx_douyin_awem_aweme_i_6f7bc6` (`aweme_id`),
KEY `idx_douyin_awem_create__299dfe` (`create_time`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='抖音视频';
-- ----------------------------
-- Table structure for douyin_aweme_comment
-- ----------------------------
DROP TABLE IF EXISTS `douyin_aweme_comment`;
CREATE TABLE `douyin_aweme_comment` (
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
`user_id` varchar(64) DEFAULT NULL COMMENT '用户ID',
`sec_uid` varchar(128) DEFAULT NULL COMMENT '用户sec_uid',
`short_user_id` varchar(64) DEFAULT NULL COMMENT '用户短ID',
`user_unique_id` varchar(64) DEFAULT NULL COMMENT '用户唯一ID',
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
`avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址',
`user_signature` varchar(500) DEFAULT NULL COMMENT '用户签名',
`ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址',
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
`comment_id` varchar(64) NOT NULL COMMENT '评论ID',
`aweme_id` varchar(64) NOT NULL COMMENT '视频ID',
`content` longtext COMMENT '评论内容',
`create_time` bigint NOT NULL COMMENT '评论时间戳',
`sub_comment_count` varchar(16) NOT NULL COMMENT '评论回复数',
PRIMARY KEY (`id`),
KEY `idx_douyin_awem_comment_fcd7e4` (`comment_id`),
KEY `idx_douyin_awem_aweme_i_c50049` (`aweme_id`)
CREATE TABLE `douyin_aweme_comment`
(
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
`user_id` varchar(64) DEFAULT NULL COMMENT '用户ID',
`sec_uid` varchar(128) DEFAULT NULL COMMENT '用户sec_uid',
`short_user_id` varchar(64) DEFAULT NULL COMMENT '用户短ID',
`user_unique_id` varchar(64) DEFAULT NULL COMMENT '用户唯一ID',
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
`avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址',
`user_signature` varchar(500) DEFAULT NULL COMMENT '用户签名',
`ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址',
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
`comment_id` varchar(64) NOT NULL COMMENT '评论ID',
`aweme_id` varchar(64) NOT NULL COMMENT '视频ID',
`content` longtext COMMENT '评论内容',
`create_time` bigint NOT NULL COMMENT '评论时间戳',
`sub_comment_count` varchar(16) NOT NULL COMMENT '评论回复数',
PRIMARY KEY (`id`),
KEY `idx_douyin_awem_comment_fcd7e4` (`comment_id`),
KEY `idx_douyin_awem_aweme_i_c50049` (`aweme_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='抖音视频评论';
-- ----------------------------
-- Table structure for dy_creator
-- ----------------------------
DROP TABLE IF EXISTS `dy_creator`;
CREATE TABLE `dy_creator` (
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
`user_id` varchar(128) NOT NULL COMMENT '用户ID',
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
`avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址',
`ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址',
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
`desc` longtext COMMENT '用户描述',
`gender` varchar(1) DEFAULT NULL COMMENT '性别',
`follows` varchar(16) DEFAULT NULL COMMENT '关注数',
`fans` varchar(16) DEFAULT NULL COMMENT '粉丝数',
`interaction` varchar(16) DEFAULT NULL COMMENT '获赞数',
`videos_count` varchar(16) DEFAULT NULL COMMENT '作品数',
PRIMARY KEY (`id`)
CREATE TABLE `dy_creator`
(
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
`user_id` varchar(128) NOT NULL COMMENT '用户ID',
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
`avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址',
`ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址',
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
`desc` longtext COMMENT '用户描述',
`gender` varchar(1) DEFAULT NULL COMMENT '性别',
`follows` varchar(16) DEFAULT NULL COMMENT '关注数',
`fans` varchar(16) DEFAULT NULL COMMENT '粉丝数',
`interaction` varchar(16) DEFAULT NULL COMMENT '获赞数',
`videos_count` varchar(16) DEFAULT NULL COMMENT '作品数',
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='抖音博主信息';
-- ----------------------------
-- Table structure for kuaishou_video
-- ----------------------------
DROP TABLE IF EXISTS `kuaishou_video`;
CREATE TABLE `kuaishou_video` (
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
`user_id` varchar(64) DEFAULT NULL COMMENT '用户ID',
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
`avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址',
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
`video_id` varchar(64) NOT NULL COMMENT '视频ID',
`video_type` varchar(16) NOT NULL COMMENT '视频类型',
`title` varchar(500) DEFAULT NULL COMMENT '视频标题',
`desc` longtext COMMENT '视频描述',
`create_time` bigint NOT NULL COMMENT '视频发布时间戳',
`liked_count` varchar(16) DEFAULT NULL COMMENT '视频点赞数',
`viewd_count` varchar(16) DEFAULT NULL COMMENT '视频浏览数量',
`video_url` varchar(512) DEFAULT NULL COMMENT '视频详情URL',
`video_cover_url` varchar(512) DEFAULT NULL COMMENT '视频封面图 URL',
`video_play_url` varchar(512) DEFAULT NULL COMMENT '视频播放 URL',
PRIMARY KEY (`id`),
KEY `idx_kuaishou_vi_video_i_c5c6a6` (`video_id`),
KEY `idx_kuaishou_vi_create__a10dee` (`create_time`)
CREATE TABLE `kuaishou_video`
(
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
`user_id` varchar(64) DEFAULT NULL COMMENT '用户ID',
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
`avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址',
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
`video_id` varchar(64) NOT NULL COMMENT '视频ID',
`video_type` varchar(16) NOT NULL COMMENT '视频类型',
`title` varchar(500) DEFAULT NULL COMMENT '视频标题',
`desc` longtext COMMENT '视频描述',
`create_time` bigint NOT NULL COMMENT '视频发布时间戳',
`liked_count` varchar(16) DEFAULT NULL COMMENT '视频点赞数',
`viewd_count` varchar(16) DEFAULT NULL COMMENT '视频浏览数量',
`video_url` varchar(512) DEFAULT NULL COMMENT '视频详情URL',
`video_cover_url` varchar(512) DEFAULT NULL COMMENT '视频封面图 URL',
`video_play_url` varchar(512) DEFAULT NULL COMMENT '视频播放 URL',
PRIMARY KEY (`id`),
KEY `idx_kuaishou_vi_video_i_c5c6a6` (`video_id`),
KEY `idx_kuaishou_vi_create__a10dee` (`create_time`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='快手视频';
-- ----------------------------
-- Table structure for kuaishou_video_comment
-- ----------------------------
DROP TABLE IF EXISTS `kuaishou_video_comment`;
CREATE TABLE `kuaishou_video_comment` (
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
`user_id` varchar(64) DEFAULT NULL COMMENT '用户ID',
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
`avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址',
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
`comment_id` varchar(64) NOT NULL COMMENT '评论ID',
`video_id` varchar(64) NOT NULL COMMENT '视频ID',
`content` longtext COMMENT '评论内容',
`create_time` bigint NOT NULL COMMENT '评论时间戳',
`sub_comment_count` varchar(16) NOT NULL COMMENT '评论回复数',
PRIMARY KEY (`id`),
KEY `idx_kuaishou_vi_comment_ed48fa` (`comment_id`),
KEY `idx_kuaishou_vi_video_i_e50914` (`video_id`)
CREATE TABLE `kuaishou_video_comment`
(
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
`user_id` varchar(64) DEFAULT NULL COMMENT '用户ID',
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
`avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址',
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
`comment_id` varchar(64) NOT NULL COMMENT '评论ID',
`video_id` varchar(64) NOT NULL COMMENT '视频ID',
`content` longtext COMMENT '评论内容',
`create_time` bigint NOT NULL COMMENT '评论时间戳',
`sub_comment_count` varchar(16) NOT NULL COMMENT '评论回复数',
PRIMARY KEY (`id`),
KEY `idx_kuaishou_vi_comment_ed48fa` (`comment_id`),
KEY `idx_kuaishou_vi_video_i_e50914` (`video_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='快手视频评论';
@ -195,145 +203,198 @@ CREATE TABLE `kuaishou_video_comment` (
-- Table structure for weibo_note
-- ----------------------------
DROP TABLE IF EXISTS `weibo_note`;
CREATE TABLE `weibo_note` (
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
`user_id` varchar(64) DEFAULT NULL COMMENT '用户ID',
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
`avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址',
`gender` varchar(12) DEFAULT NULL COMMENT '用户性别',
`profile_url` varchar(255) DEFAULT NULL COMMENT '用户主页地址',
`ip_location` varchar(32) DEFAULT '发布微博的地理信息',
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
`note_id` varchar(64) NOT NULL COMMENT '帖子ID',
`content` longtext COMMENT '帖子正文内容',
`create_time` bigint NOT NULL COMMENT '帖子发布时间戳',
`create_date_time` varchar(32) NOT NULL COMMENT '帖子发布日期时间',
`liked_count` varchar(16) DEFAULT NULL COMMENT '帖子点赞数',
`comments_count` varchar(16) DEFAULT NULL COMMENT '帖子评论数量',
`shared_count` varchar(16) DEFAULT NULL COMMENT '帖子转发数量',
`note_url` varchar(512) DEFAULT NULL COMMENT '帖子详情URL',
PRIMARY KEY (`id`),
KEY `idx_weibo_note_note_id_f95b1a` (`note_id`),
KEY `idx_weibo_note_create__692709` (`create_time`),
KEY `idx_weibo_note_create__d05ed2` (`create_date_time`)
CREATE TABLE `weibo_note`
(
    `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
    `user_id` varchar(64) DEFAULT NULL COMMENT '用户ID',
    `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
    `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址',
    `gender` varchar(12) DEFAULT NULL COMMENT '用户性别',
    `profile_url` varchar(255) DEFAULT NULL COMMENT '用户主页地址',
    -- The description text belongs in COMMENT; as a DEFAULT it was stored as
    -- data in every row inserted without an explicit ip_location.
    `ip_location` varchar(32) DEFAULT NULL COMMENT '发布微博的地理信息',
    `add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
    `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
    `note_id` varchar(64) NOT NULL COMMENT '帖子ID',
    `content` longtext COMMENT '帖子正文内容',
    `create_time` bigint NOT NULL COMMENT '帖子发布时间戳',
    `create_date_time` varchar(32) NOT NULL COMMENT '帖子发布日期时间',
    `liked_count` varchar(16) DEFAULT NULL COMMENT '帖子点赞数',
    `comments_count` varchar(16) DEFAULT NULL COMMENT '帖子评论数量',
    `shared_count` varchar(16) DEFAULT NULL COMMENT '帖子转发数量',
    `note_url` varchar(512) DEFAULT NULL COMMENT '帖子详情URL',
    PRIMARY KEY (`id`),
    -- Secondary indexes: lookup by post id and by publication time.
    KEY `idx_weibo_note_note_id_f95b1a` (`note_id`),
    KEY `idx_weibo_note_create__692709` (`create_time`),
    KEY `idx_weibo_note_create__d05ed2` (`create_date_time`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='微博帖子';
-- ----------------------------
-- Table structure for weibo_note_comment
-- ----------------------------
DROP TABLE IF EXISTS `weibo_note_comment`;
CREATE TABLE `weibo_note_comment` (
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
`user_id` varchar(64) DEFAULT NULL COMMENT '用户ID',
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
`avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址',
`gender` varchar(12) DEFAULT NULL COMMENT '用户性别',
`profile_url` varchar(255) DEFAULT NULL COMMENT '用户主页地址',
`ip_location` varchar(32) DEFAULT '发布微博的地理信息',
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
`comment_id` varchar(64) NOT NULL COMMENT '评论ID',
`note_id` varchar(64) NOT NULL COMMENT '帖子ID',
`content` longtext COMMENT '评论内容',
`create_time` bigint NOT NULL COMMENT '评论时间戳',
`create_date_time` varchar(32) NOT NULL COMMENT '评论日期时间',
`comment_like_count` varchar(16) NOT NULL COMMENT '评论点赞数量',
`sub_comment_count` varchar(16) NOT NULL COMMENT '评论回复数',
PRIMARY KEY (`id`),
KEY `idx_weibo_note__comment_c7611c` (`comment_id`),
KEY `idx_weibo_note__note_id_24f108` (`note_id`),
KEY `idx_weibo_note__create__667fe3` (`create_date_time`)
CREATE TABLE `weibo_note_comment`
(
    `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
    `user_id` varchar(64) DEFAULT NULL COMMENT '用户ID',
    `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
    `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址',
    `gender` varchar(12) DEFAULT NULL COMMENT '用户性别',
    `profile_url` varchar(255) DEFAULT NULL COMMENT '用户主页地址',
    -- The description text belongs in COMMENT; as a DEFAULT it was stored as
    -- data in every row inserted without an explicit ip_location.
    `ip_location` varchar(32) DEFAULT NULL COMMENT '发布微博的地理信息',
    `add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
    `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
    `comment_id` varchar(64) NOT NULL COMMENT '评论ID',
    `note_id` varchar(64) NOT NULL COMMENT '帖子ID',
    `content` longtext COMMENT '评论内容',
    `create_time` bigint NOT NULL COMMENT '评论时间戳',
    `create_date_time` varchar(32) NOT NULL COMMENT '评论日期时间',
    `comment_like_count` varchar(16) NOT NULL COMMENT '评论点赞数量',
    `sub_comment_count` varchar(16) NOT NULL COMMENT '评论回复数',
    PRIMARY KEY (`id`),
    -- Secondary indexes: lookup by comment id, by owning post and by comment time.
    KEY `idx_weibo_note__comment_c7611c` (`comment_id`),
    KEY `idx_weibo_note__note_id_24f108` (`note_id`),
    KEY `idx_weibo_note__create__667fe3` (`create_date_time`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='微博帖子评论';
-- ----------------------------
-- Table structure for xhs_creator
-- ----------------------------
DROP TABLE IF EXISTS `xhs_creator`;
CREATE TABLE `xhs_creator` (
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
`user_id` varchar(64) NOT NULL COMMENT '用户ID',
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
`avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址',
`ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址',
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
`desc` longtext COMMENT '用户描述',
`gender` varchar(1) DEFAULT NULL COMMENT '性别',
`follows` varchar(16) DEFAULT NULL COMMENT '关注数',
`fans` varchar(16) DEFAULT NULL COMMENT '粉丝数',
`interaction` varchar(16) DEFAULT NULL COMMENT '获赞和收藏数',
`tag_list` longtext COMMENT '标签列表',
PRIMARY KEY (`id`)
CREATE TABLE `xhs_creator`
(
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
`user_id` varchar(64) NOT NULL COMMENT '用户ID',
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
`avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址',
`ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址',
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
`desc` longtext COMMENT '用户描述',
`gender` varchar(1) DEFAULT NULL COMMENT '性别',
`follows` varchar(16) DEFAULT NULL COMMENT '关注数',
`fans` varchar(16) DEFAULT NULL COMMENT '粉丝数',
`interaction` varchar(16) DEFAULT NULL COMMENT '获赞和收藏数',
`tag_list` longtext COMMENT '标签列表',
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='小红书博主';
-- ----------------------------
-- Table structure for xhs_note
-- ----------------------------
DROP TABLE IF EXISTS `xhs_note`;
CREATE TABLE `xhs_note` (
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
`user_id` varchar(64) NOT NULL COMMENT '用户ID',
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
`avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址',
`ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址',
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
`note_id` varchar(64) NOT NULL COMMENT '笔记ID',
`type` varchar(16) DEFAULT NULL COMMENT '笔记类型(normal | video)',
`title` varchar(255) DEFAULT NULL COMMENT '笔记标题',
`desc` longtext COMMENT '笔记描述',
`video_url` longtext COMMENT '视频地址',
`time` bigint NOT NULL COMMENT '笔记发布时间戳',
`last_update_time` bigint NOT NULL COMMENT '笔记最后更新时间戳',
`liked_count` varchar(16) DEFAULT NULL COMMENT '笔记点赞数',
`collected_count` varchar(16) DEFAULT NULL COMMENT '笔记收藏数',
`comment_count` varchar(16) DEFAULT NULL COMMENT '笔记评论数',
`share_count` varchar(16) DEFAULT NULL COMMENT '笔记分享数',
`image_list` longtext COMMENT '笔记封面图片列表',
`tag_list` longtext COMMENT '标签列表',
`note_url` varchar(255) DEFAULT NULL COMMENT '笔记详情页的URL',
PRIMARY KEY (`id`),
KEY `idx_xhs_note_note_id_209457` (`note_id`),
KEY `idx_xhs_note_time_eaa910` (`time`)
CREATE TABLE `xhs_note`
(
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
`user_id` varchar(64) NOT NULL COMMENT '用户ID',
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
`avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址',
`ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址',
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
`note_id` varchar(64) NOT NULL COMMENT '笔记ID',
`type` varchar(16) DEFAULT NULL COMMENT '笔记类型(normal | video)',
`title` varchar(255) DEFAULT NULL COMMENT '笔记标题',
`desc` longtext COMMENT '笔记描述',
`video_url` longtext COMMENT '视频地址',
`time` bigint NOT NULL COMMENT '笔记发布时间戳',
`last_update_time` bigint NOT NULL COMMENT '笔记最后更新时间戳',
`liked_count` varchar(16) DEFAULT NULL COMMENT '笔记点赞数',
`collected_count` varchar(16) DEFAULT NULL COMMENT '笔记收藏数',
`comment_count` varchar(16) DEFAULT NULL COMMENT '笔记评论数',
`share_count` varchar(16) DEFAULT NULL COMMENT '笔记分享数',
`image_list` longtext COMMENT '笔记封面图片列表',
`tag_list` longtext COMMENT '标签列表',
`note_url` varchar(255) DEFAULT NULL COMMENT '笔记详情页的URL',
PRIMARY KEY (`id`),
KEY `idx_xhs_note_note_id_209457` (`note_id`),
KEY `idx_xhs_note_time_eaa910` (`time`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='小红书笔记';
-- ----------------------------
-- Table structure for xhs_note_comment
-- ----------------------------
DROP TABLE IF EXISTS `xhs_note_comment`;
CREATE TABLE `xhs_note_comment` (
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
`user_id` varchar(64) NOT NULL COMMENT '用户ID',
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
`avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址',
`ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址',
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
`comment_id` varchar(64) NOT NULL COMMENT '评论ID',
`create_time` bigint NOT NULL COMMENT '评论时间戳',
`note_id` varchar(64) NOT NULL COMMENT '笔记ID',
`content` longtext NOT NULL COMMENT '评论内容',
`sub_comment_count` int NOT NULL COMMENT '子评论数量',
`pictures` varchar(512) DEFAULT NULL,
PRIMARY KEY (`id`),
KEY `idx_xhs_note_co_comment_8e8349` (`comment_id`),
KEY `idx_xhs_note_co_create__204f8d` (`create_time`)
CREATE TABLE `xhs_note_comment`
(
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
`user_id` varchar(64) NOT NULL COMMENT '用户ID',
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
`avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址',
`ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址',
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
`comment_id` varchar(64) NOT NULL COMMENT '评论ID',
`create_time` bigint NOT NULL COMMENT '评论时间戳',
`note_id` varchar(64) NOT NULL COMMENT '笔记ID',
`content` longtext NOT NULL COMMENT '评论内容',
`sub_comment_count` int NOT NULL COMMENT '子评论数量',
`pictures` varchar(512) DEFAULT NULL,
PRIMARY KEY (`id`),
KEY `idx_xhs_note_co_comment_8e8349` (`comment_id`),
KEY `idx_xhs_note_co_create__204f8d` (`create_time`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='小红书笔记评论';
-- ----------------------------
-- alter table xhs_note_comment to support parent_comment_id
-- ----------------------------
ALTER TABLE `xhs_note_comment`
ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
ALTER TABLE `douyin_aweme_comment`
ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
ALTER TABLE `bilibili_video_comment`
ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
ALTER TABLE `weibo_note_comment`
ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
SET FOREIGN_KEY_CHECKS = 1;
-- ----------------------------
-- Table structure for tieba_note
-- ----------------------------
DROP TABLE IF EXISTS `tieba_note`;
CREATE TABLE tieba_note
(
    id BIGINT AUTO_INCREMENT PRIMARY KEY,
    -- Width aligned with tieba_comment.note_id; the previous VARCHAR(644)
    -- looked like a typo and did not match the column it is joined against.
    note_id VARCHAR(255) NOT NULL COMMENT '帖子ID',
    title VARCHAR(255) NOT NULL COMMENT '帖子标题',
    `desc` TEXT COMMENT '帖子描述',
    note_url VARCHAR(255) NOT NULL COMMENT '帖子链接',
    publish_time VARCHAR(255) NOT NULL COMMENT '发布时间',
    user_link VARCHAR(255) DEFAULT '' COMMENT '用户主页链接',
    user_nickname VARCHAR(255) DEFAULT '' COMMENT '用户昵称',
    user_avatar VARCHAR(255) DEFAULT '' COMMENT '用户头像地址',
    tieba_id VARCHAR(255) DEFAULT '' COMMENT '贴吧ID',
    tieba_name VARCHAR(255) NOT NULL COMMENT '贴吧名称',
    tieba_link VARCHAR(255) NOT NULL COMMENT '贴吧链接',
    total_replay_num INT DEFAULT 0 COMMENT '帖子回复总数',
    total_replay_page INT DEFAULT 0 COMMENT '帖子回复总页数',
    ip_location VARCHAR(255) DEFAULT '' COMMENT 'IP地理位置',
    add_ts BIGINT NOT NULL COMMENT '添加时间戳',
    last_modify_ts BIGINT NOT NULL COMMENT '最后修改时间戳',
    -- Secondary indexes: lookup by post id and by publication time.
    KEY `idx_tieba_note_note_id` (`note_id`),
    KEY `idx_tieba_note_publish_time` (`publish_time`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='贴吧帖子表';
-- ----------------------------
-- Table structure for tieba_comment
-- ----------------------------
DROP TABLE IF EXISTS `tieba_comment`;
CREATE TABLE tieba_comment
(
    id BIGINT AUTO_INCREMENT PRIMARY KEY,
    comment_id VARCHAR(255) NOT NULL COMMENT '评论ID',
    parent_comment_id VARCHAR(255) DEFAULT '' COMMENT '父评论ID',
    content TEXT NOT NULL COMMENT '评论内容',
    user_link VARCHAR(255) DEFAULT '' COMMENT '用户主页链接',
    user_nickname VARCHAR(255) DEFAULT '' COMMENT '用户昵称',
    user_avatar VARCHAR(255) DEFAULT '' COMMENT '用户头像地址',
    tieba_id VARCHAR(255) DEFAULT '' COMMENT '贴吧ID',
    tieba_name VARCHAR(255) NOT NULL COMMENT '贴吧名称',
    tieba_link VARCHAR(255) NOT NULL COMMENT '贴吧链接',
    publish_time VARCHAR(255) DEFAULT '' COMMENT '发布时间',
    ip_location VARCHAR(255) DEFAULT '' COMMENT 'IP地理位置',
    sub_comment_count INT DEFAULT 0 COMMENT '子评论数',
    note_id VARCHAR(255) NOT NULL COMMENT '帖子ID',
    note_url VARCHAR(255) NOT NULL COMMENT '帖子链接',
    add_ts BIGINT NOT NULL COMMENT '添加时间戳',
    last_modify_ts BIGINT NOT NULL COMMENT '最后修改时间戳',
    -- This index previously pointed at note_id, duplicating the index below
    -- and leaving comment_id lookups without an index.
    KEY `idx_tieba_comment_comment_id` (`comment_id`),
    KEY `idx_tieba_comment_note_id` (`note_id`),
    KEY `idx_tieba_comment_publish_time` (`publish_time`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='贴吧评论表';

71
store/tieba/__init__.py Normal file
View File

@ -0,0 +1,71 @@
# -*- coding: utf-8 -*-
from typing import List
from model.m_baidu_tieba import TiebaComment, TiebaNote
from . import tieba_store_impl
from .tieba_store_impl import *
class TieBaStoreFactory:
    """Select the Tieba storage backend named by ``config.SAVE_DATA_OPTION``."""

    # Maps each supported save option to the store class that implements it.
    STORES = {
        "csv": TieBaCsvStoreImplement,
        "db": TieBaDbStoreImplement,
        "json": TieBaJsonStoreImplement,
    }

    @staticmethod
    def create_store() -> AbstractStore:
        """
        Build a new store instance for the configured save option.

        Returns:
            A freshly constructed store of the class registered in ``STORES``.

        Raises:
            ValueError: if the configured option has no entry in ``STORES``.
        """
        backend = TieBaStoreFactory.STORES.get(config.SAVE_DATA_OPTION)
        if backend:
            return backend()
        raise ValueError(
            "[TieBaStoreFactory.create_store] Invalid save option only supported csv or db or json ...")
async def update_tieba_note(note_item: TiebaNote):
    """
    Persist a Tieba post through the configured store.

    The model is dumped to a dict, stamped with the current time under
    ``last_modify_ts``, logged, and handed to the store's ``store_content``.

    Args:
        note_item: the post to save.

    Returns:
        None
    """
    record = note_item.model_dump()
    record["last_modify_ts"] = utils.get_current_timestamp()
    utils.logger.info(f"[store.tieba.update_tieba_note] tieba note: {record}")
    store = TieBaStoreFactory.create_store()
    await store.store_content(record)
async def batch_update_tieba_note_comments(note_id:str, comments: List[TiebaComment]):
    """
    Persist every comment of one post, one after another.

    Args:
        note_id: id of the post the comments belong to.
        comments: comments to save; an empty or falsy value is a no-op.

    Returns:
        None
    """
    # "or ()" gives the same early exit as an explicit emptiness check.
    for comment in comments or ():
        await update_tieba_note_comment(note_id, comment)
async def update_tieba_note_comment(note_id: str, comment_item: TiebaComment):
    """
    Persist a single Tieba comment through the configured store.

    The model is dumped to a dict, stamped with the current time under
    ``last_modify_ts``, logged, and handed to the store's ``store_comment``.

    Args:
        note_id: id of the owning post; only used in the log line.
        comment_item: the comment to save.

    Returns:
        None
    """
    record = comment_item.model_dump()
    record["last_modify_ts"] = utils.get_current_timestamp()
    utils.logger.info(f"[store.tieba.update_tieba_note_comment] tieba note id: {note_id} comment:{record}")
    store = TieBaStoreFactory.create_store()
    await store.store_comment(record)

View File

@ -0,0 +1,244 @@
# -*- coding: utf-8 -*-
import asyncio
import csv
import json
import os
import pathlib
from typing import Dict
import aiofiles
import config
from base.base_crawler import AbstractStore
from tools import utils, words
from var import crawler_type_var
def calculate_number_of_files(file_store_path: str) -> int:
    """
    Compute the numeric prefix to use for the next batch of data files.

    Saved files are named ``<n>_...``; returning the highest existing ``n``
    plus one lets each run write to fresh files instead of appending to the
    files of an earlier run.

    Args:
        file_store_path: directory that holds the saved data files.

    Returns:
        1 when the directory does not exist or contains no numbered file,
        otherwise the largest numeric prefix found plus one.
    """
    if not os.path.exists(file_store_path):
        return 1

    prefixes = []
    for file_name in os.listdir(file_store_path):
        try:
            prefixes.append(int(file_name.split("_")[0]))
        except ValueError:
            # Skip entries without a numeric prefix (e.g. ".DS_Store") one by
            # one, so a single stray file cannot reset the counter to 1.
            continue
    return max(prefixes, default=0) + 1
class TieBaCsvStoreImplement(AbstractStore):
    """Store Tieba posts, comments and creators as rows of CSV files."""

    # Directory that receives the CSV files.
    csv_store_path: str = "data/tieba"
    # Numeric file-name prefix for this run; evaluated once, when the class is defined.
    file_count: int = calculate_number_of_files(csv_store_path)

    def make_save_file_name(self, store_type: str) -> str:
        """
        Build the CSV file path for a store type.

        Args:
            store_type: contents, comments or creator

        Returns:
            A path of the form
            ``data/tieba/<file_count>_<crawler_type>_<store_type>_<date>.csv``,
            where the date comes from ``utils.get_current_date()``.
        """
        return f"{self.csv_store_path}/{self.file_count}_{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.csv"

    async def save_data_to_csv(self, save_item: Dict, store_type: str):
        """
        Append one record to the CSV file of the given store type.

        The target directory is created on demand. A header row built from
        the record's keys is written first when the file is still empty.

        Args:
            save_item: the record to save; keys become the header, values the row.
            store_type: contents, comments or creator

        Returns:
            None
        """
        pathlib.Path(self.csv_store_path).mkdir(parents=True, exist_ok=True)
        save_file_name = self.make_save_file_name(store_type=store_type)
        async with aiofiles.open(save_file_name, mode='a+', encoding="utf-8-sig", newline="") as f:
            writer = csv.writer(f)
            # writerow() returns whatever the file's write() returns, which is
            # awaitable for an aiofiles handle, so the result must be awaited.
            if await f.tell() == 0:
                await writer.writerow(save_item.keys())
            await writer.writerow(save_item.values())

    async def store_content(self, content_item: Dict):
        """
        Tieba post CSV storage implementation.

        Args:
            content_item: post item dict

        Returns:
            None
        """
        await self.save_data_to_csv(save_item=content_item, store_type="contents")

    async def store_comment(self, comment_item: Dict):
        """
        Tieba comment CSV storage implementation.

        Args:
            comment_item: comment item dict

        Returns:
            None
        """
        await self.save_data_to_csv(save_item=comment_item, store_type="comments")

    async def store_creator(self, creator: Dict):
        """
        Tieba creator CSV storage implementation.

        Args:
            creator: creator dict

        Returns:
            None
        """
        await self.save_data_to_csv(save_item=creator, store_type="creator")
class TieBaDbStoreImplement(AbstractStore):
    """Store implementation that inserts or updates Tieba records in the database."""

    async def store_content(self, content_item: Dict):
        """
        Tieba note DB storage: update the row if the note exists, else insert it.

        Args:
            content_item: note dict; its "note_id" entry is the lookup key.
                On insert an "add_ts" entry is added to the dict.

        Returns:
            None
        """
        from .tieba_store_sql import (add_new_content,
                                      query_content_by_content_id,
                                      update_content_by_content_id)

        note_id = content_item.get("note_id")
        existing_note: Dict = await query_content_by_content_id(content_id=note_id)
        if existing_note:
            await update_content_by_content_id(note_id, content_item=content_item)
            return
        content_item["add_ts"] = utils.get_current_timestamp()
        await add_new_content(content_item)

    async def store_comment(self, comment_item: Dict):
        """
        Tieba comment DB storage: update the row if the comment exists, else insert it.

        Args:
            comment_item: comment dict; its "comment_id" entry is the lookup key.
                On insert an "add_ts" entry is added to the dict.

        Returns:
            None
        """
        from .tieba_store_sql import (add_new_comment,
                                      query_comment_by_comment_id,
                                      update_comment_by_comment_id)

        comment_id = comment_item.get("comment_id")
        existing_comment: Dict = await query_comment_by_comment_id(comment_id=comment_id)
        if existing_comment:
            await update_comment_by_comment_id(comment_id, comment_item=comment_item)
            return
        comment_item["add_ts"] = utils.get_current_timestamp()
        await add_new_comment(comment_item)

    async def store_creator(self, creator: Dict):
        """
        Tieba creator DB storage: update the row if the creator exists, else insert it.

        Args:
            creator: creator dict; its "user_id" entry is the lookup key.
                On insert an "add_ts" entry is added to the dict.

        Returns:
            None
        """
        from .tieba_store_sql import (add_new_creator,
                                      query_creator_by_user_id,
                                      update_creator_by_user_id)

        user_id = creator.get("user_id")
        existing_creator: Dict = await query_creator_by_user_id(user_id)
        if existing_creator:
            await update_creator_by_user_id(user_id, creator)
            return
        creator["add_ts"] = utils.get_current_timestamp()
        await add_new_creator(creator)
class TieBaJsonStoreImplement(AbstractStore):
    """Store implementation that keeps Tieba records in JSON array files."""

    # Directory for the JSON data files.
    json_store_path: str = "data/tieba/json"
    # Directory for the word-frequency / word-cloud output.
    words_store_path: str = "data/tieba/words"
    # Shared by all instances; serialises the read-modify-write of a JSON file.
    lock = asyncio.Lock()
    # Evaluated once, when the class body runs.
    file_count: int = calculate_number_of_files(json_store_path)
    WordCloud = words.AsyncWordCloudGenerator()

    def make_save_file_name(self, store_type: str) -> (str, str):
        """
        Build the output paths for a given record type.

        Args:
            store_type: record type label, e.g. contents, comments or creator.

        Returns:
            A 2-tuple: the JSON file path, and the path prefix (no extension)
            handed to the word-cloud generator.
        """
        return (
            f"{self.json_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json",
            f"{self.words_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}"
        )

    async def save_data_to_json(self, save_item: Dict, store_type: str):
        """
        Append one record to the JSON array file for ``store_type``.

        The whole file is read, the record appended, and the file rewritten,
        all while holding the class-level lock. When both
        ``config.ENABLE_GET_COMMENTS`` and ``config.ENABLE_GET_WORDCLOUD`` are
        truthy, the word-cloud generator is then run on the accumulated data.
        That step is best-effort: a failure is logged and does not propagate.

        Args:
            save_item: record to append.
            store_type: record type label used in the file names.

        Returns:
            None
        """
        pathlib.Path(self.json_store_path).mkdir(parents=True, exist_ok=True)
        pathlib.Path(self.words_store_path).mkdir(parents=True, exist_ok=True)
        save_file_name, words_file_name_prefix = self.make_save_file_name(store_type=store_type)
        save_data = []

        async with self.lock:
            if os.path.exists(save_file_name):
                async with aiofiles.open(save_file_name, 'r', encoding='utf-8') as file:
                    save_data = json.loads(await file.read())

            save_data.append(save_item)
            async with aiofiles.open(save_file_name, 'w', encoding='utf-8') as file:
                await file.write(json.dumps(save_data, ensure_ascii=False))

            if config.ENABLE_GET_COMMENTS and config.ENABLE_GET_WORDCLOUD:
                try:
                    await self.WordCloud.generate_word_frequency_and_cloud(save_data, words_file_name_prefix)
                except Exception as exc:
                    # Still best-effort, but no longer silent, and no longer
                    # intercepts task cancellation or KeyboardInterrupt the
                    # way a bare `except:` does.
                    utils.logger.error(
                        f"[TieBaJsonStoreImplement.save_data_to_json] generate word cloud failed: {exc}")

    async def store_content(self, content_item: Dict):
        """
        Tieba note JSON storage.

        Args:
            content_item: note dict to append to the "contents" file.

        Returns:
            None
        """
        await self.save_data_to_json(content_item, "contents")

    async def store_comment(self, comment_item: Dict):
        """
        Tieba comment JSON storage.

        Args:
            comment_item: comment dict to append to the "comments" file.

        Returns:
            None
        """
        await self.save_data_to_json(comment_item, "comments")

    async def store_creator(self, creator: Dict):
        """
        Tieba creator JSON storage.

        Args:
            creator: creator dict to append to the "creator" file.

        Returns:
            None
        """
        await self.save_data_to_json(creator, "creator")

View File

@ -0,0 +1,144 @@
# -*- coding: utf-8 -*-
from typing import Dict, List
from db import AsyncMysqlDB
from var import media_crawler_db_var
async def query_content_by_content_id(content_id: str) -> Dict:
    """
    Look up one row of the tieba_note table by note id.

    Args:
        content_id: value matched against the note_id column.

    Returns:
        The first matching row, or an empty dict when nothing matches.
    """
    db: AsyncMysqlDB = media_crawler_db_var.get()
    # NOTE(review): content_id is interpolated straight into the SQL text.
    # If AsyncMysqlDB.query supports bound parameters, prefer them here.
    matched: List[Dict] = await db.query(
        f"select * from tieba_note where note_id = '{content_id}'"
    )
    return matched[0] if len(matched) > 0 else dict()
async def add_new_content(content_item: Dict) -> int:
    """
    Insert one record into the tieba_note table.

    Args:
        content_item: dict handed to ``item_to_table`` as the row to insert.

    Returns:
        The value returned by ``item_to_table`` (treated by this module as the
        new row's id).
    """
    db: AsyncMysqlDB = media_crawler_db_var.get()
    return await db.item_to_table("tieba_note", content_item)
async def update_content_by_content_id(content_id: str, content_item: Dict) -> int:
    """
    Update the tieba_note record whose note_id equals ``content_id``.

    Args:
        content_id: value matched against the note_id column.
        content_item: dict handed to ``update_table`` as the new field values.

    Returns:
        The value returned by ``update_table`` (treated by this module as the
        affected row count).
    """
    db: AsyncMysqlDB = media_crawler_db_var.get()
    return await db.update_table("tieba_note", content_item, "note_id", content_id)
async def query_comment_by_comment_id(comment_id: str) -> Dict:
    """
    Look up one row of the tieba_comment table by comment id.

    Args:
        comment_id: value matched against the comment_id column.

    Returns:
        The first matching row, or an empty dict when nothing matches.
    """
    db: AsyncMysqlDB = media_crawler_db_var.get()
    # NOTE(review): comment_id is interpolated straight into the SQL text.
    # If AsyncMysqlDB.query supports bound parameters, prefer them here.
    matched: List[Dict] = await db.query(
        f"select * from tieba_comment where comment_id = '{comment_id}'"
    )
    return matched[0] if len(matched) > 0 else dict()
async def add_new_comment(comment_item: Dict) -> int:
    """
    Insert one record into the tieba_comment table.

    Args:
        comment_item: dict handed to ``item_to_table`` as the row to insert.

    Returns:
        The value returned by ``item_to_table`` (treated by this module as the
        new row's id).
    """
    db: AsyncMysqlDB = media_crawler_db_var.get()
    return await db.item_to_table("tieba_comment", comment_item)
async def update_comment_by_comment_id(comment_id: str, comment_item: Dict) -> int:
    """
    Update the tieba_comment record whose comment_id equals ``comment_id``.

    Args:
        comment_id: value matched against the comment_id column.
        comment_item: dict handed to ``update_table`` as the new field values.

    Returns:
        The value returned by ``update_table`` (treated by this module as the
        affected row count).
    """
    db: AsyncMysqlDB = media_crawler_db_var.get()
    return await db.update_table("tieba_comment", comment_item, "comment_id", comment_id)
async def query_creator_by_user_id(user_id: str) -> Dict:
    """
    Look up one row of the tieba_creator table by user id.

    Args:
        user_id: value matched against the user_id column.

    Returns:
        The first matching row, or an empty dict when nothing matches.
    """
    db: AsyncMysqlDB = media_crawler_db_var.get()
    # NOTE(review): user_id is interpolated straight into the SQL text.
    # If AsyncMysqlDB.query supports bound parameters, prefer them here.
    matched: List[Dict] = await db.query(
        f"select * from tieba_creator where user_id = '{user_id}'"
    )
    return matched[0] if len(matched) > 0 else dict()
async def add_new_creator(creator_item: Dict) -> int:
    """
    Insert one record into the tieba_creator table.

    Args:
        creator_item: dict handed to ``item_to_table`` as the row to insert.

    Returns:
        The value returned by ``item_to_table`` (treated by this module as the
        new row's id).
    """
    db: AsyncMysqlDB = media_crawler_db_var.get()
    return await db.item_to_table("tieba_creator", creator_item)
async def update_creator_by_user_id(user_id: str, creator_item: Dict) -> int:
    """
    Update the tieba_creator record whose user_id equals ``user_id``.

    Args:
        user_id: value matched against the user_id column.
        creator_item: dict handed to ``update_table`` as the new field values.

    Returns:
        The value returned by ``update_table`` (treated by this module as the
        affected row count).
    """
    db: AsyncMysqlDB = media_crawler_db_var.get()
    return await db.update_table("tieba_creator", creator_item, "user_id", user_id)

View File

@ -13,6 +13,8 @@ import httpx
from PIL import Image, ImageDraw
from playwright.async_api import Cookie, Page
from proxy import IpInfoModel
from . import utils
@ -133,3 +135,24 @@ def match_interact_info_count(count_str: str) -> int:
return int(number)
else:
return 0
def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]:
    """
    Build the proxy settings for playwright and for httpx from one proxy record.

    Args:
        ip_proxy_info: proxy record; its protocol, ip, port, user and password
            attributes are read.

    Returns:
        A 2-tuple ``(playwright_proxy, httpx_proxy)``:
        the first is a dict with "server", "username" and "password" keys;
        the second is a single-entry dict keyed by the record's protocol whose
        value is an ``http://user:password@ip:port`` URL.
    """
    host_port = f"{ip_proxy_info.ip}:{ip_proxy_info.port}"
    credentials = f"{ip_proxy_info.user}:{ip_proxy_info.password}"

    # NOTE(review): protocol is concatenated directly with the ip, so it
    # presumably already ends with "://" — confirm against IpInfoModel.
    playwright_proxy = dict(
        server=f"{ip_proxy_info.protocol}{host_port}",
        username=ip_proxy_info.user,
        password=ip_proxy_info.password,
    )
    httpx_proxy = {f"{ip_proxy_info.protocol}": f"http://{credentials}@{host_port}"}
    return playwright_proxy, httpx_proxy
def extract_text_from_html(html: str) -> str:
    """
    Extract the visible text from an HTML fragment by stripping its markup.

    ``<script>`` and ``<style>`` elements are removed together with their
    bodies; every other tag is removed but its inner text is kept. Tag names
    are matched case-insensitively, so ``<SCRIPT>`` blocks are dropped as well.
    HTML entities are left as they are.

    Args:
        html: HTML source; an empty (or otherwise falsy) value yields "".

    Returns:
        The remaining text with leading and trailing whitespace stripped.
    """
    if not html:
        return ""
    # Drop script/style elements including their content. DOTALL lets the body
    # span lines; IGNORECASE is needed because HTML tag names are not
    # case-sensitive.
    without_code = re.sub(
        r'<(script|style)[^>]*>.*?</\1>', '', html,
        flags=re.DOTALL | re.IGNORECASE,
    )
    # Remove every remaining tag, keeping the text between tags.
    return re.sub(r'<[^>]+>', '', without_code).strip()

View File

@ -10,7 +10,7 @@ def init_loging_config():
level = logging.INFO
logging.basicConfig(
level=level,
format="%(asctime)s [%(threadName)s] %(name)s %(levelname)s (%(filename)s:%(lineno)d) - %(message)s",
format="%(asctime)s %(name)s %(levelname)s (%(filename)s:%(lineno)d) - %(message)s",
datefmt='%Y-%m-%d %H:%M:%S'
)
_logger = logging.getLogger("MediaCrawler")