Merge pull request #374 from NanmiCoder/feature/baidu_tieba_20240805
feat: MediaCrawler支持百度贴吧
This commit is contained in:
commit
a10cdcf474
44
README.md
44
README.md
|
@ -7,7 +7,7 @@
|
|||
> 点击查看更为详细的免责声明。[点击跳转](#disclaimer)
|
||||
# 仓库描述
|
||||
|
||||
**小红书爬虫**,**抖音爬虫**, **快手爬虫**, **B站爬虫**, **微博爬虫**...。
|
||||
**小红书爬虫**,**抖音爬虫**, **快手爬虫**, **B站爬虫**, **微博爬虫**,**百度贴吧**...。
|
||||
目前能抓取小红书、抖音、快手、B站、微博的视频、图片、评论、点赞、转发等信息。
|
||||
|
||||
原理:利用[playwright](https://playwright.dev/)搭桥,保留登录成功后的上下文浏览器环境,通过执行JS表达式获取一些加密参数
|
||||
|
@ -22,6 +22,7 @@
|
|||
| 快手 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| B 站 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| 微博 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ |
|
||||
| 贴吧 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ |
|
||||
|
||||
|
||||
## 使用方法
|
||||
|
@ -99,14 +100,51 @@
|
|||
- [ MediaCrawler-基于抽象类设计重构项目缓存](https://articles.zsxq.com/id_4ju73oxewt9j.html)
|
||||
- [ 手把手带你撸一个自己的IP代理池](https://articles.zsxq.com/id_38fza371ladm.html)
|
||||
|
||||
|
||||
|
||||
## 感谢下列Sponsors对本仓库赞助
|
||||
- <a href="https://sider.ai/ad-land-redirect?source=github&p1=mi&p2=kk">通过注册安装这款免费的Sider ChatGPT插件帮我获得一定奖励💰,这个插件我用了大半年,作为谷歌上最火的一款插件,体验非常不错。</a>
|
||||
> 安装并注册该浏览器插件之后保留一天即可,我就可以获得3元的推广奖励,谢谢大家,支持我继续开源项目。
|
||||
|
||||
成为赞助者,展示你的产品在这里,联系作者wx:yzglan
|
||||
|
||||
## 打赏
|
||||
|
||||
如果觉得项目不错的话可以打赏哦。您的支持就是我最大的动力!
|
||||
|
||||
打赏时您可以备注名称,我会将您添加至打赏列表中。
|
||||
<p>
|
||||
<img alt="打赏-微信" src="static/images/wechat_pay.jpeg" style="width: 200px;margin-right: 140px;" />
|
||||
<img alt="打赏-支付宝" src="static/images/zfb_pay.png" style="width: 200px" />
|
||||
</p>
|
||||
|
||||
## 捐赠信息
|
||||
|
||||
PS:打赏时请备注捐赠者,如有遗漏请联系我添加(有时候消息多可能会漏掉,十分抱歉)
|
||||
|
||||
| 捐赠者 | 捐赠金额 | 捐赠日期 |
|
||||
|-------------|-------|------------|
|
||||
| *皓 | 50 元 | 2024-03-18 |
|
||||
| *刚 | 50 元 | 2024-03-18 |
|
||||
| *乐 | 20 元 | 2024-03-17 |
|
||||
| *木 | 20 元 | 2024-03-17 |
|
||||
| *诚 | 20 元 | 2024-03-17 |
|
||||
| Strem Gamer | 20 元 | 2024-03-16 |
|
||||
| *鑫 | 20 元 | 2024-03-14 |
|
||||
| Yuzu | 20 元 | 2024-03-07 |
|
||||
| **宁 | 100 元 | 2024-03-03 |
|
||||
| **媛 | 20 元 | 2024-03-03 |
|
||||
| Scarlett | 20 元 | 2024-02-16 |
|
||||
| Asun | 20 元 | 2024-01-30 |
|
||||
| 何* | 100 元 | 2024-01-21 |
|
||||
| allen | 20 元 | 2024-01-10 |
|
||||
| llllll | 20 元 | 2024-01-07 |
|
||||
| 邝*元 | 20 元 | 2023-12-29 |
|
||||
| 50chen | 50 元 | 2023-12-22 |
|
||||
| xiongot | 20 元 | 2023-12-17 |
|
||||
| atom.hu | 20 元 | 2023-12-16 |
|
||||
| 一呆 | 20 元 | 2023-12-01 |
|
||||
| 坠落 | 50 元 | 2023-11-08 |
|
||||
|
||||
|
||||
|
||||
## MediaCrawler爬虫项目交流群:
|
||||
> 扫描下方我的个人微信,备注:github,拉你进MediaCrawler项目交流群(请一定备注:github,会有wx小助手自动拉群)
|
||||
|
|
|
@ -7,8 +7,8 @@ from tools.utils import str2bool
|
|||
async def parse_cmd():
|
||||
# 读取command arg
|
||||
parser = argparse.ArgumentParser(description='Media crawler program.')
|
||||
parser.add_argument('--platform', type=str, help='Media platform select (xhs | dy | ks | bili | wb)',
|
||||
choices=["xhs", "dy", "ks", "bili", "wb"], default=config.PLATFORM)
|
||||
parser.add_argument('--platform', type=str, help='Media platform select (xhs | dy | ks | bili | wb | tieba)',
|
||||
choices=["xhs", "dy", "ks", "bili", "wb", "tieba"], default=config.PLATFORM)
|
||||
parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)',
|
||||
choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE)
|
||||
parser.add_argument('--type', type=str, help='crawler type (search | detail | creator)',
|
||||
|
|
|
@ -28,7 +28,7 @@ HEADLESS = False
|
|||
SAVE_LOGIN_STATE = True
|
||||
|
||||
# 数据保存类型选项配置,支持三种类型:csv、db、json
|
||||
SAVE_DATA_OPTION = "json" # csv or db or json
|
||||
SAVE_DATA_OPTION = "csv" # csv or db or json
|
||||
|
||||
# 用户浏览器缓存的浏览器文件配置
|
||||
USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name
|
||||
|
@ -37,7 +37,7 @@ USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name
|
|||
START_PAGE = 1
|
||||
|
||||
# 爬取视频/帖子的数量控制
|
||||
CRAWLER_MAX_NOTES_COUNT = 20
|
||||
CRAWLER_MAX_NOTES_COUNT = 100
|
||||
|
||||
# 并发爬虫数量控制
|
||||
MAX_CONCURRENCY_NUM = 1
|
||||
|
@ -57,7 +57,7 @@ XHS_SPECIFIED_ID_LIST = [
|
|||
"6422c2750000000027000d88",
|
||||
"64ca1b73000000000b028dd2",
|
||||
"630d5b85000000001203ab41",
|
||||
"668fe13000000000030241fa", # 图文混合
|
||||
"668fe13000000000030241fa", # 图文混合
|
||||
# ........................
|
||||
]
|
||||
|
||||
|
@ -88,6 +88,16 @@ WEIBO_SPECIFIED_ID_LIST = [
|
|||
# ........................
|
||||
]
|
||||
|
||||
# 指定贴吧需要爬取的帖子列表
|
||||
TIEBA_SPECIFIED_ID_LIST = [
|
||||
|
||||
]
|
||||
|
||||
# 指定贴吧名称列表,爬取该贴吧下的帖子
|
||||
TIEBA_NAME_LIST = [
|
||||
# "盗墓笔记"
|
||||
]
|
||||
|
||||
# 指定小红书创作者ID列表
|
||||
XHS_CREATOR_ID_LIST = [
|
||||
"63e36c9a000000002703502b",
|
||||
|
@ -112,19 +122,18 @@ KS_CREATOR_ID_LIST = [
|
|||
# ........................
|
||||
]
|
||||
|
||||
|
||||
#词云相关
|
||||
#是否开启生成评论词云图
|
||||
# 词云相关
|
||||
# 是否开启生成评论词云图
|
||||
ENABLE_GET_WORDCLOUD = False
|
||||
# 自定义词语及其分组
|
||||
#添加规则:xx:yy 其中xx为自定义添加的词组,yy为将xx该词组分到的组名。
|
||||
# 添加规则:xx:yy 其中xx为自定义添加的词组,yy为将xx该词组分到的组名。
|
||||
CUSTOM_WORDS = {
|
||||
'零几': '年份', # 将“零几”识别为一个整体
|
||||
'高频词': '专业术语' # 示例自定义词
|
||||
}
|
||||
|
||||
#停用(禁用)词文件路径
|
||||
# 停用(禁用)词文件路径
|
||||
STOP_WORDS_FILE = "./docs/hit_stopwords.txt"
|
||||
|
||||
#中文字体文件路径
|
||||
FONT_PATH= "./docs/STZHONGS.TTF"
|
||||
# 中文字体文件路径
|
||||
FONT_PATH = "./docs/STZHONGS.TTF"
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
# -*- coding: utf-8 -*-
|
|
@ -0,0 +1,3 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
TIEBA_URL = 'https://tieba.baidu.com'
|
7
main.py
7
main.py
|
@ -8,6 +8,7 @@ from base.base_crawler import AbstractCrawler
|
|||
from media_platform.bilibili import BilibiliCrawler
|
||||
from media_platform.douyin import DouYinCrawler
|
||||
from media_platform.kuaishou import KuaishouCrawler
|
||||
from media_platform.tieba import TieBaCrawler
|
||||
from media_platform.weibo import WeiboCrawler
|
||||
from media_platform.xhs import XiaoHongShuCrawler
|
||||
|
||||
|
@ -18,7 +19,8 @@ class CrawlerFactory:
|
|||
"dy": DouYinCrawler,
|
||||
"ks": KuaishouCrawler,
|
||||
"bili": BilibiliCrawler,
|
||||
"wb": WeiboCrawler
|
||||
"wb": WeiboCrawler,
|
||||
"tieba": TieBaCrawler
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
|
@ -28,6 +30,7 @@ class CrawlerFactory:
|
|||
raise ValueError("Invalid Media Platform Currently only supported xhs or dy or ks or bili ...")
|
||||
return crawler_class()
|
||||
|
||||
|
||||
async def main():
|
||||
# parse cmd
|
||||
await cmd_arg.parse_cmd()
|
||||
|
@ -38,7 +41,7 @@ async def main():
|
|||
|
||||
crawler = CrawlerFactory.create_crawler(platform=config.PLATFORM)
|
||||
await crawler.start()
|
||||
|
||||
|
||||
if config.SAVE_DATA_OPTION == "db":
|
||||
await db.close()
|
||||
|
||||
|
|
|
@ -0,0 +1,2 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
from .core import TieBaCrawler
|
|
@ -0,0 +1,289 @@
|
|||
import asyncio
|
||||
import json
|
||||
from typing import Any, Callable, Dict, List, Optional, Union
|
||||
from urllib.parse import urlencode
|
||||
|
||||
import httpx
|
||||
from playwright.async_api import BrowserContext
|
||||
from tenacity import RetryError, retry, stop_after_attempt, wait_fixed
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractApiClient
|
||||
from model.m_baidu_tieba import TiebaComment, TiebaNote
|
||||
from proxy.proxy_ip_pool import ProxyIpPool
|
||||
from tools import utils
|
||||
|
||||
from .field import SearchNoteType, SearchSortType
|
||||
from .help import TieBaExtractor
|
||||
|
||||
|
||||
class BaiduTieBaClient(AbstractApiClient):
    """Baidu Tieba HTTP client.

    Built on httpx with tenacity retries (3 attempts, 1s apart). When all
    retries fail and an IP pool is configured, a fresh proxy is drawn, the
    request is repeated once more, and that proxy becomes the new default.
    HTML pages are parsed into model objects by TieBaExtractor.
    """

    def __init__(
            self,
            timeout=10,
            ip_pool=None,
            default_ip_proxy=None,
    ):
        self.ip_pool: Optional[ProxyIpPool] = ip_pool
        self.timeout = timeout
        self.headers = {
            "User-Agent": utils.get_user_agent(),
            "Cookies": "",
        }
        self._host = "https://tieba.baidu.com"
        self._page_extractor = TieBaExtractor()
        self.default_ip_proxy = default_ip_proxy

    @retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
    async def request(self, method, url, return_ori_content=False, proxies=None, **kwargs) -> Union[str, Any]:
        """
        Common httpx request wrapper with shared response validation.

        Args:
            method: HTTP method
            url: request URL
            return_ori_content: return the raw body text instead of parsed JSON
            proxies: proxy for this request; falls back to self.default_ip_proxy
            **kwargs: extra request options (headers, body, ...)

        Returns:
            Raw text when return_ori_content is True, otherwise the JSON body.

        Raises:
            Exception: on non-200 status or an empty/"blocked" response body.
        """
        actual_proxies = proxies if proxies else self.default_ip_proxy
        async with httpx.AsyncClient(proxies=actual_proxies) as client:
            response = await client.request(
                method, url, timeout=self.timeout,
                headers=self.headers, **kwargs
            )

        if response.status_code != 200:
            utils.logger.error(f"Request failed, method: {method}, url: {url}, status code: {response.status_code}")
            utils.logger.error(f"Request failed, response: {response.text}")
            raise Exception(f"Request failed, method: {method}, url: {url}, status code: {response.status_code}")

        # An empty body or the literal "blocked" means Tieba refused the request.
        # (fixed log-message typo: "incrr" -> "incorrect")
        if response.text == "" or response.text == "blocked":
            utils.logger.error(f"request params incorrect, response.text: {response.text}")
            raise Exception("account blocked")

        if return_ori_content:
            return response.text

        return response.json()

    async def get(self, uri: str, params=None, return_ori_content=False, **kwargs) -> Any:
        """
        GET request.

        On retry exhaustion (RetryError), draws a new proxy from the IP pool
        (when configured), retries once more, and keeps that proxy as the new
        default for subsequent requests.

        Args:
            uri: request path
            params: query parameters appended to the URI
            return_ori_content: return raw body text instead of JSON

        Returns:
            Response body (text or JSON depending on return_ori_content).
        """
        final_uri = uri
        if isinstance(params, dict):
            final_uri = (f"{uri}?"
                         f"{urlencode(params)}")
        try:
            res = await self.request(method="GET", url=f"{self._host}{final_uri}",
                                     return_ori_content=return_ori_content,
                                     **kwargs)
            return res
        except RetryError as e:
            if self.ip_pool:
                # Rotate to a fresh proxy and give the request one more chance.
                proxy_model = await self.ip_pool.get_proxy()
                _, proxies = utils.format_proxy_info(proxy_model)
                res = await self.request(method="GET", url=f"{self._host}{final_uri}",
                                         return_ori_content=return_ori_content,
                                         proxies=proxies,
                                         **kwargs)
                self.default_ip_proxy = proxies
                return res

            utils.logger.error(f"[BaiduTieBaClient.get] 达到了最大重试次数,IP已经被Block,请尝试更换新的IP代理: {e}")
            raise Exception(f"[BaiduTieBaClient.get] 达到了最大重试次数,IP已经被Block,请尝试更换新的IP代理: {e}")

    async def post(self, uri: str, data: dict, **kwargs) -> Dict:
        """
        POST request with the body serialized to compact JSON.

        Args:
            uri: request path
            data: request body dict

        Returns:
            Parsed JSON response.
        """
        json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False)
        return await self.request(method="POST", url=f"{self._host}{uri}",
                                  data=json_str, **kwargs)

    async def pong(self) -> bool:
        """
        Check whether the stored login state is still valid.

        Returns:
            True when Tieba reports a logged-in session (``no == 0``), else False.
        """
        utils.logger.info("[BaiduTieBaClient.pong] Begin to pong tieba...")
        try:
            uri = "/mo/q/sync"
            res: Dict = await self.get(uri)
            utils.logger.info(f"[BaiduTieBaClient.pong] res: {res}")
            if res and res.get("no") == 0:
                ping_flag = True
            else:
                utils.logger.info(f"[BaiduTieBaClient.pong] user not login, will try to login again...")
                ping_flag = False
        except Exception as e:
            utils.logger.error(f"[BaiduTieBaClient.pong] Ping tieba failed: {e}, and try to login again...")
            ping_flag = False
        return ping_flag

    async def update_cookies(self, browser_context: BrowserContext):
        """
        Refresh client cookies after login. Currently a no-op for Tieba.

        Args:
            browser_context: playwright browser context holding the cookies
        """
        pass

    async def get_notes_by_keyword(
            self, keyword: str,
            page: int = 1,
            page_size: int = 10,
            sort: SearchSortType = SearchSortType.TIME_DESC,
            note_type: SearchNoteType = SearchNoteType.FIXED_THREAD,
    ) -> List[TiebaNote]:
        """
        Search Tieba posts by keyword.

        Args:
            keyword: search keyword
            page: page number (1-based)
            page_size: results per page
            sort: result ordering
            note_type: post type (thread-starters only vs. threads + replies)

        Returns:
            Parsed TiebaNote objects from the search result page.
        """
        uri = "/f/search/res"
        params = {
            "isnew": 1,
            "qw": keyword,
            "rn": page_size,
            "pn": page,
            "sm": sort.value,
            "only_thread": note_type.value
        }
        page_content = await self.get(uri, params=params, return_ori_content=True)
        return self._page_extractor.extract_search_note_list(page_content)

    async def get_note_by_id(self, note_id: str) -> TiebaNote:
        """
        Fetch the detail page of one post.

        Args:
            note_id: Tieba post id

        Returns:
            Parsed TiebaNote detail.
        """
        uri = f"/p/{note_id}"
        page_content = await self.get(uri, return_ori_content=True)
        return self._page_extractor.extract_note_detail(page_content)

    async def get_note_all_comments(self, note_detail: TiebaNote, crawl_interval: float = 1.0,
                                    callback: Optional[Callable] = None) -> List[TiebaComment]:
        """
        Fetch every first-level comment of a post, paging until exhausted.

        Args:
            note_detail: post detail (supplies note_id and total reply pages)
            crawl_interval: delay in seconds between page fetches
            callback: awaited with (note_id, comments) after each page

        Returns:
            All first-level comments collected.
        """
        uri = f"/p/{note_detail.note_id}"
        result: List[TiebaComment] = []
        current_page = 1
        while note_detail.total_replay_page >= current_page:
            params = {
                "pn": current_page
            }
            page_content = await self.get(uri, params=params, return_ori_content=True)
            comments = self._page_extractor.extract_tieba_note_parment_comments(page_content,
                                                                                note_id=note_detail.note_id)
            if not comments:
                break
            if callback:
                await callback(note_detail.note_id, comments)
            result.extend(comments)
            # Also pull every sub (second-level) comment for this page.
            await self.get_comments_all_sub_comments(comments, crawl_interval=crawl_interval, callback=callback)
            await asyncio.sleep(crawl_interval)
            current_page += 1
        return result

    async def get_comments_all_sub_comments(self, comments: List[TiebaComment], crawl_interval: float = 1.0,
                                            callback: Optional[Callable] = None) -> List[TiebaComment]:
        """
        Fetch every sub comment under the given first-level comments.

        Skipped entirely unless config.ENABLE_GET_SUB_COMMENTS is set.

        Args:
            comments: first-level comments to expand
            crawl_interval: delay in seconds between page fetches
            callback: awaited with (note_id, sub_comments) after each page

        Returns:
            All sub comments collected.
        """
        uri = "/p/comment"
        if not config.ENABLE_GET_SUB_COMMENTS:
            return []

        # NOTE(review): upstream kept a disabled login-state check here —
        # fetching all sub comments may require a logged-in cookie; confirm.

        all_sub_comments: List[TiebaComment] = []
        for parent_comment in comments:
            if parent_comment.sub_comment_count == 0:
                continue

            current_page = 1
            max_sub_page_num = parent_comment.sub_comment_count // 10 + 1
            while max_sub_page_num >= current_page:
                params = {
                    "tid": parent_comment.note_id,  # post id
                    "pid": parent_comment.comment_id,  # parent comment id
                    "fid": parent_comment.tieba_id,  # tieba (forum) id
                    "pn": current_page  # page number
                }
                page_content = await self.get(uri, params=params, return_ori_content=True)
                sub_comments = self._page_extractor.extract_tieba_note_sub_comments(page_content,
                                                                                    parent_comment=parent_comment)

                if not sub_comments:
                    break
                if callback:
                    await callback(parent_comment.note_id, sub_comments)
                all_sub_comments.extend(sub_comments)
                await asyncio.sleep(crawl_interval)
                current_page += 1
        return all_sub_comments

    async def get_notes_by_tieba_name(self, tieba_name: str, page_num: int) -> List[TiebaNote]:
        """
        List posts of a tieba (forum) index page.

        Args:
            tieba_name: forum name
            page_num: page offset (``pn`` query parameter)

        Returns:
            Parsed TiebaNote objects from the forum page.
        """
        uri = f"/f?kw={tieba_name}&pn={page_num}"
        page_content = await self.get(uri, return_ori_content=True)
        return self._page_extractor.extract_tieba_note_list(page_content)
|
@ -0,0 +1,265 @@
|
|||
import asyncio
|
||||
import os
|
||||
import random
|
||||
from asyncio import Task
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
from playwright.async_api import (BrowserContext, BrowserType, Page,
|
||||
async_playwright)
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractCrawler
|
||||
from model.m_baidu_tieba import TiebaNote
|
||||
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
|
||||
from store import tieba as tieba_store
|
||||
from tools import utils
|
||||
from tools.crawler_util import format_proxy_info
|
||||
from var import crawler_type_var
|
||||
|
||||
from .client import BaiduTieBaClient
|
||||
from .field import SearchNoteType, SearchSortType
|
||||
from .login import BaiduTieBaLogin
|
||||
|
||||
|
||||
class TieBaCrawler(AbstractCrawler):
    """Crawler for Baidu Tieba: keyword search, forum listing, post detail + comments."""

    context_page: Page
    tieba_client: BaiduTieBaClient
    browser_context: BrowserContext

    def __init__(self) -> None:
        self.index_url = "https://tieba.baidu.com"
        self.user_agent = utils.get_user_agent()

    async def start(self) -> None:
        """
        Entry point: set up the optional proxy pool and the API client,
        then dispatch on config.CRAWLER_TYPE ("search" / "detail").
        """
        ip_proxy_pool, httpx_proxy_format = None, None
        if config.ENABLE_IP_PROXY:
            utils.logger.info("[BaiduTieBaCrawler.start] Begin create ip proxy pool ...")
            ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
            ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
            _, httpx_proxy_format = format_proxy_info(ip_proxy_info)
            utils.logger.info(f"[BaiduTieBaCrawler.start] Init default ip proxy, value: {httpx_proxy_format}")

        # Create a client to interact with the baidutieba website.
        self.tieba_client = BaiduTieBaClient(
            ip_pool=ip_proxy_pool,
            default_ip_proxy=httpx_proxy_format,
        )
        crawler_type_var.set(config.CRAWLER_TYPE)
        if config.CRAWLER_TYPE == "search":
            # Search for notes and retrieve their comment information.
            await self.search()
            await self.get_specified_tieba_notes()
        elif config.CRAWLER_TYPE == "detail":
            # Get the information and comments of the specified post
            await self.get_specified_notes()
        else:
            pass

        utils.logger.info("[BaiduTieBaCrawler.start] Tieba Crawler finished ...")

    async def search(self) -> None:
        """
        Search all configured keywords page by page and crawl each result,
        respecting START_PAGE and CRAWLER_MAX_NOTES_COUNT.
        """
        utils.logger.info("[BaiduTieBaCrawler.search] Begin search baidu tieba keywords")
        tieba_limit_count = 10  # tieba limit page fixed value
        if config.CRAWLER_MAX_NOTES_COUNT < tieba_limit_count:
            config.CRAWLER_MAX_NOTES_COUNT = tieba_limit_count
        start_page = config.START_PAGE
        for keyword in config.KEYWORDS.split(","):
            utils.logger.info(f"[BaiduTieBaCrawler.search] Current search keyword: {keyword}")
            page = 1
            while (page - start_page + 1) * tieba_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
                if page < start_page:
                    utils.logger.info(f"[BaiduTieBaCrawler.search] Skip page {page}")
                    page += 1
                    continue
                try:
                    utils.logger.info(f"[BaiduTieBaCrawler.search] search tieba keyword: {keyword}, page: {page}")
                    notes_list: List[TiebaNote] = await self.tieba_client.get_notes_by_keyword(
                        keyword=keyword,
                        page=page,
                        page_size=tieba_limit_count,
                        sort=SearchSortType.TIME_DESC,
                        note_type=SearchNoteType.FIXED_THREAD
                    )
                    if not notes_list:
                        utils.logger.info(f"[BaiduTieBaCrawler.search] Search note list is empty")
                        break
                    utils.logger.info(f"[BaiduTieBaCrawler.search] Note list len: {len(notes_list)}")
                    await self.get_specified_notes(note_id_list=[note_detail.note_id for note_detail in notes_list])
                    page += 1
                except Exception as ex:
                    utils.logger.error(
                        f"[BaiduTieBaCrawler.search] Search keywords error, current page: {page}, current keyword: {keyword}, err: {ex}")
                    break

    async def get_specified_tieba_notes(self):
        """
        Crawl posts from every forum in config.TIEBA_NAME_LIST, paging by
        the ``pn`` offset until empty or CRAWLER_MAX_NOTES_COUNT is reached.
        """
        tieba_limit_count = 50
        if config.CRAWLER_MAX_NOTES_COUNT < tieba_limit_count:
            config.CRAWLER_MAX_NOTES_COUNT = tieba_limit_count
        for tieba_name in config.TIEBA_NAME_LIST:
            utils.logger.info(
                f"[BaiduTieBaCrawler.get_specified_tieba_notes] Begin get tieba name: {tieba_name}")
            page_number = 0
            while page_number <= config.CRAWLER_MAX_NOTES_COUNT:
                note_list: List[TiebaNote] = await self.tieba_client.get_notes_by_tieba_name(
                    tieba_name=tieba_name,
                    page_num=page_number
                )
                if not note_list:
                    utils.logger.info(
                        f"[BaiduTieBaCrawler.get_specified_tieba_notes] Get note list is empty")
                    break

                utils.logger.info(
                    f"[BaiduTieBaCrawler.get_specified_tieba_notes] tieba name: {tieba_name} note list len: {len(note_list)}")
                await self.get_specified_notes([note.note_id for note in note_list])
                page_number += tieba_limit_count

    async def get_specified_notes(self, note_id_list: Optional[List[str]] = None):
        """
        Fetch detail and comments for the given post ids, bounded by
        MAX_CONCURRENCY_NUM.

        Args:
            note_id_list: post ids to fetch; when None, falls back to
                config.TIEBA_SPECIFIED_ID_LIST. (The old signature bound the
                config list eagerly at import time — a shared mutable default —
                so config changes made by CLI parsing were ignored.)
        """
        if note_id_list is None:
            note_id_list = config.TIEBA_SPECIFIED_ID_LIST
        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
        task_list = [
            self.get_note_detail_async_task(note_id=note_id, semaphore=semaphore) for note_id in note_id_list
        ]
        note_details = await asyncio.gather(*task_list)
        note_details_model: List[TiebaNote] = []
        for note_detail in note_details:
            if note_detail is not None:
                note_details_model.append(note_detail)
                await tieba_store.update_tieba_note(note_detail)
        await self.batch_get_note_comments(note_details_model)

    async def get_note_detail_async_task(self, note_id: str, semaphore: asyncio.Semaphore) -> Optional[TiebaNote]:
        """
        Fetch one post detail under the concurrency semaphore.

        Args:
            note_id: baidu tieba note id
            semaphore: asyncio semaphore limiting concurrency

        Returns:
            The parsed note, or None on any failure.
        """
        async with semaphore:
            try:
                utils.logger.info(f"[BaiduTieBaCrawler.get_note_detail] Begin get note detail, note_id: {note_id}")
                note_detail: TiebaNote = await self.tieba_client.get_note_by_id(note_id)
                if not note_detail:
                    utils.logger.error(
                        f"[BaiduTieBaCrawler.get_note_detail] Get note detail error, note_id: {note_id}")
                    return None
                return note_detail
            # KeyError must be handled before Exception: it is a subclass of
            # Exception, so the original order made this branch unreachable.
            except KeyError as ex:
                utils.logger.error(
                    f"[BaiduTieBaCrawler.get_note_detail] have not fund note detail note_id:{note_id}, err: {ex}")
                return None
            except Exception as ex:
                utils.logger.error(f"[BaiduTieBaCrawler.get_note_detail] Get note detail error: {ex}")
                return None

    async def batch_get_note_comments(self, note_detail_list: List[TiebaNote]):
        """
        Fetch comments for every post in the list, concurrently and bounded
        by MAX_CONCURRENCY_NUM. No-op unless config.ENABLE_GET_COMMENTS.

        Args:
            note_detail_list: posts whose comments should be crawled
        """
        if not config.ENABLE_GET_COMMENTS:
            return

        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
        task_list: List[Task] = []
        for note_detail in note_detail_list:
            task = asyncio.create_task(self.get_comments_async_task(note_detail, semaphore), name=note_detail.note_id)
            task_list.append(task)
        await asyncio.gather(*task_list)

    async def get_comments_async_task(self, note_detail: TiebaNote, semaphore: asyncio.Semaphore):
        """
        Fetch all comments of one post under the concurrency semaphore and
        persist them through the tieba store callback.

        Args:
            note_detail: post to fetch comments for
            semaphore: asyncio semaphore limiting concurrency
        """
        async with semaphore:
            utils.logger.info(f"[BaiduTieBaCrawler.get_comments] Begin get note id comments {note_detail.note_id}")
            await self.tieba_client.get_note_all_comments(
                note_detail=note_detail,
                crawl_interval=random.random(),
                callback=tieba_store.batch_update_tieba_note_comments
            )

    async def launch_browser(
            self,
            chromium: BrowserType,
            playwright_proxy: Optional[Dict],
            user_agent: Optional[str],
            headless: bool = True
    ) -> BrowserContext:
        """
        Launch a browser context; persistent (with saved login state on disk)
        when config.SAVE_LOGIN_STATE is set, ephemeral otherwise.

        Args:
            chromium: playwright browser type
            playwright_proxy: proxy settings for playwright
            user_agent: user agent string
            headless: run without a visible window

        Returns:
            A configured BrowserContext.
        """
        utils.logger.info("[BaiduTieBaCrawler.launch_browser] Begin create browser context ...")
        if config.SAVE_LOGIN_STATE:
            # feat issue #14
            # we will save login state to avoid login every time
            user_data_dir = os.path.join(os.getcwd(), "browser_data",
                                         config.USER_DATA_DIR % config.PLATFORM)  # type: ignore
            browser_context = await chromium.launch_persistent_context(
                user_data_dir=user_data_dir,
                accept_downloads=True,
                headless=headless,
                proxy=playwright_proxy,  # type: ignore
                viewport={"width": 1920, "height": 1080},
                user_agent=user_agent
            )
            return browser_context
        else:
            browser = await chromium.launch(headless=headless, proxy=playwright_proxy)  # type: ignore
            browser_context = await browser.new_context(
                viewport={"width": 1920, "height": 1080},
                user_agent=user_agent
            )
            return browser_context

    async def close(self):
        """Close the browser context and log completion."""
        await self.browser_context.close()
        utils.logger.info("[BaiduTieBaCrawler.close] Browser context closed ...")
|
|
@ -0,0 +1,18 @@
|
|||
from enum import Enum
|
||||
|
||||
|
||||
class SearchSortType(Enum):
    """Ordering options for Tieba keyword search results."""

    TIME_DESC = "1"  # newest first
    TIME_ASC = "0"  # oldest first
    RELEVANCE_ORDER = "2"  # most relevant first
|
||||
|
||||
|
||||
class SearchNoteType(Enum):
    """Which kinds of posts a Tieba search returns."""

    MAIN_THREAD = "1"  # thread starters only
    FIXED_THREAD = "0"  # mixed mode: threads plus replies
|
|
@ -0,0 +1,301 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
import html
|
||||
import json
|
||||
import re
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
from parsel import Selector
|
||||
|
||||
from constant import baidu_tieba as const
|
||||
from model.m_baidu_tieba import TiebaComment, TiebaNote
|
||||
from tools import utils
|
||||
|
||||
|
||||
class TieBaExtractor:
|
||||
def __init__(self):
    """Stateless extractor; nothing to initialize."""
    pass
|
||||
|
||||
@staticmethod
def extract_search_note_list(page_content: str) -> List[TiebaNote]:
    """
    Extract notes from a keyword-search result page.

    Search result rows do not carry reply counts or reply-page totals,
    so those TiebaNote fields keep their defaults.

    Args:
        page_content: HTML of the search result page

    Returns:
        One TiebaNote per ``div.s_post`` row.
    """
    rows = Selector(text=page_content).xpath("//div[@class='s_post']")
    notes: List[TiebaNote] = []
    for row in rows:
        notes.append(TiebaNote(
            note_id=row.xpath(".//span[@class='p_title']/a/@data-tid").get(default='').strip(),
            title=row.xpath(".//span[@class='p_title']/a/text()").get(default='').strip(),
            desc=row.xpath(".//div[@class='p_content']/text()").get(default='').strip(),
            note_url=const.TIEBA_URL + row.xpath(".//span[@class='p_title']/a/@href").get(default=''),
            user_nickname=row.xpath(".//a[starts-with(@href, '/home/main')]/font/text()").get(default='').strip(),
            user_link=const.TIEBA_URL + row.xpath(".//a[starts-with(@href, '/home/main')]/@href").get(default=''),
            tieba_name=row.xpath(".//a[@class='p_forum']/font/text()").get(default='').strip(),
            tieba_link=const.TIEBA_URL + row.xpath(".//a[@class='p_forum']/@href").get(default=''),
            publish_time=row.xpath(".//font[@class='p_green p_date']/text()").get(default='').strip(),
        ))
    return notes
|
||||
|
||||
def extract_tieba_note_list(self, page_content: str) -> List[TiebaNote]:
    """
    Extract the post list from a tieba (forum) index page.

    Args:
        page_content: HTML of the forum page

    Returns:
        One TiebaNote per usable ``#thread_list`` entry.
    """
    # Part of the thread list is shipped inside HTML comments; dropping the
    # comment openers makes those nodes visible to the parser.
    cleaned = page_content.replace('<!--', "")
    page = Selector(text=cleaned)
    notes: List[TiebaNote] = []
    for li in page.xpath("//ul[@id='thread_list']/li"):
        field_value: Dict = self.extract_data_field_value(li)
        if not field_value:
            # Entries without a data-field payload (ads, separators) are skipped.
            continue
        note_id = str(field_value.get("id"))
        notes.append(TiebaNote(
            note_id=note_id,
            title=li.xpath(".//a[@class='j_th_tit ']/text()").get(default='').strip(),
            desc=li.xpath(".//div[@class='threadlist_abs threadlist_abs_onlyline ']/text()").get(
                default='').strip(),
            note_url=const.TIEBA_URL + f"/p/{note_id}",
            user_link=const.TIEBA_URL + li.xpath(
                ".//a[@class='frs-author-name j_user_card ']/@href").get(default='').strip(),
            user_nickname=field_value.get("authoer_nickname") or field_value.get("author_name"),
            tieba_name=page.xpath("//a[@class='card_title_fname']/text()").get(default='').strip(),
            tieba_link=const.TIEBA_URL + page.xpath("//a[@class='card_title_fname']/@href").get(
                default=''),
            total_replay_num=field_value.get("reply_num", 0)
        ))
    return notes
|
||||
|
||||
def extract_note_detail(self, page_content: str) -> TiebaNote:
    """
    Extract a post's detail from its page HTML.

    Args:
        page_content: HTML of the post detail page

    Returns:
        A fully populated TiebaNote.
    """
    sel = Selector(text=page_content)
    first_floor = sel.xpath("//div[@class='p_postlist'][1]")
    # The "only view thread starter" link carries the note id in its path.
    only_view_author_link = sel.xpath("//*[@id='lzonly_cntn']/@href").get(default='').strip()
    note_id = only_view_author_link.split("?")[0].split("/")[-1]
    # Reply count and reply-page count (two red spans in the theme bar).
    thread_num_infos = sel.xpath(
        "//div[@id='thread_theme_5']//li[@class='l_reply_num']//span[@class='red']"
    )
    # IP location and publish time live in the post tail markup.
    other_info_content = sel.xpath(".//div[@class='post-tail-wrap']").get(default="").strip()
    ip_location, publish_time = self.extract_ip_and_pub_time(other_info_content)
    note = TiebaNote(
        note_id=note_id,
        title=sel.xpath("//title/text()").get(default='').strip(),
        desc=sel.xpath("//meta[@name='description']/@content").get(default='').strip(),
        note_url=const.TIEBA_URL + f"/p/{note_id}",
        user_link=const.TIEBA_URL + first_floor.xpath(".//a[@class='p_author_face ']/@href").get(
            default='').strip(),
        user_nickname=first_floor.xpath(".//a[@class='p_author_name j_user_card']/text()").get(
            default='').strip(),
        user_avatar=first_floor.xpath(".//a[@class='p_author_face ']/img/@src").get(default='').strip(),
        tieba_name=sel.xpath("//a[@class='card_title_fname']/text()").get(default='').strip(),
        tieba_link=const.TIEBA_URL + sel.xpath("//a[@class='card_title_fname']/@href").get(default=''),
        ip_location=ip_location,
        publish_time=publish_time,
        total_replay_num=thread_num_infos[0].xpath("./text()").get(default='').strip(),
        total_replay_page=thread_num_infos[1].xpath("./text()").get(default='').strip(),
    )
    # Strip the "【<forum>】_百度贴吧" suffix the <title> tag appends.
    note.title = note.title.replace(f"【{note.tieba_name}】_百度贴吧", "")
    return note
|
||||
|
||||
def extract_tieba_note_parment_comments(self, page_content: str, note_id: str) -> List[TiebaComment]:
    """
    提取贴吧帖子一级评论

    Args:
        page_content: 帖子详情页的原始 HTML
        note_id: 评论所属帖子的 ID

    Returns:
        解析出来的一级评论列表(可能为空)
    """
    content_selector = Selector(text=page_content)
    xpath_selector = "//div[@class='l_post l_post_bright j_l_post clearfix ']"
    comment_list = content_selector.xpath(xpath_selector)
    # 吧名是整页共享的,这里只解析一次;原实现在循环体内用 "//" 开头的
    # 绝对路径 xpath,每条评论都会重新扫描整棵文档树,结果完全相同。
    tieba_name = content_selector.xpath("//a[@class='card_title_fname']/text()").get(default='').strip()
    result: List[TiebaComment] = []
    for comment_selector in comment_list:
        comment_field_value: Dict = self.extract_data_field_value(comment_selector)
        if not comment_field_value:
            continue
        # data-field 中可能缺少 "content" 键;原实现直接 .get("content").get(...)
        # 会抛 AttributeError,这里兜底为空字典并跳过,避免单条脏数据中断整页解析
        content_field: Dict = comment_field_value.get("content") or {}
        if not content_field:
            continue
        other_info_content = comment_selector.xpath(".//div[@class='post-tail-wrap']").get(default="").strip()
        ip_location, publish_time = self.extract_ip_and_pub_time(other_info_content)
        tieba_comment = TiebaComment(
            comment_id=str(content_field.get("post_id")),
            sub_comment_count=content_field.get("comment_num"),
            content=utils.extract_text_from_html(content_field.get("content")),
            note_url=const.TIEBA_URL + f"/p/{note_id}",
            user_link=const.TIEBA_URL + comment_selector.xpath(".//a[@class='p_author_face ']/@href").get(
                default='').strip(),
            user_nickname=comment_selector.xpath(".//a[@class='p_author_name j_user_card']/text()").get(
                default='').strip(),
            user_avatar=comment_selector.xpath(".//a[@class='p_author_face ']/img/@src").get(
                default='').strip(),
            tieba_id=str(content_field.get("forum_id", "")),
            tieba_name=tieba_name,
            tieba_link=f"https://tieba.baidu.com/f?kw={tieba_name}",
            ip_location=ip_location,
            publish_time=publish_time,
            note_id=note_id,
        )
        result.append(tieba_comment)
    return result
|
||||
|
||||
def extract_tieba_note_sub_comments(self, page_content: str, parent_comment: TiebaComment) -> List[TiebaComment]:
    """
    提取贴吧帖子二级评论

    Args:
        page_content: 楼中楼(二级评论)分页的原始 HTML
        parent_comment: 所属的一级评论,用于补全帖子/贴吧关联字段

    Returns:
        解析出来的二级评论列表(可能为空)
    """
    selector = Selector(text=page_content)
    comments = []
    comment_ele_list = selector.xpath("//li[@class='lzl_single_post j_lzl_s_p first_no_border']")
    comment_ele_list.extend(selector.xpath("//li[@class='lzl_single_post j_lzl_s_p ']"))
    for comment_ele in comment_ele_list:
        comment_value = self.extract_data_field_value(comment_ele)
        if not comment_value:
            continue
        # HTML 结构异常时可能找不到用户头像链接节点;原实现直接 [0] 取值
        # 会抛 IndexError,这里跳过该条评论,避免中断整页解析
        user_anchor_list = comment_ele.xpath("./a[@class='j_user_card lzl_p_p']")
        if not user_anchor_list:
            continue
        comment_user_a_selector = user_anchor_list[0]
        content = utils.extract_text_from_html(
            comment_ele.xpath(".//span[@class='lzl_content_main']").get(default=""))
        comment = TiebaComment(
            comment_id=str(comment_value.get("spid")),
            content=content,
            user_link=comment_user_a_selector.xpath("./@href").get(default=""),
            user_nickname=comment_value.get("showname"),
            user_avatar=comment_user_a_selector.xpath("./img/@src").get(default=""),
            publish_time=comment_ele.xpath(".//span[@class='lzl_time']/text()").get(default="").strip(),
            parent_comment_id=parent_comment.comment_id,
            note_id=parent_comment.note_id,
            note_url=parent_comment.note_url,
            tieba_id=parent_comment.tieba_id,
            tieba_name=parent_comment.tieba_name,
            tieba_link=parent_comment.tieba_link
        )
        comments.append(comment)

    return comments
|
||||
|
||||
@staticmethod
def extract_ip_and_pub_time(html_content: str) -> Tuple[str, str]:
    """
    提取IP位置和发布时间

    Args:
        html_content: 包含 post-tail 信息的 HTML 片段

    Returns:
        (IP属地, 发布时间) 二元组;未匹配到的字段返回空字符串
    """
    ip_match = re.search(r'IP属地:(\S+)</span>', html_content)
    time_match = re.search(r'<span class="tail-info">(\d{4}-\d{2}-\d{2} \d{2}:\d{2})</span>', html_content)
    return (
        ip_match.group(1) if ip_match else "",
        time_match.group(1) if time_match else "",
    )
|
||||
|
||||
@staticmethod
def extract_data_field_value(selector: Selector) -> Dict:
    """
    提取data-field的值

    Args:
        selector: 带有 data-field 属性的节点选择器

    Returns:
        data-field 反序列化后的字典;属性缺失、为空或解析失败时返回空字典
    """
    data_field_value = selector.xpath("./@data-field").get(default='').strip()
    if not data_field_value or data_field_value == "{}":
        return {}
    try:
        # 先使用 html.unescape 处理转义字符 再json.loads 将 JSON 字符串转换为 Python 字典
        unescaped_json_str = html.unescape(data_field_value)
        data_field_dict_value = json.loads(unescaped_json_str)
    except Exception as ex:
        # 解析失败是可容忍的(调用方会跳过该条数据);改用项目统一的 logger
        # 记录而不是 print,并带上原始串便于排查
        utils.logger.error(
            f"[TieBaExtractor.extract_data_field_value] 解析 data-field 失败, 错误信息:{ex}, 原始值:{data_field_value}")
        data_field_dict_value = {}
    return data_field_dict_value
|
||||
|
||||
|
||||
def test_extract_search_note_list():
    """冒烟测试:解析搜索结果页 HTML 并打印提取结果"""
    with open("test_data/search_keyword_notes.html", "r", encoding="utf-8") as html_file:
        page_html = html_file.read()
    notes = TieBaExtractor().extract_search_note_list(page_html)
    print(notes)
|
||||
|
||||
|
||||
def test_extract_note_detail():
    """冒烟测试:解析帖子详情页 HTML 并打印模型内容"""
    with open("test_data/note_detail.html", "r", encoding="utf-8") as html_file:
        page_html = html_file.read()
    note = TieBaExtractor().extract_note_detail(page_html)
    print(note.model_dump())
|
||||
|
||||
|
||||
def test_extract_tieba_note_parment_comments():
    """冒烟测试:解析一级评论页 HTML 并打印提取结果"""
    with open("test_data/note_comments.html", "r", encoding="utf-8") as html_file:
        page_html = html_file.read()
    comments = TieBaExtractor().extract_tieba_note_parment_comments(page_html, "123456")
    print(comments)
|
||||
|
||||
|
||||
def test_extract_tieba_note_sub_comments():
    """冒烟测试:用一个伪造的父评论解析二级评论页 HTML 并打印提取结果"""
    with open("test_data/note_sub_comments.html", "r", encoding="utf-8") as html_file:
        page_html = html_file.read()
    # 构造一个占位的一级评论,仅用于填充二级评论的关联字段
    fake_parment_comment = TiebaComment(
        comment_id="123456",
        content="content",
        user_link="user_link",
        user_nickname="user_nickname",
        user_avatar="user_avatar",
        publish_time="publish_time",
        parent_comment_id="parent_comment_id",
        note_id="note_id",
        note_url="note_url",
        tieba_id="tieba_id",
        tieba_name="tieba_name",
    )
    sub_comments = TieBaExtractor().extract_tieba_note_sub_comments(page_html, fake_parment_comment)
    print(sub_comments)
|
||||
|
||||
|
||||
def test_extract_tieba_note_list():
    """冒烟测试:解析吧内帖子列表页 HTML 并打印提取结果"""
    with open("test_data/tieba_note_list.html", "r", encoding="utf-8") as f:
        content = f.read()
    extractor = TieBaExtractor()
    result = extractor.extract_tieba_note_list(content)
    # 修复:移除原先 print 之后多余的 pass(死代码)
    print(result)
|
||||
|
||||
|
||||
# 手动冒烟测试入口:按需取消注释运行对应的解析函数(依赖 test_data/ 下的本地 HTML 样例)
if __name__ == '__main__':
    # test_extract_search_note_list()
    # test_extract_note_detail()
    # test_extract_tieba_note_parment_comments()
    test_extract_tieba_note_list()
|
|
@ -0,0 +1,112 @@
|
|||
import asyncio
|
||||
import functools
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
from playwright.async_api import BrowserContext, Page
|
||||
from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
|
||||
wait_fixed)
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractLogin
|
||||
from tools import utils
|
||||
|
||||
|
||||
class BaiduTieBaLogin(AbstractLogin):
    """百度贴吧登录实现,支持二维码 / 手机号(未实现)/ Cookie 三种登录方式"""

    def __init__(self,
                 login_type: str,
                 browser_context: BrowserContext,
                 context_page: Page,
                 login_phone: Optional[str] = "",
                 cookie_str: str = ""
                 ):
        """
        Args:
            login_type: 登录方式 (qrcode | phone | cookie)
            browser_context: playwright 浏览器上下文
            context_page: 当前已打开的页面
            login_phone: 手机号(手机号登录时使用)
            cookie_str: cookie 字符串(cookie 登录时使用)
        """
        # NOTE: 与项目其它平台的 Login 实现保持一致,这里会覆写全局 config.LOGIN_TYPE
        config.LOGIN_TYPE = login_type
        self.browser_context = browser_context
        self.context_page = context_page
        self.login_phone = login_phone
        self.cookie_str = cookie_str

    @retry(stop=stop_after_attempt(600), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
    async def check_login_state(self) -> bool:
        """
        轮训检查登录状态是否成功,成功返回True否则返回False
        由 tenacity 以 1 秒间隔最多重试 600 次(约 10 分钟),返回 False 触发重试。

        Returns:
            bool: cookie 中存在 STOKEN 或 PTOKEN 即视为已登录
        """
        current_cookie = await self.browser_context.cookies()
        _, cookie_dict = utils.convert_cookies(current_cookie)
        stoken = cookie_dict.get("STOKEN")
        ptoken = cookie_dict.get("PTOKEN")
        return bool(stoken or ptoken)

    async def begin(self):
        """Start login baidutieba,按 config.LOGIN_TYPE 分发到具体登录方式"""
        utils.logger.info("[BaiduTieBaLogin.begin] Begin login baidutieba ...")
        if config.LOGIN_TYPE == "qrcode":
            await self.login_by_qrcode()
        elif config.LOGIN_TYPE == "phone":
            await self.login_by_mobile()
        elif config.LOGIN_TYPE == "cookie":
            await self.login_by_cookies()
        else:
            raise ValueError("[BaiduTieBaLogin.begin]Invalid Login Type Currently only supported qrcode or phone or cookies ...")

    async def login_by_mobile(self):
        """Login baidutieba by mobile(暂未实现)"""
        pass

    async def login_by_qrcode(self):
        """login baidutieba website and keep webdriver login state"""
        utils.logger.info("[BaiduTieBaLogin.login_by_qrcode] Begin login baidutieba by qrcode ...")
        qrcode_img_selector = "xpath=//img[@class='tang-pass-qrcode-img']"
        # find login qrcode
        base64_qrcode_img = await utils.find_login_qrcode(
            self.context_page,
            selector=qrcode_img_selector
        )
        if not base64_qrcode_img:
            utils.logger.info("[BaiduTieBaLogin.login_by_qrcode] login failed , have not found qrcode please check ....")
            # if this website does not automatically popup login dialog box, we will manual click login button
            await asyncio.sleep(0.5)
            login_button_ele = self.context_page.locator("xpath=//li[@class='u_login']")
            await login_button_ele.click()
            base64_qrcode_img = await utils.find_login_qrcode(
                self.context_page,
                selector=qrcode_img_selector
            )
            if not base64_qrcode_img:
                utils.logger.info("[BaiduTieBaLogin.login_by_qrcode] login failed , have not found qrcode please check ....")
                sys.exit()

        # show login qrcode
        # fix issue #12
        # we need to use partial function to call show_qrcode function and run in executor
        # then current asyncio event loop will not be blocked
        partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
        asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)

        # 无占位符的日志串不需要 f 前缀(原实现多了一个无意义的 f)
        utils.logger.info("[BaiduTieBaLogin.login_by_qrcode] waiting for scan code login, remaining time is 120s")
        try:
            await self.check_login_state()
        except RetryError:
            utils.logger.info("[BaiduTieBaLogin.login_by_qrcode] Login baidutieba failed by qrcode login method ...")
            sys.exit()

        wait_redirect_seconds = 5
        utils.logger.info(f"[BaiduTieBaLogin.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
        await asyncio.sleep(wait_redirect_seconds)

    async def login_by_cookies(self):
        """login baidutieba website by cookies"""
        utils.logger.info("[BaiduTieBaLogin.login_by_cookies] Begin login baidutieba by cookie ...")
        # 一次性批量写入全部 cookie,避免原实现逐条调用 add_cookies 的多次往返
        cookies = [
            {
                'name': key,
                'value': value,
                'domain': ".baidu.com",
                'path': "/"
            }
            for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items()
        ]
        await self.browser_context.add_cookies(cookies)
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,189 @@
|
|||
<li class="lzl_single_post j_lzl_s_p first_no_border" data-field='{"spid":150726504693,"showname":"heinzfrentzen","user_name":"heinzfrentzen","portrait":"tb.1.b08d8f12.IR-tbLlZ2GkD6ARA-mfGOA"}'>
|
||||
<a rel="noopener" name="150726504693"></a>
|
||||
<a rel="noopener" data-field='{"un":"heinzfrentzen","id":"tb.1.b08d8f12.IR-tbLlZ2GkD6ARA-mfGOA"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.b08d8f12.IR-tbLlZ2GkD6ARA-mfGOA&fr=pb" username="heinzfrentzen">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.b08d8f12.IR-tbLlZ2GkD6ARA-mfGOA"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":[],"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"heinzfrentzen","id":"tb.1.b08d8f12.IR-tbLlZ2GkD6ARA-mfGOA"}' href="/home/main?id=tb.1.b08d8f12.IR-tbLlZ2GkD6ARA-mfGOA&ie=utf-8&fr=pb" target="_blank" username="heinzfrentzen">heinzfrentzen</a>
|
||||
:
|
||||
<span class="lzl_content_main" data-username="">
|
||||
<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon25.png">
|
||||
<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon25.png">
|
||||
</span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:11</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_single_post j_lzl_s_p " data-field='{"spid":150726506822,"showname":"\u53ef\u7231\u7684\u642c\u8fd0\u5de594","user_name":"\u53ef\u7231\u7684\u642c\u8fd0\u5de594","portrait":"tb.1.f1b47a84.Rixjf6fMP-PfH8fnS1CgRA"}'>
|
||||
<a rel="noopener" name="150726506822"></a>
|
||||
<a rel="noopener" data-field='{"un":"\u53ef\u7231\u7684\u642c\u8fd0\u5de594","id":"tb.1.f1b47a84.Rixjf6fMP-PfH8fnS1CgRA"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.f1b47a84.Rixjf6fMP-PfH8fnS1CgRA&fr=pb" username="可爱的搬运工94">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.f1b47a84.Rixjf6fMP-PfH8fnS1CgRA"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":[],"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"\u53ef\u7231\u7684\u642c\u8fd0\u5de594","id":"tb.1.f1b47a84.Rixjf6fMP-PfH8fnS1CgRA"}' href="/home/main?id=tb.1.f1b47a84.Rixjf6fMP-PfH8fnS1CgRA&ie=utf-8&fr=pb" target="_blank" username="可爱的搬运工94">可爱的搬运工94</a>
|
||||
:<span class="lzl_content_main" data-username="">陈芋汐水花也不小 </span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:12</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_single_post j_lzl_s_p " data-field='{"spid":150726508024,"showname":"\u56fd\u9645\u4f53\u575b\u5de8\u661f\u9752\u6912\u8089\u4e1d","user_name":"\u8682\u8681\u96c5\u864e\u54c8\u54c8","portrait":"tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg"}'>
|
||||
<a rel="noopener" name="150726508024"></a>
|
||||
<a rel="noopener" data-field='{"un":"\u8682\u8681\u96c5\u864e\u54c8\u54c8","id":"tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg&fr=pb" username="蚂蚁雅虎哈哈">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":[],"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"\u8682\u8681\u96c5\u864e\u54c8\u54c8","id":"tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg"}' href="/home/main?id=tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg&ie=utf-8&fr=pb" target="_blank" username="蚂蚁雅虎哈哈">国际体坛巨星青椒肉丝</a>
|
||||
:<span class="lzl_content_main" data-username="">你怀孕了吗 老是呕吐 </span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:12</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_single_post j_lzl_s_p " data-field='{"spid":150726509762,"showname":"\u8317\u82b1\u5c11\u5e05","user_name":"\u8317\u82b1\u5c11\u5e05","portrait":"tb.1.a0b6ca3c.54TCKizU2c9oSYWqNF7NqA"}'>
|
||||
<a rel="noopener" name="150726509762"></a>
|
||||
<a rel="noopener" data-field='{"un":"\u8317\u82b1\u5c11\u5e05","id":"tb.1.a0b6ca3c.54TCKizU2c9oSYWqNF7NqA"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.a0b6ca3c.54TCKizU2c9oSYWqNF7NqA&fr=pb" username="茗花少帅">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.a0b6ca3c.54TCKizU2c9oSYWqNF7NqA"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":{"all_level":{"2":{"end_time":"1421248220","level":2,"pic_url":"http:\/\/imgsrc.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg","score_limit":8000}},"level":{"end_time":"1421248220","pic_url":"http:\/\/imgsrc.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg","props_id":2}},"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"\u8317\u82b1\u5c11\u5e05","id":"tb.1.a0b6ca3c.54TCKizU2c9oSYWqNF7NqA"}' href="/home/main?id=tb.1.a0b6ca3c.54TCKizU2c9oSYWqNF7NqA&ie=utf-8&fr=pb" target="_blank" username="茗花少帅">茗花少帅</a>
|
||||
:<span class="lzl_content_main" data-username="">你就只看水花,不看空中姿态吗 </span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:12</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_single_post j_lzl_s_p " data-field='{"spid":150726510645,"showname":"\u4e1c\u534e\u6b66\u5170","user_name":"\u897f\u5b89\u4ea4\u5927\u524d\u4e00\u767e","portrait":"tb.1.774316af.RqsfwTN2w3AJQFmXAO_MHw"}'>
|
||||
<a rel="noopener" name="150726510645"></a>
|
||||
<a rel="noopener" data-field='{"un":"\u897f\u5b89\u4ea4\u5927\u524d\u4e00\u767e","id":"tb.1.774316af.RqsfwTN2w3AJQFmXAO_MHw"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.774316af.RqsfwTN2w3AJQFmXAO_MHw&fr=pb" username="西安交大前一百">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.774316af.RqsfwTN2w3AJQFmXAO_MHw"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":{"all_level":{"2":{"end_time":"1644033630","level":2,"pic_url":"http:\/\/imgsrc.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg","score_limit":8000}},"level":{"end_time":"1644033630","pic_url":"http:\/\/imgsrc.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg","props_id":2}},"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"\u897f\u5b89\u4ea4\u5927\u524d\u4e00\u767e","id":"tb.1.774316af.RqsfwTN2w3AJQFmXAO_MHw"}' href="/home/main?id=tb.1.774316af.RqsfwTN2w3AJQFmXAO_MHw&ie=utf-8&fr=pb" target="_blank" username="西安交大前一百">东华武兰</a>
|
||||
:<span class="lzl_content_main" data-username="">经典只看水花 </span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:12</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_single_post j_lzl_s_p " data-field='{"spid":150726514057,"showname":"\u4e0a\u4e0b\u73ed\u8981\u6ce8\u610f","user_name":"\u4e0a\u4e0b\u73ed\u8981\u6ce8\u610f","portrait":"tb.1.bcab9641.aHxSViAprkm6E0KQWrw3pg"}'>
|
||||
<a rel="noopener" name="150726514057"></a>
|
||||
<a rel="noopener" data-field='{"un":"\u4e0a\u4e0b\u73ed\u8981\u6ce8\u610f","id":"tb.1.bcab9641.aHxSViAprkm6E0KQWrw3pg"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.bcab9641.aHxSViAprkm6E0KQWrw3pg&fr=pb" username="上下班要注意">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.bcab9641.aHxSViAprkm6E0KQWrw3pg"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":[],"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"\u4e0a\u4e0b\u73ed\u8981\u6ce8\u610f","id":"tb.1.bcab9641.aHxSViAprkm6E0KQWrw3pg"}' href="/home/main?id=tb.1.bcab9641.aHxSViAprkm6E0KQWrw3pg&ie=utf-8&fr=pb" target="_blank" username="上下班要注意">上下班要注意</a>
|
||||
:<span class="lzl_content_main" data-username="">额,分数正常吧 </span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:13</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_single_post j_lzl_s_p " data-field='{"spid":150726520372,"showname":"\u9759\u770b\u8682\u8681\u4e0a\u6811","user_name":"\u9759\u770b\u8682\u8681\u4e0a\u6811","portrait":"tb.1.7ea539b2.dHz6uxKdbItmtGkwZeV6oQ"}'>
|
||||
<a rel="noopener" name="150726520372"></a>
|
||||
<a rel="noopener" data-field='{"un":"\u9759\u770b\u8682\u8681\u4e0a\u6811","id":"tb.1.7ea539b2.dHz6uxKdbItmtGkwZeV6oQ"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.7ea539b2.dHz6uxKdbItmtGkwZeV6oQ&fr=pb" username="静看蚂蚁上树">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.7ea539b2.dHz6uxKdbItmtGkwZeV6oQ"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":[],"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"\u9759\u770b\u8682\u8681\u4e0a\u6811","id":"tb.1.7ea539b2.dHz6uxKdbItmtGkwZeV6oQ"}' href="/home/main?id=tb.1.7ea539b2.dHz6uxKdbItmtGkwZeV6oQ&ie=utf-8&fr=pb" target="_blank" username="静看蚂蚁上树">静看蚂蚁上树</a>
|
||||
:
|
||||
<span class="lzl_content_main" data-username="">
|
||||
回复 <a href="http://tieba.baidu.com/i/sys/jump?un= " onclick="Stats.sendRequest('fr=tb0_forum&st_mod=pb&st_value=atlink');" onmouseover="showattip(this)" onmouseout="hideattip(this)" username=" " portrait="tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg" target="_blank" class="at">国际体坛巨星青椒肉丝</a>
|
||||
:吃酸黄瓜吃多了<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon22.png">
|
||||
<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon22.png">
|
||||
<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon22.png">
|
||||
</span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:14</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_single_post j_lzl_s_p " data-field='{"spid":150726524963,"showname":"\u4e0d\u61c2\u53d6\u5565\u540d\u5b57\ud83d\ude1c","user_name":"\u9ec4\u5c0f\u6e2forz","portrait":"tb.1.e74fa44d.lLp46IIhj8NhhHk12z_qRA"}'>
|
||||
<a rel="noopener" name="150726524963"></a>
|
||||
<a rel="noopener" data-field='{"un":"\u9ec4\u5c0f\u6e2forz","id":"tb.1.e74fa44d.lLp46IIhj8NhhHk12z_qRA"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.e74fa44d.lLp46IIhj8NhhHk12z_qRA&fr=pb" username="黄小港orz">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.e74fa44d.lLp46IIhj8NhhHk12z_qRA"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":[],"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"\u9ec4\u5c0f\u6e2forz","id":"tb.1.e74fa44d.lLp46IIhj8NhhHk12z_qRA"}' href="/home/main?id=tb.1.e74fa44d.lLp46IIhj8NhhHk12z_qRA&ie=utf-8&fr=pb" target="_blank" username="黄小港orz">不懂取啥名字😜</a>
|
||||
:
|
||||
<span class="lzl_content_main" data-username="">
|
||||
请你去跟国际泳联投诉<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon22.png">
|
||||
</span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:15</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_single_post j_lzl_s_p " data-field='{"spid":150726535666,"showname":"\ud83d\udcab\u6cfd\u8d6b\u62c9\ud83d\udcaf","user_name":"\u5feb\u770b\u5361\u5361\u5361\u5361","portrait":"tb.1.5f510507.B4GLS91flqmWc5QXoaRCoQ"}'>
|
||||
<a rel="noopener" name="150726535666"></a>
|
||||
<a rel="noopener" data-field='{"un":"\u5feb\u770b\u5361\u5361\u5361\u5361","id":"tb.1.5f510507.B4GLS91flqmWc5QXoaRCoQ"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.5f510507.B4GLS91flqmWc5QXoaRCoQ&fr=pb" username="快看卡卡卡卡">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.5f510507.B4GLS91flqmWc5QXoaRCoQ"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":{"all_level":{"2":{"end_time":"1539783937","level":2,"pic_url":"http:\/\/imgsrc.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg","score_limit":8000}},"level":{"end_time":"1539783937","pic_url":"http:\/\/imgsrc.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg","props_id":2}},"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"\u5feb\u770b\u5361\u5361\u5361\u5361","id":"tb.1.5f510507.B4GLS91flqmWc5QXoaRCoQ"}' href="/home/main?id=tb.1.5f510507.B4GLS91flqmWc5QXoaRCoQ&ie=utf-8&fr=pb" target="_blank" username="快看卡卡卡卡">💫泽赫拉💯</a>
|
||||
:<span class="lzl_content_main" data-username="">第五跳陈空中分腿了,空中姿态明显全红婵更好 </span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:17</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_single_post j_lzl_s_p " data-field='{"spid":150726536076,"showname":"\u55ef\u55ef\u54e6\u54e6\u554a\u554a\ud83d\udc36","user_name":"\u55ef\u55ef\u54e6\u54e6\u554a\u554a\u54fc","portrait":"tb.1.ba071e03._M1o8S5FX4p57pZBJa91CQ"}'>
|
||||
<a rel="noopener" name="150726536076"></a>
|
||||
<a rel="noopener" data-field='{"un":"\u55ef\u55ef\u54e6\u54e6\u554a\u554a\u54fc","id":"tb.1.ba071e03._M1o8S5FX4p57pZBJa91CQ"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.ba071e03._M1o8S5FX4p57pZBJa91CQ&fr=pb" username="嗯嗯哦哦啊啊哼">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.ba071e03._M1o8S5FX4p57pZBJa91CQ"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":null,"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"\u55ef\u55ef\u54e6\u54e6\u554a\u554a\u54fc","id":"tb.1.ba071e03._M1o8S5FX4p57pZBJa91CQ"}' href="/home/main?id=tb.1.ba071e03._M1o8S5FX4p57pZBJa91CQ&ie=utf-8&fr=pb" target="_blank" username="嗯嗯哦哦啊啊哼">嗯嗯哦哦啊啊🐶</a>
|
||||
:
|
||||
<span class="lzl_content_main" data-username="">
|
||||
回复 <a href="http://tieba.baidu.com/i/sys/jump?un= " onclick="Stats.sendRequest('fr=tb0_forum&st_mod=pb&st_value=atlink');" onmouseover="showattip(this)" onmouseout="hideattip(this)" username=" " portrait="tb.1.84497425.b5GLK5lGm90mTB2BhjrgpA" target="_blank" class="at">美味蟹黄堡💞</a>
|
||||
:你不会看起跳高度和空中姿态?
|
||||
</span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:17</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_li_pager j_lzl_l_p lzl_li_pager_s" data-field='{"total_num":16,"total_page":2}'>
|
||||
<a rel="noopener" class="j_lzl_p btn-sub btn-small pull-right" href="##">
|
||||
<i class="icon-reply"></i>
|
||||
我也说一句
|
||||
</a>
|
||||
<p class="j_pager l_pager pager_theme_2">
|
||||
<span class="tP">1</span>
|
||||
<a href="#2">2</a>
|
||||
<a href="#2">下一页</a>
|
||||
<a href="#2">尾页</a>
|
||||
</p>
|
||||
</li>
|
|
@ -0,0 +1,96 @@
|
|||
<div class="s_post_list">
|
||||
<div class="s_post"><span class="p_title"><a data-tid="9117888152" data-fid="26976424" class="bluelink"
|
||||
href="/p/9117888152?pid=150718967291&cid=0#150718967291"
|
||||
target="_blank">武汉交互空间科技:富士康10亿加码中国大陆,印度为何逐渐“失宠</a></span>
|
||||
<div class="p_content">
|
||||
全球知名的电子制造服务巨头富士康的母公司鸿海精密工业股份有限公司正式对外发布了一则重大投资公告,富士康将在郑州投资建设新事业总部大楼,承载新事业总部功能。这一战略举措不仅彰显了富士康对中国市场持续深化的承诺与信心,也预示着该集团业务版图的新一轮扩张与升级。
|
||||
项目一期选址位于郑东新区,建筑面积约700公亩,总投资约10亿元人民币。主要建设总部管理中心、研发中心和工程中心、战略产业发展中心、战略产业金融平台、
|
||||
</div>
|
||||
贴吧:<a data-fid="26976424" class="p_forum" href="/f?kw=%CE%E4%BA%BA%BD%BB%BB%A5%BF%D5%BC%E4"
|
||||
target="_blank"><font class="p_violet">武汉交互空间</font></a>作者:<a
|
||||
href="/home/main?un=VR%D0%E9%C4%E2%B4%EF%C8%CB" target="_blank"><font class="p_violet">VR虚拟达人</font></a>
|
||||
<font class="p_green p_date">2024-08-05 16:45</font></div>
|
||||
<div class="s_post"><span class="p_title"><a data-tid="9114743782" data-fid="90367" class="bluelink"
|
||||
href="/p/9114743782?pid=150705176739&cid=0#150705176739"
|
||||
target="_blank">请各位急用玛尼的小心,骗子最多</a></span>
|
||||
<div class="p_content">
|
||||
这里面到处是骗子,大家小心。特别那些叫出村背货的,基本是卖园区,天下没有那么好的事。就是有这好事,我们在边境上的人,比你们最清楚,轮不到你们,边境上比你们胆子大的人大把,你一不熟悉小路,为什么叫你带货。东南亚带货的集结地,一般在南宁,防城港,昆明,西双版纳,临沧然后师机接了走小路出去,南宁,防城港坐船出去。好多都是二十几手的中介,之前卖园区一个三十万,现在不知道行情,但好多园区不收
|
||||
</div>
|
||||
贴吧:<a data-fid="90367" class="p_forum" href="/f?kw=%B1%B3%B0%FC%BF%CD" target="_blank"><font class="p_violet">背包客</font></a>作者:<a
|
||||
href="/home/main?un=%CC%F9%B0%C9%D3%C3%BB%A7_GC64AUS" target="_blank"><font class="p_violet">贴吧用户_GC64AUS</font></a>
|
||||
<font class="p_green p_date">2024-08-03 07:35</font></div>
|
||||
<div class="s_post"><span class="p_title"><a data-tid="9095684158" data-fid="1388265" class="bluelink"
|
||||
href="/p/9095684158?pid=150616716870&cid=0#150616716870"
|
||||
target="_blank">*2025泰国冷链制冷运输展*东南亚外贸出口</a></span>
|
||||
<div class="p_content">**2025泰国曼谷国际冷库、空调制冷、仓储暨冷链运输展 *2025泰国冷链制冷运输展*东南亚外贸出口-观展游览考察
|
||||
展出时间:2025-7月(具体时间待定) 展出地点:泰国曼谷会展中心 展会周期:一年一届 组展单位:北京励航国际商务会展有限公司
|
||||
人员跟团观展补贴!为您节省成本,寻找适合您的市场:
|
||||
本公司为您提供观展考察机会,让您在大型展会上获得世界同行**科技的资料同时,感受异域文化气息。展会现场走展考察→→当地游览→→当地相关市
|
||||
</div>
|
||||
贴吧:<a data-fid="1388265" class="p_forum" href="/f?kw=%B9%FA%BC%CA%D5%B9%BB%E1" target="_blank"><font
|
||||
class="p_violet">国际展会</font></a>作者:<a href="/home/main?un=zhaot_188" target="_blank"><font
|
||||
class="p_violet">zhaot_188</font></a> <font class="p_green p_date">2024-07-19 15:44</font></div>
|
||||
<div class="s_post"><span class="p_title"><a data-tid="9093564752" data-fid="27984246" class="bluelink"
|
||||
href="/p/9093564752?pid=150606964195&cid=0#150606964195"
|
||||
target="_blank">京湘楼创始人肖鑫:创立于北京,植根长沙,百年美食传承</a></span>
|
||||
<div class="p_content">来源标题:京湘楼创始人肖鑫:创立于北京,植根长沙,百年美食传承 京湘楼(KING HERO)品牌创始人:肖鑫
|
||||
京湘楼,KING
|
||||
HERO,集酱板鸭、肥肠、鸭头、鸭脖、鸭肠、小龙虾、牛蛙、捆鸡、鸡爪、鱼嘴巴、鱼尾、鱿鱼、牛肉、猪头肉等特色食品卤制,加工、包装与生产经营。2022年3月在北京朝阳区双井开设了第一家“京湘楼·鲜卤集市”卤味熟食快餐店,2023年5月在湖南省长沙市开福区注册成立了“长沙京湘楼品牌管理有限公司”,以“京湘楼”作为品
|
||||
</div>
|
||||
贴吧:<a data-fid="27984246" class="p_forum" href="/f?kw=%BE%A9%CF%E6%C2%A5" target="_blank"><font
|
||||
class="p_violet">京湘楼</font></a>作者:<a href="/home/main?un=%CC%EC%C9%F1%B6%C9%B3%BE" target="_blank"><font
|
||||
class="p_violet">天神渡尘</font></a> <font class="p_green p_date">2024-07-17 23:43</font></div>
|
||||
<div class="s_post"><span class="p_title"><a data-tid="9088419293" data-fid="310" class="bluelink"
|
||||
href="/p/9088419293?pid=150582471307&cid=0#150582471307"
|
||||
target="_blank">广州能争取到迪士尼与环球落户吗?</a></span>
|
||||
<div class="p_content">
|
||||
不是二选一,而是全都要。上一组数据,上海迪士尼2016年开业就接待游客超过1.2亿人次,香港迪士尼2023全年游客人数才640万人次,约等于无,这么低的入园人次已经引来迪士尼方面的不悦。
|
||||
美国有两个迪士尼,说实话迪士尼的门票并不高,普通人都去的起,中国完全有能力建两到三个迪士尼,欧洲只有第一个迪士尼,因为它的人口只有中国的一半,假设中国人一年吃一包盐,一年就是14包,那么欧洲就是七亿包盐,盐再便宜,欧洲人也不可能一人吃
|
||||
</div>
|
||||
贴吧:<a data-fid="310" class="p_forum" href="/f?kw=%B5%D8%C0%ED" target="_blank"><font
|
||||
class="p_violet">地理</font></a>作者:<a href="/home/main?un=SeaRoutes" target="_blank"><font
|
||||
class="p_violet">SeaRoutes</font></a> <font class="p_green p_date">2024-07-13 20:17</font></div>
|
||||
<div class="s_post"><span class="p_title"><a data-tid="9088416365" data-fid="7561034" class="bluelink"
|
||||
href="/p/9088416365?pid=150582456551&cid=0#150582456551"
|
||||
target="_blank">#城市GDP#广州应该全力去争取迪士尼和环球影城</a></span>
|
||||
<div class="p_content">
|
||||
不是二选一,而是全都要。上一组数据,上海迪士尼2016年开业就接待游客超过1.2亿人次,香港迪士尼2023全年游客人数才640万人次,约等于无,这么低的入园人次已经引来迪士尼方面的不悦。
|
||||
美国有两个迪士尼,说实话迪士尼的门票并不高,普通人都去的起,中国完全有能力建两到三个迪士尼,欧洲只有第一个迪士尼,因为它的人口只有中国的一半,假设中国人一年吃一包盐,一年就是14包,那么欧洲就是七亿包盐,盐再便宜,欧洲人也不可能一人吃
|
||||
</div>
|
||||
贴吧:<a data-fid="7561034" class="p_forum" href="/f?kw=%B3%C7%CA%D0gdp" target="_blank"><font class="p_violet">城市gdp</font></a>作者:<a
|
||||
href="/home/main?un=SeaRoutes" target="_blank"><font class="p_violet">SeaRoutes</font></a> <font
|
||||
class="p_green p_date">2024-07-13 20:14</font></div>
|
||||
<div class="s_post"><span class="p_title"><a data-tid="9087419039" data-fid="46374" class="bluelink"
|
||||
href="/p/9087419039?pid=150577861626&cid=0#150577861626"
|
||||
target="_blank">云南省首批《云南日报》昆明新闻头条聚焦阳宗海省级物流枢纽建设</a></span>
|
||||
<div class="p_content">
|
||||
7月11日《云南日报》昆明新闻头条刊发文章《阳宗海风景名胜区立足“衔接西部陆海新通道与中老铁路”优势——加速28个物流枢纽设施建设》聚焦昆明阳宗海风景名胜区系统推进省级物流枢纽建设和功能提升深挖比较优势壮大物流产业据云南省发展和改革委员会在昆明召开的新闻发布会上公布,今年全省共有5地纳入云南省第一批省级物流枢纽和省级骨干冷链物流基地建设名单,其中,昆明市有两家获批,阳宗海物流枢纽上榜!一起来看近日,云南省
|
||||
</div>
|
||||
贴吧:<a data-fid="46374" class="p_forum" href="/f?kw=%C0%A5%C3%F7" target="_blank"><font
|
||||
class="p_violet">昆明</font></a>作者:<a href="/home/main?un=%8F%EC" target="_blank"><font
|
||||
class="p_violet">忟</font></a> <font class="p_green p_date">2024-07-12 23:04</font></div>
|
||||
<div class="s_post"><span class="p_title"><a data-tid="9085102046" data-fid="348713" class="bluelink"
|
||||
href="/p/9085102046?pid=150567555367&cid=0#150567555367"
|
||||
target="_blank">寻找弟弟,很久没跟家里联系</a></span>
|
||||
<div class="p_content">Kk四期世纪园区,寻找弟弟,外号大佐,F3 2楼,公司cj集团</div>
|
||||
贴吧:<a data-fid="348713" class="p_forum" href="/f?kw=%B6%AB%C4%CF%D1%C7" target="_blank"><font
|
||||
class="p_violet">东南亚</font></a>作者:<a href="/home/main?un=%CC%F9%B0%C9%D3%C3%BB%A7_GC2CtRa"
|
||||
target="_blank"><font class="p_violet">贴吧用户_GC2CtRa</font></a>
|
||||
<font class="p_green p_date">2024-07-11 07:53</font></div>
|
||||
<div class="s_post"><span class="p_title"><a data-tid="9083888071" data-fid="30" class="bluelink"
|
||||
href="/p/9083888071?pid=150562129935&cid=0#150562129935"
|
||||
target="_blank">拉美 非洲 东南亚 南亚等发展中国家不太可能普及八小时双休吧?</a></span>
|
||||
<div class="p_content">拉美 和 东南亚的泰国 之类的连毒枭和黑色产业都管不好感觉普及八小时双休不太可能 缅甸和非洲军阀林立
|
||||
跟军阀谈八小时双休那么不开玩笑?缅北诈骗园区就能看出来。
|
||||
</div>
|
||||
贴吧:<a data-fid="30" class="p_forum" href="/f?kw=%C0%FA%CA%B7" target="_blank"><font
|
||||
class="p_violet">历史</font></a>作者:<a href="/home/main?un=yoursagain" target="_blank"><font
|
||||
class="p_violet">yoursagain</font></a> <font class="p_green p_date">2024-07-10 09:00</font></div>
|
||||
<div class="s_post"><span class="p_title"><a data-tid="9071937582" data-fid="8103241" class="bluelink"
|
||||
href="/p/9071937582?pid=150510120873&cid=0#150510120873"
|
||||
target="_blank">东南亚,园区【 工 价 低 】</a></span>
|
||||
<div class="p_content"></div>
|
||||
贴吧:<a data-fid="8103241" class="p_forum" href="/f?kw=%D4%B0%C7%F8%D5%D0%C9%CC" target="_blank"><font
|
||||
class="p_violet">园区招商</font></a>作者:<a href="/home/main?un=QQ59052966" target="_blank"><font
|
||||
class="p_violet">QQ59052966</font></a> <font class="p_green p_date">2024-06-30 12:09</font></div>
|
||||
</div>
|
File diff suppressed because one or more lines are too long
|
@ -0,0 +1 @@
|
|||
# -*- coding: utf-8 -*-
|
|
@ -0,0 +1,45 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
from typing import Optional
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class TiebaNote(BaseModel):
    """
    A Baidu Tieba post (a thread on tieba.baidu.com).

    Fields marked required (``...``) must be supplied by the page parser;
    the remaining fields default to an empty string / zero when the value
    is absent from the scraped page. The Chinese ``description`` strings
    are runtime metadata consumed downstream and are left untouched.
    """
    # NOTE(review): "replay" below is a typo for "reply"; renaming the two
    # fields would break existing callers / stored rows, so it is kept as-is.
    note_id: str = Field(..., description="帖子ID")  # unique post (thread) ID
    title: str = Field(..., description="帖子标题")  # post title
    desc: str = Field(default="", description="帖子描述")  # post summary/excerpt
    note_url: str = Field(..., description="帖子链接")  # URL of the post detail page
    publish_time: str = Field(default="", description="发布时间")  # publish time as scraped text, not parsed to datetime
    user_link: str = Field(default="", description="用户主页链接")  # author profile URL
    user_nickname: str = Field(default="", description="用户昵称")  # author display name
    user_avatar: str = Field(default="", description="用户头像地址")  # author avatar URL
    tieba_name: str = Field(..., description="贴吧名称")  # name of the tieba (forum) the post belongs to
    tieba_link: str = Field(..., description="贴吧链接")  # URL of that tieba
    total_replay_num: int = Field(default=0, description="回复总数")  # total reply count
    total_replay_page: int = Field(default=0, description="回复总页数")  # total reply pages
    # Optional, but defaults to "" rather than None — consumers should treat
    # empty string as "unknown location".
    ip_location: Optional[str] = Field(default="", description="IP地理位置")
|
||||
|
||||
|
||||
class TiebaComment(BaseModel):
    """
    A Baidu Tieba comment (a reply within a post thread).

    ``parent_comment_id`` is empty for top-level replies and holds the
    parent's ID for nested (floor) replies. Fields marked required (``...``)
    must be supplied by the page parser; the rest default to an empty
    string / zero when missing. The Chinese ``description`` strings are
    runtime metadata consumed downstream and are left untouched.
    """

    comment_id: str = Field(..., description="评论ID")  # unique comment ID
    parent_comment_id: str = Field(default="", description="父评论ID")  # "" when this is a top-level comment
    content: str = Field(..., description="评论内容")  # comment body text
    user_link: str = Field(default="", description="用户主页链接")  # commenter profile URL
    user_nickname: str = Field(default="", description="用户昵称")  # commenter display name
    user_avatar: str = Field(default="", description="用户头像地址")  # commenter avatar URL
    publish_time: str = Field(default="", description="发布时间")  # publish time as scraped text, not parsed to datetime
    # Optional, but defaults to "" rather than None — empty means "unknown".
    ip_location: Optional[str] = Field(default="", description="IP地理位置")
    sub_comment_count: int = Field(default=0, description="子评论数")  # number of nested replies
    note_id: str = Field(..., description="帖子ID")  # ID of the post this comment belongs to
    note_url: str = Field(..., description="帖子链接")  # URL of that post
    tieba_id: str = Field(..., description="所属的贴吧ID")  # ID of the tieba (forum)
    tieba_name: str = Field(..., description="所属的贴吧名称")  # name of the tieba
    tieba_link: str = Field(..., description="贴吧链接")  # URL of the tieba
|
||||
|
|
@ -0,0 +1 @@
|
|||
# -*- coding: utf-8 -*-
|
|
@ -0,0 +1 @@
|
|||
# -*- coding: utf-8 -*-
|
|
@ -0,0 +1 @@
|
|||
# -*- coding: utf-8 -*-
|
|
@ -0,0 +1 @@
|
|||
# -*- coding: utf-8 -*-
|
|
@ -13,4 +13,4 @@ python-dotenv==1.0.1
|
|||
jieba==0.42.1
|
||||
wordcloud==1.9.3
|
||||
matplotlib==3.9.0
|
||||
requests==2.32.3
|
||||
requests==2.32.3
|
||||
|
|
|
@ -2,192 +2,200 @@
|
|||
-- Table structure for bilibili_video
|
||||
-- ----------------------------
|
||||
DROP TABLE IF EXISTS `bilibili_video`;
|
||||
CREATE TABLE `bilibili_video` (
|
||||
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
|
||||
`user_id` varchar(64) DEFAULT NULL COMMENT '用户ID',
|
||||
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
|
||||
`avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址',
|
||||
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
|
||||
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
|
||||
`video_id` varchar(64) NOT NULL COMMENT '视频ID',
|
||||
`video_type` varchar(16) NOT NULL COMMENT '视频类型',
|
||||
`title` varchar(500) DEFAULT NULL COMMENT '视频标题',
|
||||
`desc` longtext COMMENT '视频描述',
|
||||
`create_time` bigint NOT NULL COMMENT '视频发布时间戳',
|
||||
`liked_count` varchar(16) DEFAULT NULL COMMENT '视频点赞数',
|
||||
`video_play_count` varchar(16) DEFAULT NULL COMMENT '视频播放数量',
|
||||
`video_danmaku` varchar(16) DEFAULT NULL COMMENT '视频弹幕数量',
|
||||
`video_comment` varchar(16) DEFAULT NULL COMMENT '视频评论数量',
|
||||
`video_url` varchar(512) DEFAULT NULL COMMENT '视频详情URL',
|
||||
`video_cover_url` varchar(512) DEFAULT NULL COMMENT '视频封面图 URL',
|
||||
PRIMARY KEY (`id`),
|
||||
KEY `idx_bilibili_vi_video_i_31c36e` (`video_id`),
|
||||
KEY `idx_bilibili_vi_create__73e0ec` (`create_time`)
|
||||
CREATE TABLE `bilibili_video`
|
||||
(
|
||||
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
|
||||
`user_id` varchar(64) DEFAULT NULL COMMENT '用户ID',
|
||||
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
|
||||
`avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址',
|
||||
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
|
||||
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
|
||||
`video_id` varchar(64) NOT NULL COMMENT '视频ID',
|
||||
`video_type` varchar(16) NOT NULL COMMENT '视频类型',
|
||||
`title` varchar(500) DEFAULT NULL COMMENT '视频标题',
|
||||
`desc` longtext COMMENT '视频描述',
|
||||
`create_time` bigint NOT NULL COMMENT '视频发布时间戳',
|
||||
`liked_count` varchar(16) DEFAULT NULL COMMENT '视频点赞数',
|
||||
`video_play_count` varchar(16) DEFAULT NULL COMMENT '视频播放数量',
|
||||
`video_danmaku` varchar(16) DEFAULT NULL COMMENT '视频弹幕数量',
|
||||
`video_comment` varchar(16) DEFAULT NULL COMMENT '视频评论数量',
|
||||
`video_url` varchar(512) DEFAULT NULL COMMENT '视频详情URL',
|
||||
`video_cover_url` varchar(512) DEFAULT NULL COMMENT '视频封面图 URL',
|
||||
PRIMARY KEY (`id`),
|
||||
KEY `idx_bilibili_vi_video_i_31c36e` (`video_id`),
|
||||
KEY `idx_bilibili_vi_create__73e0ec` (`create_time`)
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='B站视频';
|
||||
|
||||
-- ----------------------------
|
||||
-- Table structure for bilibili_video_comment
|
||||
-- ----------------------------
|
||||
DROP TABLE IF EXISTS `bilibili_video_comment`;
|
||||
CREATE TABLE `bilibili_video_comment` (
|
||||
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
|
||||
`user_id` varchar(64) DEFAULT NULL COMMENT '用户ID',
|
||||
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
|
||||
`avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址',
|
||||
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
|
||||
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
|
||||
`comment_id` varchar(64) NOT NULL COMMENT '评论ID',
|
||||
`video_id` varchar(64) NOT NULL COMMENT '视频ID',
|
||||
`content` longtext COMMENT '评论内容',
|
||||
`create_time` bigint NOT NULL COMMENT '评论时间戳',
|
||||
`sub_comment_count` varchar(16) NOT NULL COMMENT '评论回复数',
|
||||
PRIMARY KEY (`id`),
|
||||
KEY `idx_bilibili_vi_comment_41c34e` (`comment_id`),
|
||||
KEY `idx_bilibili_vi_video_i_f22873` (`video_id`)
|
||||
CREATE TABLE `bilibili_video_comment`
|
||||
(
|
||||
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
|
||||
`user_id` varchar(64) DEFAULT NULL COMMENT '用户ID',
|
||||
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
|
||||
`avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址',
|
||||
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
|
||||
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
|
||||
`comment_id` varchar(64) NOT NULL COMMENT '评论ID',
|
||||
`video_id` varchar(64) NOT NULL COMMENT '视频ID',
|
||||
`content` longtext COMMENT '评论内容',
|
||||
`create_time` bigint NOT NULL COMMENT '评论时间戳',
|
||||
`sub_comment_count` varchar(16) NOT NULL COMMENT '评论回复数',
|
||||
PRIMARY KEY (`id`),
|
||||
KEY `idx_bilibili_vi_comment_41c34e` (`comment_id`),
|
||||
KEY `idx_bilibili_vi_video_i_f22873` (`video_id`)
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='B 站视频评论';
|
||||
|
||||
-- ----------------------------
|
||||
-- Table structure for bilibili_up_info
|
||||
-- ----------------------------
|
||||
DROP TABLE IF EXISTS `bilibili_up_info`;
|
||||
CREATE TABLE `bilibili_up_info` (
|
||||
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
|
||||
`user_id` varchar(64) DEFAULT NULL COMMENT '用户ID',
|
||||
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
|
||||
`avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址',
|
||||
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
|
||||
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
|
||||
`total_fans` bigint DEFAULT NULL COMMENT '粉丝数',
|
||||
`total_liked` bigint DEFAULT NULL COMMENT '总获赞数',
|
||||
`user_rank` int DEFAULT NULL COMMENT '用户等级',
|
||||
`is_official` int DEFAULT NULL COMMENT '是否官号',
|
||||
PRIMARY KEY (`id`),
|
||||
KEY `idx_bilibili_vi_user_123456` (`user_id`)
|
||||
CREATE TABLE `bilibili_up_info`
|
||||
(
|
||||
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
|
||||
`user_id` varchar(64) DEFAULT NULL COMMENT '用户ID',
|
||||
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
|
||||
`avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址',
|
||||
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
|
||||
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
|
||||
`total_fans` bigint DEFAULT NULL COMMENT '粉丝数',
|
||||
`total_liked` bigint DEFAULT NULL COMMENT '总获赞数',
|
||||
`user_rank` int DEFAULT NULL COMMENT '用户等级',
|
||||
`is_official` int DEFAULT NULL COMMENT '是否官号',
|
||||
PRIMARY KEY (`id`),
|
||||
KEY `idx_bilibili_vi_user_123456` (`user_id`)
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='B 站UP主信息';
|
||||
|
||||
-- ----------------------------
|
||||
-- Table structure for douyin_aweme
|
||||
-- ----------------------------
|
||||
DROP TABLE IF EXISTS `douyin_aweme`;
|
||||
CREATE TABLE `douyin_aweme` (
|
||||
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
|
||||
`user_id` varchar(64) DEFAULT NULL COMMENT '用户ID',
|
||||
`sec_uid` varchar(128) DEFAULT NULL COMMENT '用户sec_uid',
|
||||
`short_user_id` varchar(64) DEFAULT NULL COMMENT '用户短ID',
|
||||
`user_unique_id` varchar(64) DEFAULT NULL COMMENT '用户唯一ID',
|
||||
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
|
||||
`avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址',
|
||||
`user_signature` varchar(500) DEFAULT NULL COMMENT '用户签名',
|
||||
`ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址',
|
||||
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
|
||||
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
|
||||
`aweme_id` varchar(64) NOT NULL COMMENT '视频ID',
|
||||
`aweme_type` varchar(16) NOT NULL COMMENT '视频类型',
|
||||
`title` varchar(500) DEFAULT NULL COMMENT '视频标题',
|
||||
`desc` longtext COMMENT '视频描述',
|
||||
`create_time` bigint NOT NULL COMMENT '视频发布时间戳',
|
||||
`liked_count` varchar(16) DEFAULT NULL COMMENT '视频点赞数',
|
||||
`comment_count` varchar(16) DEFAULT NULL COMMENT '视频评论数',
|
||||
`share_count` varchar(16) DEFAULT NULL COMMENT '视频分享数',
|
||||
`collected_count` varchar(16) DEFAULT NULL COMMENT '视频收藏数',
|
||||
`aweme_url` varchar(255) DEFAULT NULL COMMENT '视频详情页URL',
|
||||
PRIMARY KEY (`id`),
|
||||
KEY `idx_douyin_awem_aweme_i_6f7bc6` (`aweme_id`),
|
||||
KEY `idx_douyin_awem_create__299dfe` (`create_time`)
|
||||
CREATE TABLE `douyin_aweme`
|
||||
(
|
||||
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
|
||||
`user_id` varchar(64) DEFAULT NULL COMMENT '用户ID',
|
||||
`sec_uid` varchar(128) DEFAULT NULL COMMENT '用户sec_uid',
|
||||
`short_user_id` varchar(64) DEFAULT NULL COMMENT '用户短ID',
|
||||
`user_unique_id` varchar(64) DEFAULT NULL COMMENT '用户唯一ID',
|
||||
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
|
||||
`avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址',
|
||||
`user_signature` varchar(500) DEFAULT NULL COMMENT '用户签名',
|
||||
`ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址',
|
||||
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
|
||||
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
|
||||
`aweme_id` varchar(64) NOT NULL COMMENT '视频ID',
|
||||
`aweme_type` varchar(16) NOT NULL COMMENT '视频类型',
|
||||
`title` varchar(500) DEFAULT NULL COMMENT '视频标题',
|
||||
`desc` longtext COMMENT '视频描述',
|
||||
`create_time` bigint NOT NULL COMMENT '视频发布时间戳',
|
||||
`liked_count` varchar(16) DEFAULT NULL COMMENT '视频点赞数',
|
||||
`comment_count` varchar(16) DEFAULT NULL COMMENT '视频评论数',
|
||||
`share_count` varchar(16) DEFAULT NULL COMMENT '视频分享数',
|
||||
`collected_count` varchar(16) DEFAULT NULL COMMENT '视频收藏数',
|
||||
`aweme_url` varchar(255) DEFAULT NULL COMMENT '视频详情页URL',
|
||||
PRIMARY KEY (`id`),
|
||||
KEY `idx_douyin_awem_aweme_i_6f7bc6` (`aweme_id`),
|
||||
KEY `idx_douyin_awem_create__299dfe` (`create_time`)
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='抖音视频';
|
||||
|
||||
-- ----------------------------
|
||||
-- Table structure for douyin_aweme_comment
|
||||
-- ----------------------------
|
||||
DROP TABLE IF EXISTS `douyin_aweme_comment`;
|
||||
CREATE TABLE `douyin_aweme_comment` (
|
||||
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
|
||||
`user_id` varchar(64) DEFAULT NULL COMMENT '用户ID',
|
||||
`sec_uid` varchar(128) DEFAULT NULL COMMENT '用户sec_uid',
|
||||
`short_user_id` varchar(64) DEFAULT NULL COMMENT '用户短ID',
|
||||
`user_unique_id` varchar(64) DEFAULT NULL COMMENT '用户唯一ID',
|
||||
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
|
||||
`avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址',
|
||||
`user_signature` varchar(500) DEFAULT NULL COMMENT '用户签名',
|
||||
`ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址',
|
||||
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
|
||||
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
|
||||
`comment_id` varchar(64) NOT NULL COMMENT '评论ID',
|
||||
`aweme_id` varchar(64) NOT NULL COMMENT '视频ID',
|
||||
`content` longtext COMMENT '评论内容',
|
||||
`create_time` bigint NOT NULL COMMENT '评论时间戳',
|
||||
`sub_comment_count` varchar(16) NOT NULL COMMENT '评论回复数',
|
||||
PRIMARY KEY (`id`),
|
||||
KEY `idx_douyin_awem_comment_fcd7e4` (`comment_id`),
|
||||
KEY `idx_douyin_awem_aweme_i_c50049` (`aweme_id`)
|
||||
CREATE TABLE `douyin_aweme_comment`
|
||||
(
|
||||
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
|
||||
`user_id` varchar(64) DEFAULT NULL COMMENT '用户ID',
|
||||
`sec_uid` varchar(128) DEFAULT NULL COMMENT '用户sec_uid',
|
||||
`short_user_id` varchar(64) DEFAULT NULL COMMENT '用户短ID',
|
||||
`user_unique_id` varchar(64) DEFAULT NULL COMMENT '用户唯一ID',
|
||||
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
|
||||
`avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址',
|
||||
`user_signature` varchar(500) DEFAULT NULL COMMENT '用户签名',
|
||||
`ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址',
|
||||
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
|
||||
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
|
||||
`comment_id` varchar(64) NOT NULL COMMENT '评论ID',
|
||||
`aweme_id` varchar(64) NOT NULL COMMENT '视频ID',
|
||||
`content` longtext COMMENT '评论内容',
|
||||
`create_time` bigint NOT NULL COMMENT '评论时间戳',
|
||||
`sub_comment_count` varchar(16) NOT NULL COMMENT '评论回复数',
|
||||
PRIMARY KEY (`id`),
|
||||
KEY `idx_douyin_awem_comment_fcd7e4` (`comment_id`),
|
||||
KEY `idx_douyin_awem_aweme_i_c50049` (`aweme_id`)
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='抖音视频评论';
|
||||
|
||||
-- ----------------------------
|
||||
-- Table structure for dy_creator
|
||||
-- ----------------------------
|
||||
DROP TABLE IF EXISTS `dy_creator`;
|
||||
CREATE TABLE `dy_creator` (
|
||||
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
|
||||
`user_id` varchar(128) NOT NULL COMMENT '用户ID',
|
||||
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
|
||||
`avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址',
|
||||
`ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址',
|
||||
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
|
||||
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
|
||||
`desc` longtext COMMENT '用户描述',
|
||||
`gender` varchar(1) DEFAULT NULL COMMENT '性别',
|
||||
`follows` varchar(16) DEFAULT NULL COMMENT '关注数',
|
||||
`fans` varchar(16) DEFAULT NULL COMMENT '粉丝数',
|
||||
`interaction` varchar(16) DEFAULT NULL COMMENT '获赞数',
|
||||
`videos_count` varchar(16) DEFAULT NULL COMMENT '作品数',
|
||||
PRIMARY KEY (`id`)
|
||||
CREATE TABLE `dy_creator`
|
||||
(
|
||||
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
|
||||
`user_id` varchar(128) NOT NULL COMMENT '用户ID',
|
||||
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
|
||||
`avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址',
|
||||
`ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址',
|
||||
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
|
||||
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
|
||||
`desc` longtext COMMENT '用户描述',
|
||||
`gender` varchar(1) DEFAULT NULL COMMENT '性别',
|
||||
`follows` varchar(16) DEFAULT NULL COMMENT '关注数',
|
||||
`fans` varchar(16) DEFAULT NULL COMMENT '粉丝数',
|
||||
`interaction` varchar(16) DEFAULT NULL COMMENT '获赞数',
|
||||
`videos_count` varchar(16) DEFAULT NULL COMMENT '作品数',
|
||||
PRIMARY KEY (`id`)
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='抖音博主信息';
|
||||
|
||||
-- ----------------------------
|
||||
-- Table structure for kuaishou_video
|
||||
-- ----------------------------
|
||||
DROP TABLE IF EXISTS `kuaishou_video`;
|
||||
CREATE TABLE `kuaishou_video` (
|
||||
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
|
||||
`user_id` varchar(64) DEFAULT NULL COMMENT '用户ID',
|
||||
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
|
||||
`avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址',
|
||||
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
|
||||
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
|
||||
`video_id` varchar(64) NOT NULL COMMENT '视频ID',
|
||||
`video_type` varchar(16) NOT NULL COMMENT '视频类型',
|
||||
`title` varchar(500) DEFAULT NULL COMMENT '视频标题',
|
||||
`desc` longtext COMMENT '视频描述',
|
||||
`create_time` bigint NOT NULL COMMENT '视频发布时间戳',
|
||||
`liked_count` varchar(16) DEFAULT NULL COMMENT '视频点赞数',
|
||||
`viewd_count` varchar(16) DEFAULT NULL COMMENT '视频浏览数量',
|
||||
`video_url` varchar(512) DEFAULT NULL COMMENT '视频详情URL',
|
||||
`video_cover_url` varchar(512) DEFAULT NULL COMMENT '视频封面图 URL',
|
||||
`video_play_url` varchar(512) DEFAULT NULL COMMENT '视频播放 URL',
|
||||
PRIMARY KEY (`id`),
|
||||
KEY `idx_kuaishou_vi_video_i_c5c6a6` (`video_id`),
|
||||
KEY `idx_kuaishou_vi_create__a10dee` (`create_time`)
|
||||
CREATE TABLE `kuaishou_video`
|
||||
(
|
||||
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
|
||||
`user_id` varchar(64) DEFAULT NULL COMMENT '用户ID',
|
||||
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
|
||||
`avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址',
|
||||
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
|
||||
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
|
||||
`video_id` varchar(64) NOT NULL COMMENT '视频ID',
|
||||
`video_type` varchar(16) NOT NULL COMMENT '视频类型',
|
||||
`title` varchar(500) DEFAULT NULL COMMENT '视频标题',
|
||||
`desc` longtext COMMENT '视频描述',
|
||||
`create_time` bigint NOT NULL COMMENT '视频发布时间戳',
|
||||
`liked_count` varchar(16) DEFAULT NULL COMMENT '视频点赞数',
|
||||
`viewd_count` varchar(16) DEFAULT NULL COMMENT '视频浏览数量',
|
||||
`video_url` varchar(512) DEFAULT NULL COMMENT '视频详情URL',
|
||||
`video_cover_url` varchar(512) DEFAULT NULL COMMENT '视频封面图 URL',
|
||||
`video_play_url` varchar(512) DEFAULT NULL COMMENT '视频播放 URL',
|
||||
PRIMARY KEY (`id`),
|
||||
KEY `idx_kuaishou_vi_video_i_c5c6a6` (`video_id`),
|
||||
KEY `idx_kuaishou_vi_create__a10dee` (`create_time`)
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='快手视频';
|
||||
|
||||
-- ----------------------------
|
||||
-- Table structure for kuaishou_video_comment
|
||||
-- ----------------------------
|
||||
DROP TABLE IF EXISTS `kuaishou_video_comment`;
|
||||
CREATE TABLE `kuaishou_video_comment` (
|
||||
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
|
||||
`user_id` varchar(64) DEFAULT NULL COMMENT '用户ID',
|
||||
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
|
||||
`avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址',
|
||||
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
|
||||
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
|
||||
`comment_id` varchar(64) NOT NULL COMMENT '评论ID',
|
||||
`video_id` varchar(64) NOT NULL COMMENT '视频ID',
|
||||
`content` longtext COMMENT '评论内容',
|
||||
`create_time` bigint NOT NULL COMMENT '评论时间戳',
|
||||
`sub_comment_count` varchar(16) NOT NULL COMMENT '评论回复数',
|
||||
PRIMARY KEY (`id`),
|
||||
KEY `idx_kuaishou_vi_comment_ed48fa` (`comment_id`),
|
||||
KEY `idx_kuaishou_vi_video_i_e50914` (`video_id`)
|
||||
CREATE TABLE `kuaishou_video_comment`
|
||||
(
|
||||
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
|
||||
`user_id` varchar(64) DEFAULT NULL COMMENT '用户ID',
|
||||
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
|
||||
`avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址',
|
||||
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
|
||||
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
|
||||
`comment_id` varchar(64) NOT NULL COMMENT '评论ID',
|
||||
`video_id` varchar(64) NOT NULL COMMENT '视频ID',
|
||||
`content` longtext COMMENT '评论内容',
|
||||
`create_time` bigint NOT NULL COMMENT '评论时间戳',
|
||||
`sub_comment_count` varchar(16) NOT NULL COMMENT '评论回复数',
|
||||
PRIMARY KEY (`id`),
|
||||
KEY `idx_kuaishou_vi_comment_ed48fa` (`comment_id`),
|
||||
KEY `idx_kuaishou_vi_video_i_e50914` (`video_id`)
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='快手视频评论';
|
||||
|
||||
|
||||
|
@ -195,145 +203,198 @@ CREATE TABLE `kuaishou_video_comment` (
|
|||
-- ----------------------------
-- Table structure for weibo_note
-- ----------------------------
-- NOTE(review): duplicated CREATE TABLE from the diff view collapsed to one.
-- Fixed: ip_location used DEFAULT '发布微博的地理信息' — the Chinese description
-- was being used as the column DEFAULT value instead of its COMMENT.
DROP TABLE IF EXISTS `weibo_note`;
CREATE TABLE `weibo_note`
(
    `id`               int          NOT NULL AUTO_INCREMENT COMMENT '自增ID',
    `user_id`          varchar(64)  DEFAULT NULL COMMENT '用户ID',
    `nickname`         varchar(64)  DEFAULT NULL COMMENT '用户昵称',
    `avatar`           varchar(255) DEFAULT NULL COMMENT '用户头像地址',
    `gender`           varchar(12)  DEFAULT NULL COMMENT '用户性别',
    `profile_url`      varchar(255) DEFAULT NULL COMMENT '用户主页地址',
    `ip_location`      varchar(32)  DEFAULT NULL COMMENT '发布微博的地理信息',
    `add_ts`           bigint       NOT NULL COMMENT '记录添加时间戳',
    `last_modify_ts`   bigint       NOT NULL COMMENT '记录最后修改时间戳',
    `note_id`          varchar(64)  NOT NULL COMMENT '帖子ID',
    `content`          longtext COMMENT '帖子正文内容',
    `create_time`      bigint       NOT NULL COMMENT '帖子发布时间戳',
    `create_date_time` varchar(32)  NOT NULL COMMENT '帖子发布日期时间',
    `liked_count`      varchar(16)  DEFAULT NULL COMMENT '帖子点赞数',
    `comments_count`   varchar(16)  DEFAULT NULL COMMENT '帖子评论数量',
    `shared_count`     varchar(16)  DEFAULT NULL COMMENT '帖子转发数量',
    `note_url`         varchar(512) DEFAULT NULL COMMENT '帖子详情URL',
    PRIMARY KEY (`id`),
    KEY `idx_weibo_note_note_id_f95b1a` (`note_id`),
    KEY `idx_weibo_note_create__692709` (`create_time`),
    KEY `idx_weibo_note_create__d05ed2` (`create_date_time`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='微博帖子';
|
||||
|
||||
-- ----------------------------
-- Table structure for weibo_note_comment
-- ----------------------------
-- NOTE(review): duplicated CREATE TABLE collapsed to one; ip_location's
-- description moved from DEFAULT (wrong) to COMMENT, matching sibling tables.
DROP TABLE IF EXISTS `weibo_note_comment`;
CREATE TABLE `weibo_note_comment`
(
    `id`                 int          NOT NULL AUTO_INCREMENT COMMENT '自增ID',
    `user_id`            varchar(64)  DEFAULT NULL COMMENT '用户ID',
    `nickname`           varchar(64)  DEFAULT NULL COMMENT '用户昵称',
    `avatar`             varchar(255) DEFAULT NULL COMMENT '用户头像地址',
    `gender`             varchar(12)  DEFAULT NULL COMMENT '用户性别',
    `profile_url`        varchar(255) DEFAULT NULL COMMENT '用户主页地址',
    `ip_location`        varchar(32)  DEFAULT NULL COMMENT '发布微博的地理信息',
    `add_ts`             bigint       NOT NULL COMMENT '记录添加时间戳',
    `last_modify_ts`     bigint       NOT NULL COMMENT '记录最后修改时间戳',
    `comment_id`         varchar(64)  NOT NULL COMMENT '评论ID',
    `note_id`            varchar(64)  NOT NULL COMMENT '帖子ID',
    `content`            longtext COMMENT '评论内容',
    `create_time`        bigint       NOT NULL COMMENT '评论时间戳',
    `create_date_time`   varchar(32)  NOT NULL COMMENT '评论日期时间',
    `comment_like_count` varchar(16)  NOT NULL COMMENT '评论点赞数量',
    `sub_comment_count`  varchar(16)  NOT NULL COMMENT '评论回复数',
    PRIMARY KEY (`id`),
    KEY `idx_weibo_note__comment_c7611c` (`comment_id`),
    KEY `idx_weibo_note__note_id_24f108` (`note_id`),
    KEY `idx_weibo_note__create__667fe3` (`create_date_time`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='微博帖子评论';
|
||||
|
||||
-- ----------------------------
-- Table structure for xhs_creator
-- ----------------------------
-- NOTE(review): duplicated CREATE TABLE from the diff view collapsed to one.
DROP TABLE IF EXISTS `xhs_creator`;
CREATE TABLE `xhs_creator`
(
    `id`             int          NOT NULL AUTO_INCREMENT COMMENT '自增ID',
    `user_id`        varchar(64)  NOT NULL COMMENT '用户ID',
    `nickname`       varchar(64)  DEFAULT NULL COMMENT '用户昵称',
    `avatar`         varchar(255) DEFAULT NULL COMMENT '用户头像地址',
    `ip_location`    varchar(255) DEFAULT NULL COMMENT '评论时的IP地址',
    `add_ts`         bigint       NOT NULL COMMENT '记录添加时间戳',
    `last_modify_ts` bigint       NOT NULL COMMENT '记录最后修改时间戳',
    `desc`           longtext COMMENT '用户描述',
    `gender`         varchar(1)   DEFAULT NULL COMMENT '性别',
    `follows`        varchar(16)  DEFAULT NULL COMMENT '关注数',
    `fans`           varchar(16)  DEFAULT NULL COMMENT '粉丝数',
    `interaction`    varchar(16)  DEFAULT NULL COMMENT '获赞和收藏数',
    `tag_list`       longtext COMMENT '标签列表',
    PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='小红书博主';
|
||||
|
||||
-- ----------------------------
-- Table structure for xhs_note
-- ----------------------------
-- NOTE(review): duplicated CREATE TABLE from the diff view collapsed to one.
DROP TABLE IF EXISTS `xhs_note`;
CREATE TABLE `xhs_note`
(
    `id`               int          NOT NULL AUTO_INCREMENT COMMENT '自增ID',
    `user_id`          varchar(64)  NOT NULL COMMENT '用户ID',
    `nickname`         varchar(64)  DEFAULT NULL COMMENT '用户昵称',
    `avatar`           varchar(255) DEFAULT NULL COMMENT '用户头像地址',
    `ip_location`      varchar(255) DEFAULT NULL COMMENT '评论时的IP地址',
    `add_ts`           bigint       NOT NULL COMMENT '记录添加时间戳',
    `last_modify_ts`   bigint       NOT NULL COMMENT '记录最后修改时间戳',
    `note_id`          varchar(64)  NOT NULL COMMENT '笔记ID',
    `type`             varchar(16)  DEFAULT NULL COMMENT '笔记类型(normal | video)',
    `title`            varchar(255) DEFAULT NULL COMMENT '笔记标题',
    `desc`             longtext COMMENT '笔记描述',
    `video_url`        longtext COMMENT '视频地址',
    `time`             bigint       NOT NULL COMMENT '笔记发布时间戳',
    `last_update_time` bigint       NOT NULL COMMENT '笔记最后更新时间戳',
    `liked_count`      varchar(16)  DEFAULT NULL COMMENT '笔记点赞数',
    `collected_count`  varchar(16)  DEFAULT NULL COMMENT '笔记收藏数',
    `comment_count`    varchar(16)  DEFAULT NULL COMMENT '笔记评论数',
    `share_count`      varchar(16)  DEFAULT NULL COMMENT '笔记分享数',
    `image_list`       longtext COMMENT '笔记封面图片列表',
    `tag_list`         longtext COMMENT '标签列表',
    `note_url`         varchar(255) DEFAULT NULL COMMENT '笔记详情页的URL',
    PRIMARY KEY (`id`),
    KEY `idx_xhs_note_note_id_209457` (`note_id`),
    KEY `idx_xhs_note_time_eaa910` (`time`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='小红书笔记';
|
||||
|
||||
-- ----------------------------
-- Table structure for xhs_note_comment
-- ----------------------------
-- NOTE(review): duplicated CREATE TABLE from the diff view collapsed to one.
DROP TABLE IF EXISTS `xhs_note_comment`;
CREATE TABLE `xhs_note_comment`
(
    `id`                int          NOT NULL AUTO_INCREMENT COMMENT '自增ID',
    `user_id`           varchar(64)  NOT NULL COMMENT '用户ID',
    `nickname`          varchar(64)  DEFAULT NULL COMMENT '用户昵称',
    `avatar`            varchar(255) DEFAULT NULL COMMENT '用户头像地址',
    `ip_location`       varchar(255) DEFAULT NULL COMMENT '评论时的IP地址',
    `add_ts`            bigint       NOT NULL COMMENT '记录添加时间戳',
    `last_modify_ts`    bigint       NOT NULL COMMENT '记录最后修改时间戳',
    `comment_id`        varchar(64)  NOT NULL COMMENT '评论ID',
    `create_time`       bigint       NOT NULL COMMENT '评论时间戳',
    `note_id`           varchar(64)  NOT NULL COMMENT '笔记ID',
    `content`           longtext     NOT NULL COMMENT '评论内容',
    `sub_comment_count` int          NOT NULL COMMENT '子评论数量',
    `pictures`          varchar(512) DEFAULT NULL,
    PRIMARY KEY (`id`),
    KEY `idx_xhs_note_co_comment_8e8349` (`comment_id`),
    KEY `idx_xhs_note_co_create__204f8d` (`create_time`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='小红书笔记评论';
|
||||
|
||||
-- ----------------------------
-- alter comment tables to support parent_comment_id
-- ----------------------------
-- NOTE(review): each ADD COLUMN line appeared twice in the diff view, which
-- would be a syntax error if executed; collapsed to one per table.
ALTER TABLE `xhs_note_comment`
    ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';

ALTER TABLE `douyin_aweme_comment`
    ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';

ALTER TABLE `bilibili_video_comment`
    ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';

ALTER TABLE `weibo_note_comment`
    ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';

SET FOREIGN_KEY_CHECKS = 1;
|
||||
|
||||
-- ----------------------------
-- Table structure for tieba_note
-- ----------------------------
DROP TABLE IF EXISTS `tieba_note`;
CREATE TABLE tieba_note
(
    id                BIGINT AUTO_INCREMENT PRIMARY KEY,
    -- was VARCHAR(644): almost certainly a typo for 64 (all other *_id columns use 64)
    note_id           VARCHAR(64)  NOT NULL COMMENT '帖子ID',
    title             VARCHAR(255) NOT NULL COMMENT '帖子标题',
    `desc`            TEXT COMMENT '帖子描述',
    note_url          VARCHAR(255) NOT NULL COMMENT '帖子链接',
    publish_time      VARCHAR(255) NOT NULL COMMENT '发布时间',
    user_link         VARCHAR(255) DEFAULT '' COMMENT '用户主页链接',
    user_nickname     VARCHAR(255) DEFAULT '' COMMENT '用户昵称',
    user_avatar       VARCHAR(255) DEFAULT '' COMMENT '用户头像地址',
    tieba_id          VARCHAR(255) DEFAULT '' COMMENT '贴吧ID',
    tieba_name        VARCHAR(255) NOT NULL COMMENT '贴吧名称',
    tieba_link        VARCHAR(255) NOT NULL COMMENT '贴吧链接',
    total_replay_num  INT          DEFAULT 0 COMMENT '帖子回复总数',
    total_replay_page INT          DEFAULT 0 COMMENT '帖子回复总页数',
    ip_location       VARCHAR(255) DEFAULT '' COMMENT 'IP地理位置',
    add_ts            BIGINT       NOT NULL COMMENT '添加时间戳',
    last_modify_ts    BIGINT       NOT NULL COMMENT '最后修改时间戳',
    KEY `idx_tieba_note_note_id` (`note_id`),
    KEY `idx_tieba_note_publish_time` (`publish_time`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='贴吧帖子表';
|
||||
|
||||
-- ----------------------------
-- Table structure for tieba_comment
-- ----------------------------
DROP TABLE IF EXISTS `tieba_comment`;
CREATE TABLE tieba_comment
(
    id                BIGINT AUTO_INCREMENT PRIMARY KEY,
    comment_id        VARCHAR(255) NOT NULL COMMENT '评论ID',
    parent_comment_id VARCHAR(255) DEFAULT '' COMMENT '父评论ID',
    content           TEXT         NOT NULL COMMENT '评论内容',
    user_link         VARCHAR(255) DEFAULT '' COMMENT '用户主页链接',
    user_nickname     VARCHAR(255) DEFAULT '' COMMENT '用户昵称',
    user_avatar       VARCHAR(255) DEFAULT '' COMMENT '用户头像地址',
    tieba_id          VARCHAR(255) DEFAULT '' COMMENT '贴吧ID',
    tieba_name        VARCHAR(255) NOT NULL COMMENT '贴吧名称',
    tieba_link        VARCHAR(255) NOT NULL COMMENT '贴吧链接',
    publish_time      VARCHAR(255) DEFAULT '' COMMENT '发布时间',
    ip_location       VARCHAR(255) DEFAULT '' COMMENT 'IP地理位置',
    sub_comment_count INT          DEFAULT 0 COMMENT '子评论数',
    note_id           VARCHAR(255) NOT NULL COMMENT '帖子ID',
    note_url          VARCHAR(255) NOT NULL COMMENT '帖子链接',
    add_ts            BIGINT       NOT NULL COMMENT '添加时间戳',
    last_modify_ts    BIGINT       NOT NULL COMMENT '最后修改时间戳',
    -- was (`note_id`): copy-paste bug that duplicated the next index and left
    -- comment_id lookups (used by the DB store upsert) unindexed
    KEY `idx_tieba_comment_comment_id` (`comment_id`),
    KEY `idx_tieba_comment_note_id` (`note_id`),
    KEY `idx_tieba_comment_publish_time` (`publish_time`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='贴吧评论表';
|
|
@ -0,0 +1,71 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
from typing import List
|
||||
|
||||
from model.m_baidu_tieba import TiebaComment, TiebaNote
|
||||
|
||||
from . import tieba_store_impl
|
||||
from .tieba_store_impl import *
|
||||
|
||||
|
||||
class TieBaStoreFactory:
    """Factory that maps config.SAVE_DATA_OPTION to a tieba store backend."""

    STORES = {
        "csv": TieBaCsvStoreImplement,
        "db": TieBaDbStoreImplement,
        "json": TieBaJsonStoreImplement
    }

    @staticmethod
    def create_store() -> AbstractStore:
        """Instantiate the store implementation selected in the config.

        Raises:
            ValueError: if config.SAVE_DATA_OPTION is not csv/db/json.
        """
        selected = TieBaStoreFactory.STORES.get(config.SAVE_DATA_OPTION)
        if selected is None:
            raise ValueError(
                "[TieBaStoreFactory.create_store] Invalid save option only supported csv or db or json ...")
        return selected()
|
||||
|
||||
|
||||
async def update_tieba_note(note_item: TiebaNote):
    """Add or update a single tieba note via the configured store backend.

    Args:
        note_item: note model to persist.
    """
    payload = note_item.model_dump()
    # Stamp the record with the current time so repeated runs refresh it.
    payload["last_modify_ts"] = utils.get_current_timestamp()
    utils.logger.info(f"[store.tieba.update_tieba_note] tieba note: {payload}")
    store = TieBaStoreFactory.create_store()
    await store.store_content(payload)
|
||||
|
||||
|
||||
async def batch_update_tieba_note_comments(note_id: str, comments: List[TiebaComment]):
    """Persist each comment of a note, one at a time.

    Args:
        note_id: id of the note the comments belong to.
        comments: comments to persist; an empty/None list is a no-op.
    """
    for item in comments or []:
        await update_tieba_note_comment(note_id, item)
|
||||
|
||||
|
||||
async def update_tieba_note_comment(note_id: str, comment_item: TiebaComment):
    """Add or update one tieba note comment via the configured store backend.

    Args:
        note_id: id of the note the comment belongs to (used for logging only).
        comment_item: comment model to persist.
    """
    payload = comment_item.model_dump()
    # Refresh the modification timestamp on every write.
    payload["last_modify_ts"] = utils.get_current_timestamp()
    utils.logger.info(
        f"[store.tieba.update_tieba_note_comment] tieba note id: {note_id} comment:{payload}")
    store = TieBaStoreFactory.create_store()
    await store.store_comment(payload)
|
|
@ -0,0 +1,244 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
import asyncio
|
||||
import csv
|
||||
import json
|
||||
import os
|
||||
import pathlib
|
||||
from typing import Dict
|
||||
|
||||
import aiofiles
|
||||
|
||||
import config
|
||||
from base.base_crawler import AbstractStore
|
||||
from tools import utils, words
|
||||
from var import crawler_type_var
|
||||
|
||||
|
||||
def calculate_number_of_files(file_store_path: str) -> int:
    """Return the next numeric file prefix for the given data directory.

    Existing files are named ``<n>_...``; the result is max(n) + 1, so each
    crawler run writes to a fresh file instead of appending to an old one.

    Args:
        file_store_path: directory whose file names are scanned.

    Returns:
        The next prefix number; 1 when the directory is missing or no file
        name starts with an integer prefix.
    """
    if not os.path.exists(file_store_path):
        return 1
    try:
        prefixes = (int(name.split("_")[0]) for name in os.listdir(file_store_path))
        return max(prefixes) + 1
    except ValueError:
        # Raised both by int() on a non-numeric prefix and by max() on an
        # empty directory — either way, start from 1.
        return 1
|
||||
|
||||
|
||||
class TieBaCsvStoreImplement(AbstractStore):
    """CSV-file storage backend for tieba notes and comments."""

    # Target directory for CSV output.
    csv_store_path: str = "data/tieba"
    # Numeric prefix chosen once at import time so each run gets fresh files.
    file_count:int=calculate_number_of_files(csv_store_path)

    def make_save_file_name(self, store_type: str) -> str:
        """
        make save file name by store type
        Args:
            store_type: contents or comments

        Returns: eg: data/tieba/search_comments_20240114.csv ...

        """
        return f"{self.csv_store_path}/{self.file_count}_{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.csv"

    async def save_data_to_csv(self, save_item: Dict, store_type: str):
        """
        Below is a simple way to save it in CSV format.
        Args:
            save_item: save content dict info
            store_type: Save type contains content and comments(contents | comments)

        Returns: no returns

        """
        pathlib.Path(self.csv_store_path).mkdir(parents=True, exist_ok=True)
        save_file_name = self.make_save_file_name(store_type=store_type)
        # utf-8-sig adds a BOM so Excel opens the CSV with correct encoding.
        async with aiofiles.open(save_file_name, mode='a+', encoding="utf-8-sig", newline="") as f:
            # NOTE(review): this bare fileno() call has no visible effect here;
            # presumably forces the async file wrapper to open eagerly — confirm.
            f.fileno()
            writer = csv.writer(f)
            # tell() == 0 means a brand-new file: emit the header row first.
            if await f.tell() == 0:
                await writer.writerow(save_item.keys())
            await writer.writerow(save_item.values())

    async def store_content(self, content_item: Dict):
        """
        TieBa content CSV storage implementation
        Args:
            content_item: note item dict

        Returns:

        """
        await self.save_data_to_csv(save_item=content_item, store_type="contents")

    async def store_comment(self, comment_item: Dict):
        """
        TieBa comment CSV storage implementation
        Args:
            comment_item: comment item dict

        Returns:

        """
        await self.save_data_to_csv(save_item=comment_item, store_type="comments")

    async def store_creator(self, creator: Dict):
        """
        TieBa creator CSV storage implementation
        Args:
            creator: creator dict

        Returns:

        """
        await self.save_data_to_csv(save_item=creator, store_type="creator")
|
||||
|
||||
|
||||
class TieBaDbStoreImplement(AbstractStore):
    """MySQL storage backend for tieba data: upserts keyed on the item id."""

    async def store_content(self, content_item: Dict):
        """
        TieBa content DB storage implementation: insert when the note_id is
        new, otherwise update the existing row.
        Args:
            content_item: content item dict

        Returns:

        """
        # Imported locally, presumably to avoid a circular import at module
        # load time — TODO confirm.
        from .tieba_store_sql import (add_new_content,
                                      query_content_by_content_id,
                                      update_content_by_content_id)
        note_id = content_item.get("note_id")
        note_detail: Dict = await query_content_by_content_id(content_id=note_id)
        if not note_detail:
            # First sighting: record the insertion timestamp.
            content_item["add_ts"] = utils.get_current_timestamp()
            await add_new_content(content_item)
        else:
            await update_content_by_content_id(note_id, content_item=content_item)

    async def store_comment(self, comment_item: Dict):
        """
        TieBa comment DB storage implementation: insert-or-update keyed on
        comment_id.
        Args:
            comment_item: comment item dict

        Returns:

        """
        from .tieba_store_sql import (add_new_comment,
                                      query_comment_by_comment_id,
                                      update_comment_by_comment_id)
        comment_id = comment_item.get("comment_id")
        comment_detail: Dict = await query_comment_by_comment_id(comment_id=comment_id)
        if not comment_detail:
            comment_item["add_ts"] = utils.get_current_timestamp()
            await add_new_comment(comment_item)
        else:
            await update_comment_by_comment_id(comment_id, comment_item=comment_item)

    async def store_creator(self, creator: Dict):
        """
        TieBa creator DB storage implementation: insert-or-update keyed on
        user_id.
        Args:
            creator: creator dict

        Returns:

        """
        from .tieba_store_sql import (add_new_creator,
                                      query_creator_by_user_id,
                                      update_creator_by_user_id)
        user_id = creator.get("user_id")
        user_detail: Dict = await query_creator_by_user_id(user_id)
        if not user_detail:
            creator["add_ts"] = utils.get_current_timestamp()
            await add_new_creator(creator)
        else:
            await update_creator_by_user_id(user_id, creator)
|
||||
|
||||
|
||||
class TieBaJsonStoreImplement(AbstractStore):
    """JSON-file storage backend for tieba data, with optional word clouds."""

    json_store_path: str = "data/tieba/json"
    words_store_path: str = "data/tieba/words"
    # Serializes the read-modify-write cycle on the shared JSON file.
    lock = asyncio.Lock()
    file_count:int=calculate_number_of_files(json_store_path)
    WordCloud = words.AsyncWordCloudGenerator()

    def make_save_file_name(self, store_type: str) -> (str,str):
        """
        make save file name by store type
        Args:
            store_type: Save type contains content and comments(contents | comments)

        Returns:
            (json file path, word-cloud output file name prefix)

        """
        return (
            f"{self.json_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json",
            f"{self.words_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}"
        )

    async def save_data_to_json(self, save_item: Dict, store_type: str):
        """
        Append one item to the JSON file for this store type, then optionally
        regenerate the word cloud from the accumulated data.
        Args:
            save_item: save content dict info
            store_type: Save type contains content and comments(contents | comments)

        Returns:

        """
        pathlib.Path(self.json_store_path).mkdir(parents=True, exist_ok=True)
        pathlib.Path(self.words_store_path).mkdir(parents=True, exist_ok=True)
        save_file_name, words_file_name_prefix = self.make_save_file_name(store_type=store_type)
        save_data = []

        async with self.lock:
            # Load the existing array (if any), append, and rewrite the file.
            if os.path.exists(save_file_name):
                async with aiofiles.open(save_file_name, 'r', encoding='utf-8') as file:
                    save_data = json.loads(await file.read())

            save_data.append(save_item)
            async with aiofiles.open(save_file_name, 'w', encoding='utf-8') as file:
                await file.write(json.dumps(save_data, ensure_ascii=False))

            if config.ENABLE_GET_COMMENTS and config.ENABLE_GET_WORDCLOUD:
                # Word-cloud generation is best-effort and must never break
                # storage. Was a bare `except: pass`, which also swallowed
                # KeyboardInterrupt/SystemExit and hid real bugs — narrowed
                # to Exception and logged.
                try:
                    await self.WordCloud.generate_word_frequency_and_cloud(save_data, words_file_name_prefix)
                except Exception as e:
                    utils.logger.warning(
                        f"[TieBaJsonStoreImplement.save_data_to_json] generate wordcloud failed: {e}")

    async def store_content(self, content_item: Dict):
        """
        content JSON storage implementation
        Args:
            content_item:

        Returns:

        """
        await self.save_data_to_json(content_item, "contents")

    async def store_comment(self, comment_item: Dict):
        """
        comment JSON storage implementation
        Args:
            comment_item:

        Returns:

        """
        await self.save_data_to_json(comment_item, "comments")

    async def store_creator(self, creator: Dict):
        """
        creator JSON storage implementation
        Args:
            creator: creator dict

        Returns:

        """
        await self.save_data_to_json(creator, "creator")
|
|
@ -0,0 +1,144 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
from typing import Dict, List
|
||||
|
||||
from db import AsyncMysqlDB
|
||||
from var import media_crawler_db_var
|
||||
|
||||
|
||||
async def query_content_by_content_id(content_id: str) -> Dict:
    """Query one tieba note row by note_id.

    Args:
        content_id: note_id value to look up.

    Returns:
        The first matching row as a dict, or an empty dict when none found.
    """
    async_db_conn: AsyncMysqlDB = media_crawler_db_var.get()
    # NOTE(review): string-built SQL; escape single quotes so a crawled id
    # cannot break out of the literal (injection / malformed query). The
    # proper fix is parameterized queries if AsyncMysqlDB supports them —
    # TODO confirm.
    safe_id = content_id.replace("'", "''")
    sql: str = f"select * from tieba_note where note_id = '{safe_id}'"
    rows: List[Dict] = await async_db_conn.query(sql)
    if len(rows) > 0:
        return rows[0]
    return dict()
|
||||
|
||||
|
||||
async def add_new_content(content_item: Dict) -> int:
    """Insert one tieba note row.

    Args:
        content_item: column -> value mapping for the new row.

    Returns:
        The auto-increment id of the inserted row.
    """
    db_conn: AsyncMysqlDB = media_crawler_db_var.get()
    return await db_conn.item_to_table("tieba_note", content_item)
|
||||
|
||||
|
||||
async def update_content_by_content_id(content_id: str, content_item: Dict) -> int:
    """Update the tieba note row whose note_id matches content_id.

    Args:
        content_id: note_id of the row to update.
        content_item: column -> value mapping with the new values.

    Returns:
        Number of rows affected.
    """
    db_conn: AsyncMysqlDB = media_crawler_db_var.get()
    return await db_conn.update_table("tieba_note", content_item, "note_id", content_id)
|
||||
|
||||
|
||||
|
||||
async def query_comment_by_comment_id(comment_id: str) -> Dict:
    """Query one tieba comment row by comment_id.

    Args:
        comment_id: comment_id value to look up.

    Returns:
        The first matching row as a dict, or an empty dict when none found.
    """
    async_db_conn: AsyncMysqlDB = media_crawler_db_var.get()
    # NOTE(review): string-built SQL; escape single quotes so a crawled id
    # cannot break out of the literal. Prefer parameterized queries if the
    # DB layer supports them — TODO confirm.
    safe_id = comment_id.replace("'", "''")
    sql: str = f"select * from tieba_comment where comment_id = '{safe_id}'"
    rows: List[Dict] = await async_db_conn.query(sql)
    if len(rows) > 0:
        return rows[0]
    return dict()
|
||||
|
||||
|
||||
async def add_new_comment(comment_item: Dict) -> int:
    """Insert one tieba comment row.

    Args:
        comment_item: column -> value mapping for the new row.

    Returns:
        The auto-increment id of the inserted row.
    """
    db_conn: AsyncMysqlDB = media_crawler_db_var.get()
    return await db_conn.item_to_table("tieba_comment", comment_item)
|
||||
|
||||
|
||||
async def update_comment_by_comment_id(comment_id: str, comment_item: Dict) -> int:
    """Update the tieba comment row whose comment_id matches.

    Args:
        comment_id: comment_id of the row to update.
        comment_item: column -> value mapping with the new values.

    Returns:
        Number of rows affected.
    """
    db_conn: AsyncMysqlDB = media_crawler_db_var.get()
    return await db_conn.update_table("tieba_comment", comment_item, "comment_id", comment_id)
|
||||
|
||||
|
||||
async def query_creator_by_user_id(user_id: str) -> Dict:
    """Query one tieba creator row by user_id.

    Args:
        user_id: user_id value to look up.

    Returns:
        The first matching row as a dict, or an empty dict when none found.
    """
    async_db_conn: AsyncMysqlDB = media_crawler_db_var.get()
    # NOTE(review): string-built SQL; escape single quotes so a crawled id
    # cannot break out of the literal. Prefer parameterized queries if the
    # DB layer supports them — TODO confirm.
    safe_id = user_id.replace("'", "''")
    sql: str = f"select * from tieba_creator where user_id = '{safe_id}'"
    rows: List[Dict] = await async_db_conn.query(sql)
    if len(rows) > 0:
        return rows[0]
    return dict()
|
||||
|
||||
|
||||
async def add_new_creator(creator_item: Dict) -> int:
    """Insert one tieba creator row.

    Args:
        creator_item: column -> value mapping for the new row.

    Returns:
        The auto-increment id of the inserted row.
    """
    db_conn: AsyncMysqlDB = media_crawler_db_var.get()
    return await db_conn.item_to_table("tieba_creator", creator_item)
|
||||
|
||||
|
||||
async def update_creator_by_user_id(user_id: str, creator_item: Dict) -> int:
    """Update the tieba creator row whose user_id matches.

    Args:
        user_id: user_id of the row to update.
        creator_item: column -> value mapping with the new values.

    Returns:
        Number of rows affected.
    """
    db_conn: AsyncMysqlDB = media_crawler_db_var.get()
    return await db_conn.update_table("tieba_creator", creator_item, "user_id", user_id)
|
|
@ -13,6 +13,8 @@ import httpx
|
|||
from PIL import Image, ImageDraw
|
||||
from playwright.async_api import Cookie, Page
|
||||
|
||||
from proxy import IpInfoModel
|
||||
|
||||
from . import utils
|
||||
|
||||
|
||||
|
@ -133,3 +135,24 @@ def match_interact_info_count(count_str: str) -> int:
|
|||
return int(number)
|
||||
else:
|
||||
return 0
|
||||
|
||||
|
||||
def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]:
    """format proxy info for playwright and httpx"""
    endpoint = f"{ip_proxy_info.ip}:{ip_proxy_info.port}"
    playwright_proxy = {
        "server": f"{ip_proxy_info.protocol}{endpoint}",
        "username": ip_proxy_info.user,
        "password": ip_proxy_info.password,
    }
    httpx_proxy = {
        f"{ip_proxy_info.protocol}": (
            f"http://{ip_proxy_info.user}:{ip_proxy_info.password}@{endpoint}"
        )
    }
    return playwright_proxy, httpx_proxy
|
||||
|
||||
def extract_text_from_html(html: str) -> str:
    """Extract text from HTML, removing all tags."""
    # First drop <script>/<style> elements together with their contents,
    # which must not appear in the extracted text.
    without_blocks = re.sub(r'<(script|style)[^>]*>.*?</\1>', '', html, flags=re.DOTALL)
    # Then strip every remaining tag and trim surrounding whitespace.
    return re.sub(r'<[^>]+>', '', without_blocks).strip()
|
||||
|
|
|
@ -10,7 +10,7 @@ def init_loging_config():
|
|||
level = logging.INFO
|
||||
logging.basicConfig(
|
||||
level=level,
|
||||
format="%(asctime)s [%(threadName)s] %(name)s %(levelname)s (%(filename)s:%(lineno)d) - %(message)s",
|
||||
format="%(asctime)s %(name)s %(levelname)s (%(filename)s:%(lineno)d) - %(message)s",
|
||||
datefmt='%Y-%m-%d %H:%M:%S'
|
||||
)
|
||||
_logger = logging.getLogger("MediaCrawler")
|
||||
|
|
Loading…
Reference in New Issue