From d347cf5a2c171573be40c63e3226c34010b1de3c Mon Sep 17 00:00:00 2001 From: Relakkes Date: Tue, 6 Aug 2024 03:37:55 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E5=B8=96=E5=AD=90=E6=90=9C=E7=B4=A2=20?= =?UTF-8?q?&=20=E7=A7=BB=E9=99=A4=E7=99=BB=E5=BD=95=E4=BB=A3=E7=A0=81?= =?UTF-8?q?=E4=BD=BF=E7=94=A8IP=E4=BB=A3=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- media_platform/tieba/client.py | 104 ++-- media_platform/tieba/core.py | 129 ++--- media_platform/tieba/help.py | 69 +++ .../tieba/test_data/search_keyword_notes.html | 96 ++++ schema/tables.sql | 536 ++++++++++-------- store/tieba/__init__.py | 23 +- store/tieba/tieba_store_sql.py | 18 +- tools/crawler_util.py | 14 + 8 files changed, 600 insertions(+), 389 deletions(-) create mode 100644 media_platform/tieba/help.py create mode 100644 media_platform/tieba/test_data/search_keyword_notes.html diff --git a/media_platform/tieba/client.py b/media_platform/tieba/client.py index a7ebaa1..a02e243 100644 --- a/media_platform/tieba/client.py +++ b/media_platform/tieba/client.py @@ -1,67 +1,77 @@ -import asyncio import json -import re +import random from typing import Any, Callable, Dict, List, Optional, Union from urllib.parse import urlencode import httpx -from playwright.async_api import BrowserContext, Page +from playwright.async_api import BrowserContext +from tenacity import (RetryError, retry, stop_after_attempt, + wait_fixed) -import config from base.base_crawler import AbstractApiClient +from proxy.proxy_ip_pool import ProxyIpPool from tools import utils from .field import SearchNoteType, SearchSortType +from .help import TieBaExtractor class BaiduTieBaClient(AbstractApiClient): def __init__( self, timeout=10, - proxies=None, - *, - headers: Dict[str, str], - playwright_page: Page, - cookie_dict: Dict[str, str], + ip_pool=None, + default_ip_proxy=None, ): - self.proxies = proxies + self.ip_pool: Optional[ProxyIpPool] = ip_pool self.timeout = timeout - 
self.headers = headers - self.playwright_page = playwright_page - self.cookie_dict = cookie_dict + self.headers = utils.get_user_agent() self._host = "https://tieba.baidu.com" + self._page_extractor = TieBaExtractor() + self.default_ip_proxy = default_ip_proxy - async def request(self, method, url, **kwargs) -> Union[str, Any]: + @retry(stop=stop_after_attempt(3), wait=wait_fixed(1)) + async def request(self, method, url, return_ori_content=False, proxies=None, **kwargs) -> Union[str, Any]: """ 封装httpx的公共请求方法,对请求响应做一些处理 Args: method: 请求方法 url: 请求的URL + return_ori_content: 是否返回原始内容 + proxies: 代理IP **kwargs: 其他请求参数,例如请求头、请求体等 Returns: """ - # return response.text - return_response = kwargs.pop('return_response', False) - - async with httpx.AsyncClient(proxies=self.proxies) as client: + actual_proxies = proxies if proxies else self.default_ip_proxy + async with httpx.AsyncClient(proxies=actual_proxies) as client: response = await client.request( method, url, timeout=self.timeout, **kwargs ) - if return_response: + if response.status_code != 200: + utils.logger.error(f"Request failed, method: {method}, url: {url}, status code: {response.status_code}") + utils.logger.error(f"Request failed, response: {response.text}") + raise Exception(f"Request failed, method: {method}, url: {url}, status code: {response.status_code}") + + if response.text == "" or response.text == "blocked": + utils.logger.error(f"request params incrr, response.text: {response.text}") + raise Exception("account blocked") + + if return_ori_content: return response.text return response.json() - async def get(self, uri: str, params=None) -> Dict: + async def get(self, uri: str, params=None, return_ori_content=False, **kwargs) -> Any: """ GET请求,对请求头签名 Args: uri: 请求路由 params: 请求参数 + return_ori_content: 是否返回原始内容 Returns: @@ -70,9 +80,25 @@ class BaiduTieBaClient(AbstractApiClient): if isinstance(params, dict): final_uri = (f"{uri}?" 
f"{urlencode(params)}") - return await self.request(method="GET", url=f"{self._host}{final_uri}", headers=self.headers) + try: + res = await self.request(method="GET", url=f"{self._host}{final_uri}", + return_ori_content=return_ori_content, + **kwargs) + return res + except RetryError as e: + if self.ip_pool: + proxie_model = await self.ip_pool.get_proxy() + _, proxies = utils.format_proxy_info(proxie_model) + res = await self.request(method="GET", url=f"{self._host}{final_uri}", + return_ori_content=return_ori_content, + proxies=proxies, + **kwargs) + self.default_ip_proxy = proxies + return res - async def post(self, uri: str, data: dict) -> Dict: + utils.logger.error(f"[BaiduTieBaClient.get] 达到了最大重试次数,请尝试更换新的IP代理: {e}") + + async def post(self, uri: str, data: dict, **kwargs) -> Dict: """ POST请求,对请求头签名 Args: @@ -84,7 +110,7 @@ class BaiduTieBaClient(AbstractApiClient): """ json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False) return await self.request(method="POST", url=f"{self._host}{uri}", - data=json_str, headers=self.headers) + data=json_str, **kwargs) async def pong(self) -> bool: """ @@ -96,6 +122,7 @@ class BaiduTieBaClient(AbstractApiClient): try: uri = "/mo/q/sync" res: Dict = await self.get(uri) + utils.logger.info(f"[BaiduTieBaClient.pong] res: {res}") if res and res.get("no") == 0: ping_flag = True else: @@ -115,31 +142,42 @@ class BaiduTieBaClient(AbstractApiClient): Returns: """ - cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies()) - self.headers["Cookie"] = cookie_str - self.cookie_dict = cookie_dict + pass - async def get_note_by_keyword( + async def get_notes_by_keyword( self, keyword: str, page: int = 1, page_size: int = 10, sort: SearchSortType = SearchSortType.TIME_DESC, - note_type: SearchNoteType = SearchNoteType.FIXED_THREAD - ) -> Dict: + note_type: SearchNoteType = SearchNoteType.FIXED_THREAD, + random_sleep: bool = True + ) -> List[Dict]: """ 根据关键词搜索贴吧帖子 Args: keyword: 关键词 page: 分页第几页 - 
page_size: 每页肠病毒 + page_size: 每页大小 sort: 结果排序方式 note_type: 帖子类型(主题贴|主题+回复混合模式) + random_sleep: 是否随机休眠 Returns: """ - # todo impl it - return {} + uri = "/f/search/res" + params = { + "isnew": 1, + "qw": keyword, + "rn": page_size, + "pn": page, + "sm": sort.value, + "only_thread": note_type.value + } + page_content = await self.get(uri, params=params, return_ori_content=True) + if random_sleep: + random.randint(1, 5) + return self._page_extractor.extract_search_note_list(page_content) async def get_note_by_id(self, note_id: str) -> Dict: """ @@ -166,4 +204,4 @@ class BaiduTieBaClient(AbstractApiClient): """ # todo impl it - return [] \ No newline at end of file + return [] diff --git a/media_platform/tieba/core.py b/media_platform/tieba/core.py index c7a99d5..91795a4 100644 --- a/media_platform/tieba/core.py +++ b/media_platform/tieba/core.py @@ -9,9 +9,10 @@ from playwright.async_api import (BrowserContext, BrowserType, Page, import config from base.base_crawler import AbstractCrawler -from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool +from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool, ProxyIpPool from store import tieba as tieba_store from tools import utils +from tools.crawler_util import format_proxy_info from var import crawler_type_var from .client import BaiduTieBaClient @@ -29,53 +30,43 @@ class TieBaCrawler(AbstractCrawler): self.user_agent = utils.get_user_agent() async def start(self) -> None: - playwright_proxy_format, httpx_proxy_format = None, None + """ + Start the crawler + Returns: + + """ + ip_proxy_pool, httpx_proxy_format = None, None if config.ENABLE_IP_PROXY: + utils.logger.info("[BaiduTieBaCrawler.start] Begin create ip proxy pool ...") ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True) ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy() - playwright_proxy_format, httpx_proxy_format = self.format_proxy_info(ip_proxy_info) + _, httpx_proxy_format = 
format_proxy_info(ip_proxy_info) + utils.logger.info(f"[BaiduTieBaCrawler.start] Init default ip proxy, value: {httpx_proxy_format}") - async with async_playwright() as playwright: - # Launch a browser context. - chromium = playwright.chromium - self.browser_context = await self.launch_browser( - chromium, - None, - self.user_agent, - headless=config.HEADLESS - ) - # stealth.min.js is a js script to prevent the website from detecting the crawler. - await self.browser_context.add_init_script(path="libs/stealth.min.js") - self.context_page = await self.browser_context.new_page() - await self.context_page.goto(self.index_url) + # Create a client to interact with the baidutieba website. + self.tieba_client = BaiduTieBaClient( + ip_pool=ip_proxy_pool, + default_ip_proxy=httpx_proxy_format, + ) + crawler_type_var.set(config.CRAWLER_TYPE) + if config.CRAWLER_TYPE == "search": + # Search for notes and retrieve their comment information. + await self.search() + elif config.CRAWLER_TYPE == "detail": + # Get the information and comments of the specified post + await self.get_specified_notes() + else: + pass - # Create a client to interact with the baidutieba website. - self.tieba_client = await self.create_tieba_client(httpx_proxy_format) - if not await self.tieba_client.pong(): - login_obj = BaiduTieBaLogin( - login_type=config.LOGIN_TYPE, - login_phone="", # input your phone number - browser_context=self.browser_context, - context_page=self.context_page, - cookie_str=config.COOKIES - ) - await login_obj.begin() - await self.tieba_client.update_cookies(browser_context=self.browser_context) - - crawler_type_var.set(config.CRAWLER_TYPE) - if config.CRAWLER_TYPE == "search": - # Search for notes and retrieve their comment information. 
- await self.search() - elif config.CRAWLER_TYPE == "detail": - # Get the information and comments of the specified post - await self.get_specified_notes() - else: - pass - - utils.logger.info("[BaiduTieBaCrawler.start] Xhs Crawler finished ...") + utils.logger.info("[BaiduTieBaCrawler.start] Tieba Crawler finished ...") async def search(self) -> None: - """Search for notes and retrieve their comment information.""" + """ + Search for notes and retrieve their comment information. + Returns: + + """ + utils.logger.info("[BaiduTieBaCrawler.search] Begin search baidutieba keywords") tieba_limit_count = 10 # tieba limit page fixed value if config.CRAWLER_MAX_NOTES_COUNT < tieba_limit_count: @@ -92,36 +83,26 @@ class TieBaCrawler(AbstractCrawler): try: utils.logger.info(f"[BaiduTieBaCrawler.search] search tieba keyword: {keyword}, page: {page}") note_id_list: List[str] = [] - notes_res = await self.tieba_client.get_note_by_keyword( + notes_list_res = await self.tieba_client.get_notes_by_keyword( keyword=keyword, page=page, page_size=tieba_limit_count, sort=SearchSortType.TIME_DESC, note_type=SearchNoteType.FIXED_THREAD ) - utils.logger.info(f"[BaiduTieBaCrawler.search] Search notes res:{notes_res}") - if not notes_res or not notes_res.get('has_more', False): - utils.logger.info("No more content!") + utils.logger.info(f"[BaiduTieBaCrawler.search] Search notes res:{notes_list_res}") + if not notes_list_res: break - semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) - task_list = [ - self.get_note_detail( - note_id=post_item.get("id"), - semaphore=semaphore - ) - for post_item in notes_res.get("items", {}) - if post_item.get('model_type') not in ('rec_query', 'hot_query') - ] - note_details = await asyncio.gather(*task_list) - for note_detail in note_details: + + for note_detail in notes_list_res: if note_detail: await tieba_store.update_tieba_note(note_detail) note_id_list.append(note_detail.get("note_id")) page += 1 - utils.logger.info(f"[BaiduTieBaCrawler.search] 
Note details: {note_details}") + utils.logger.info(f"[BaiduTieBaCrawler.search] Note details: {notes_list_res}") await self.batch_get_note_comments(note_id_list) except Exception as ex: - utils.logger.error(f"[BaiduTieBaCrawler.search] Get note detail error, err: {ex}") + utils.logger.error(f"[BaiduTieBaCrawler.search] Search note list error, err: {ex}") break async def fetch_creator_notes_detail(self, note_list: List[Dict]): @@ -197,34 +178,20 @@ class TieBaCrawler(AbstractCrawler): callback=tieba_store.batch_update_tieba_note_comments ) - @staticmethod - def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]: - """format proxy info for playwright and httpx""" - playwright_proxy = { - "server": f"{ip_proxy_info.protocol}{ip_proxy_info.ip}:{ip_proxy_info.port}", - "username": ip_proxy_info.user, - "password": ip_proxy_info.password, - } - httpx_proxy = { - f"{ip_proxy_info.protocol}": f"http://{ip_proxy_info.user}:{ip_proxy_info.password}@{ip_proxy_info.ip}:{ip_proxy_info.port}" - } - return playwright_proxy, httpx_proxy + async def create_tieba_client(self, ip_pool: ProxyIpPool) -> BaiduTieBaClient: + """ + Create tieba client + Args: + ip_pool: - async def create_tieba_client(self, httpx_proxy: Optional[str]) -> BaiduTieBaClient: + Returns: + + """ """Create tieba client""" utils.logger.info("[BaiduTieBaCrawler.create_tieba_client] Begin create baidutieba API client ...") cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies()) tieba_client_obj = BaiduTieBaClient( - proxies=httpx_proxy, - headers={ - "User-Agent": self.user_agent, - "Cookie": cookie_str, - "Origin": "https://www.baidutieba.com", - "Referer": "https://www.baidutieba.com", - "Content-Type": "application/json;charset=UTF-8" - }, - playwright_page=self.context_page, - cookie_dict=cookie_dict, + ip_pool=ip_pool, ) return tieba_client_obj diff --git a/media_platform/tieba/help.py b/media_platform/tieba/help.py new file mode 100644 index 
0000000..59eabdb --- /dev/null +++ b/media_platform/tieba/help.py @@ -0,0 +1,69 @@ +# -*- coding: utf-8 -*- + +from typing import List, Dict + +from parsel import Selector + + +class TieBaExtractor: + def __init__(self): + pass + + @staticmethod + def extract_search_note_list(page_content: str) -> List[Dict]: + """ + 提取贴吧帖子列表 + Args: + page_content: 页面内容的HTML字符串 + + Returns: + 包含帖子信息的字典列表 + """ + xpath_selector = "//div[@class='s_post']" + post_list = Selector(text=page_content).xpath(xpath_selector) + result = [] + for post in post_list: + post_id = post.xpath(".//span[@class='p_title']/a/@data-tid").get(default='').strip() + title = post.xpath(".//span[@class='p_title']/a/text()").get(default='').strip() + link = post.xpath(".//span[@class='p_title']/a/@href").get(default='') + description = post.xpath(".//div[@class='p_content']/text()").get(default='').strip() + forum = post.xpath(".//a[@class='p_forum']/font/text()").get(default='').strip() + forum_link = post.xpath(".//a[@class='p_forum']/@href").get(default='') + author = post.xpath(".//a[starts-with(@href, '/home/main')]/font/text()").get(default='').strip() + author_link = post.xpath(".//a[starts-with(@href, '/home/main')]/@href").get(default='') + date = post.xpath(".//font[@class='p_green p_date']/text()").get(default='').strip() + + result.append({ + "note_id": post_id, + "title": title, + "desc": description, + "note_url": link, + "time": date, + "tieba_name": forum, + "tieba_link": forum_link, + "nickname": author, + "nickname_link": author_link, + }) + + return result + + @staticmethod + def extract_tieba_note_comments(page_content: str) -> List[Dict]: + """ + 提取贴吧帖子评论 + Args: + page_content: + + Returns: + + """ + pass + + +if __name__ == '__main__': + with open("test_data/search_keyword_notes.html", "r", encoding="utf-8") as f: + content = f.read() + extractor = TieBaExtractor() + _result = extractor.extract_search_note_list(content) + print(_result) + print(f"Total: {len(_result)}") diff --git 
a/media_platform/tieba/test_data/search_keyword_notes.html b/media_platform/tieba/test_data/search_keyword_notes.html new file mode 100644 index 0000000..d15d8ce --- /dev/null +++ b/media_platform/tieba/test_data/search_keyword_notes.html @@ -0,0 +1,96 @@ +
+
武汉交互空间科技:富士康10亿加码中国大陆,印度为何逐渐“失宠 +
+ 全球知名的电子制造服务巨头富士康的母公司鸿海精密工业股份有限公司正式对外发布了一则重大投资公告,富士康将在郑州投资建设新事业总部大楼,承载新事业总部功能。这一战略举措不仅彰显了富士康对中国市场持续深化的承诺与信心,也预示着该集团业务版图的新一轮扩张与升级。 + 项目一期选址位于郑东新区,建筑面积约700公亩,总投资约10亿元人民币。主要建设总部管理中心、研发中心和工程中心、战略产业发展中心、战略产业金融平台、 +
+ 贴吧:武汉交互空间作者:VR虚拟达人 + 2024-08-05 16:45
+
请各位急用玛尼的小心,骗子最多 +
+ 这里面到处是骗子,大家小心。特别那些叫出村背货的,基本是卖园区,天下没有那么好的事。就是有这好事,我们在边境上的人,比你们最清楚,轮不到你们,边境上比你们胆子大的人大把,你一不熟悉小路,为什么叫你带货。东南亚带货的集结地,一般在南宁,防城港,昆明,西双版纳,临沧然后师机接了走小路出去,南宁,防城港坐船出去。好多都是二十几手的中介,之前卖园区一个三十万,现在不知道行情,但好多园区不收 +
+ 贴吧:背包客作者:贴吧用户_GC64AUS + 2024-08-03 07:35
+
*2025泰国冷链制冷运输展*东南亚外贸出口 +
**2025泰国曼谷国际冷库、空调制冷、仓储暨冷链运输展 *2025泰国冷链制冷运输展*东南亚外贸出口-观展游览考察 + 展出时间:2025-7月(具体时间待定) 展出地点:泰国曼谷会展中心 展会周期:一年一届 组展单位:北京励航国际商务会展有限公司 + 人员跟团观展补贴!为您节省成本,寻找适合您的市场: + 本公司为您提供观展考察机会,让您在大型展会上获得世界同行**科技的资料同时,感受异域文化气息。展会现场走展考察→→当地游览→→当地相关市 +
+ 贴吧:国际展会作者:zhaot_188 2024-07-19 15:44
+
京湘楼创始人肖鑫:创立于北京,植根长沙,百年美食传承 +
来源标题:京湘楼创始人肖鑫:创立于北京,植根长沙,百年美食传承 京湘楼(KING HERO)品牌创始人:肖鑫 + 京湘楼,KING + HERO,集酱板鸭、肥肠、鸭头、鸭脖、鸭肠、小龙虾、牛蛙、捆鸡、鸡爪、鱼嘴巴、鱼尾、鱿鱼、牛肉、猪头肉等特色食品卤制,加工、包装与生产经营。2022年3月在北京朝阳区双井开设了第一家“京湘楼·鲜卤集市”卤味熟食快餐店,2023年5月在湖南省长沙市开福区注册成立了“长沙京湘楼品牌管理有限公司”,以“京湘楼”作为品 +
+ 贴吧:京湘楼作者:天神渡尘 2024-07-17 23:43
+
广州能争取到迪士尼与环球落户吗? +
+ 不是二选一,而是全都要。上一组数据,上海迪士尼2016年开业就接待游客超过1.2亿人次,香港迪士尼2023全年游客人数才640万人次,约等于无,这么低的入园人次已经引来迪士尼方面的不悦。 + 美国有两个迪士尼,说实话迪士尼的门票并不高,普通人都去的起,中国完全有能力建两到三个迪士尼,欧洲只有第一个迪士尼,因为它的人口只有中国的一半,假设中国人一年吃一包盐,一年就是14包,那么欧洲就是七亿包盐,盐再便宜,欧洲人也不可能一人吃 +
+ 贴吧:地理作者:SeaRoutes 2024-07-13 20:17
+
#城市GDP#广州应该全力去争取迪士尼和环球影城 +
+ 不是二选一,而是全都要。上一组数据,上海迪士尼2016年开业就接待游客超过1.2亿人次,香港迪士尼2023全年游客人数才640万人次,约等于无,这么低的入园人次已经引来迪士尼方面的不悦。 + 美国有两个迪士尼,说实话迪士尼的门票并不高,普通人都去的起,中国完全有能力建两到三个迪士尼,欧洲只有第一个迪士尼,因为它的人口只有中国的一半,假设中国人一年吃一包盐,一年就是14包,那么欧洲就是七亿包盐,盐再便宜,欧洲人也不可能一人吃 +
+ 贴吧:城市gdp作者:SeaRoutes 2024-07-13 20:14
+
云南省首批《云南日报》昆明新闻头条聚焦阳宗海省级物流枢纽建设 +
+ 7月11日《云南日报》昆明新闻头条刊发文章《阳宗海风景名胜区立足“衔接西部陆海新通道与中老铁路”优势——加速28个物流枢纽设施建设》聚焦昆明阳宗海风景名胜区系统推进省级物流枢纽建设和功能提升深挖比较优势壮大物流产业据云南省发展和改革委员会在昆明召开的新闻发布会上公布,今年全省共有5地纳入云南省第一批省级物流枢纽和省级骨干冷链物流基地建设名单,其中,昆明市有两家获批,阳宗海物流枢纽上榜!一起来看近日,云南省 +
+ 贴吧:昆明作者: 2024-07-12 23:04
+
寻找弟弟,很久没跟家里联系 +
Kk四期世纪园区,寻找弟弟,外号大佐,F3 2楼,公司cj集团
+ 贴吧:东南亚作者:贴吧用户_GC2CtRa + 2024-07-11 07:53
+
拉美 非洲 东南亚 南亚等发展中国家不太可能普及八小时双休吧? +
拉美 和 东南亚的泰国 之类的连毒枭和黑色产业都管不好感觉普及八小时双休不太可能 缅甸和非洲军阀林立 + 跟军阀谈八小时双休那么不开玩笑?缅北诈骗园区就能看出来。 +
+ 贴吧:历史作者:yoursagain 2024-07-10 09:00
+
东南亚,园区【 工 价 低 】 +
+ 贴吧:园区招商作者:QQ59052966 2024-06-30 12:09
+
\ No newline at end of file diff --git a/schema/tables.sql b/schema/tables.sql index 3530189..88828b7 100644 --- a/schema/tables.sql +++ b/schema/tables.sql @@ -2,192 +2,200 @@ -- Table structure for bilibili_video -- ---------------------------- DROP TABLE IF EXISTS `bilibili_video`; -CREATE TABLE `bilibili_video` ( - `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', - `user_id` varchar(64) DEFAULT NULL COMMENT '用户ID', - `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', - `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', - `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', - `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', - `video_id` varchar(64) NOT NULL COMMENT '视频ID', - `video_type` varchar(16) NOT NULL COMMENT '视频类型', - `title` varchar(500) DEFAULT NULL COMMENT '视频标题', - `desc` longtext COMMENT '视频描述', - `create_time` bigint NOT NULL COMMENT '视频发布时间戳', - `liked_count` varchar(16) DEFAULT NULL COMMENT '视频点赞数', - `video_play_count` varchar(16) DEFAULT NULL COMMENT '视频播放数量', - `video_danmaku` varchar(16) DEFAULT NULL COMMENT '视频弹幕数量', - `video_comment` varchar(16) DEFAULT NULL COMMENT '视频评论数量', - `video_url` varchar(512) DEFAULT NULL COMMENT '视频详情URL', - `video_cover_url` varchar(512) DEFAULT NULL COMMENT '视频封面图 URL', - PRIMARY KEY (`id`), - KEY `idx_bilibili_vi_video_i_31c36e` (`video_id`), - KEY `idx_bilibili_vi_create__73e0ec` (`create_time`) +CREATE TABLE `bilibili_video` +( + `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', + `user_id` varchar(64) DEFAULT NULL COMMENT '用户ID', + `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', + `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', + `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', + `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', + `video_id` varchar(64) NOT NULL COMMENT '视频ID', + `video_type` varchar(16) NOT NULL COMMENT '视频类型', + `title` varchar(500) DEFAULT NULL COMMENT '视频标题', + `desc` longtext COMMENT '视频描述', + `create_time` bigint NOT NULL COMMENT '视频发布时间戳', + `liked_count` varchar(16) DEFAULT 
NULL COMMENT '视频点赞数', + `video_play_count` varchar(16) DEFAULT NULL COMMENT '视频播放数量', + `video_danmaku` varchar(16) DEFAULT NULL COMMENT '视频弹幕数量', + `video_comment` varchar(16) DEFAULT NULL COMMENT '视频评论数量', + `video_url` varchar(512) DEFAULT NULL COMMENT '视频详情URL', + `video_cover_url` varchar(512) DEFAULT NULL COMMENT '视频封面图 URL', + PRIMARY KEY (`id`), + KEY `idx_bilibili_vi_video_i_31c36e` (`video_id`), + KEY `idx_bilibili_vi_create__73e0ec` (`create_time`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='B站视频'; -- ---------------------------- -- Table structure for bilibili_video_comment -- ---------------------------- DROP TABLE IF EXISTS `bilibili_video_comment`; -CREATE TABLE `bilibili_video_comment` ( - `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', - `user_id` varchar(64) DEFAULT NULL COMMENT '用户ID', - `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', - `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', - `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', - `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', - `comment_id` varchar(64) NOT NULL COMMENT '评论ID', - `video_id` varchar(64) NOT NULL COMMENT '视频ID', - `content` longtext COMMENT '评论内容', - `create_time` bigint NOT NULL COMMENT '评论时间戳', - `sub_comment_count` varchar(16) NOT NULL COMMENT '评论回复数', - PRIMARY KEY (`id`), - KEY `idx_bilibili_vi_comment_41c34e` (`comment_id`), - KEY `idx_bilibili_vi_video_i_f22873` (`video_id`) +CREATE TABLE `bilibili_video_comment` +( + `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', + `user_id` varchar(64) DEFAULT NULL COMMENT '用户ID', + `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', + `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', + `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', + `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', + `comment_id` varchar(64) NOT NULL COMMENT '评论ID', + `video_id` varchar(64) NOT NULL COMMENT '视频ID', + `content` longtext COMMENT '评论内容', + `create_time` bigint NOT NULL COMMENT '评论时间戳', + 
`sub_comment_count` varchar(16) NOT NULL COMMENT '评论回复数', + PRIMARY KEY (`id`), + KEY `idx_bilibili_vi_comment_41c34e` (`comment_id`), + KEY `idx_bilibili_vi_video_i_f22873` (`video_id`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='B 站视频评论'; -- ---------------------------- -- Table structure for bilibili_up_info -- ---------------------------- DROP TABLE IF EXISTS `bilibili_up_info`; -CREATE TABLE `bilibili_up_info` ( - `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', - `user_id` varchar(64) DEFAULT NULL COMMENT '用户ID', - `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', - `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', - `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', - `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', - `total_fans` bigint DEFAULT NULL COMMENT '粉丝数', - `total_liked` bigint DEFAULT NULL COMMENT '总获赞数', - `user_rank` int DEFAULT NULL COMMENT '用户等级', - `is_official` int DEFAULT NULL COMMENT '是否官号', - PRIMARY KEY (`id`), - KEY `idx_bilibili_vi_user_123456` (`user_id`) +CREATE TABLE `bilibili_up_info` +( + `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', + `user_id` varchar(64) DEFAULT NULL COMMENT '用户ID', + `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', + `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', + `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', + `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', + `total_fans` bigint DEFAULT NULL COMMENT '粉丝数', + `total_liked` bigint DEFAULT NULL COMMENT '总获赞数', + `user_rank` int DEFAULT NULL COMMENT '用户等级', + `is_official` int DEFAULT NULL COMMENT '是否官号', + PRIMARY KEY (`id`), + KEY `idx_bilibili_vi_user_123456` (`user_id`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='B 站UP主信息'; -- ---------------------------- -- Table structure for douyin_aweme -- ---------------------------- DROP TABLE IF EXISTS `douyin_aweme`; -CREATE TABLE `douyin_aweme` ( - `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', - `user_id` varchar(64) DEFAULT NULL 
COMMENT '用户ID', - `sec_uid` varchar(128) DEFAULT NULL COMMENT '用户sec_uid', - `short_user_id` varchar(64) DEFAULT NULL COMMENT '用户短ID', - `user_unique_id` varchar(64) DEFAULT NULL COMMENT '用户唯一ID', - `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', - `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', - `user_signature` varchar(500) DEFAULT NULL COMMENT '用户签名', - `ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址', - `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', - `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', - `aweme_id` varchar(64) NOT NULL COMMENT '视频ID', - `aweme_type` varchar(16) NOT NULL COMMENT '视频类型', - `title` varchar(500) DEFAULT NULL COMMENT '视频标题', - `desc` longtext COMMENT '视频描述', - `create_time` bigint NOT NULL COMMENT '视频发布时间戳', - `liked_count` varchar(16) DEFAULT NULL COMMENT '视频点赞数', - `comment_count` varchar(16) DEFAULT NULL COMMENT '视频评论数', - `share_count` varchar(16) DEFAULT NULL COMMENT '视频分享数', - `collected_count` varchar(16) DEFAULT NULL COMMENT '视频收藏数', - `aweme_url` varchar(255) DEFAULT NULL COMMENT '视频详情页URL', - PRIMARY KEY (`id`), - KEY `idx_douyin_awem_aweme_i_6f7bc6` (`aweme_id`), - KEY `idx_douyin_awem_create__299dfe` (`create_time`) +CREATE TABLE `douyin_aweme` +( + `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', + `user_id` varchar(64) DEFAULT NULL COMMENT '用户ID', + `sec_uid` varchar(128) DEFAULT NULL COMMENT '用户sec_uid', + `short_user_id` varchar(64) DEFAULT NULL COMMENT '用户短ID', + `user_unique_id` varchar(64) DEFAULT NULL COMMENT '用户唯一ID', + `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', + `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', + `user_signature` varchar(500) DEFAULT NULL COMMENT '用户签名', + `ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址', + `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', + `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', + `aweme_id` varchar(64) NOT NULL COMMENT '视频ID', + `aweme_type` varchar(16) NOT NULL COMMENT '视频类型', + `title` varchar(500) DEFAULT NULL COMMENT 
'视频标题', + `desc` longtext COMMENT '视频描述', + `create_time` bigint NOT NULL COMMENT '视频发布时间戳', + `liked_count` varchar(16) DEFAULT NULL COMMENT '视频点赞数', + `comment_count` varchar(16) DEFAULT NULL COMMENT '视频评论数', + `share_count` varchar(16) DEFAULT NULL COMMENT '视频分享数', + `collected_count` varchar(16) DEFAULT NULL COMMENT '视频收藏数', + `aweme_url` varchar(255) DEFAULT NULL COMMENT '视频详情页URL', + PRIMARY KEY (`id`), + KEY `idx_douyin_awem_aweme_i_6f7bc6` (`aweme_id`), + KEY `idx_douyin_awem_create__299dfe` (`create_time`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='抖音视频'; -- ---------------------------- -- Table structure for douyin_aweme_comment -- ---------------------------- DROP TABLE IF EXISTS `douyin_aweme_comment`; -CREATE TABLE `douyin_aweme_comment` ( - `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', - `user_id` varchar(64) DEFAULT NULL COMMENT '用户ID', - `sec_uid` varchar(128) DEFAULT NULL COMMENT '用户sec_uid', - `short_user_id` varchar(64) DEFAULT NULL COMMENT '用户短ID', - `user_unique_id` varchar(64) DEFAULT NULL COMMENT '用户唯一ID', - `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', - `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', - `user_signature` varchar(500) DEFAULT NULL COMMENT '用户签名', - `ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址', - `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', - `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', - `comment_id` varchar(64) NOT NULL COMMENT '评论ID', - `aweme_id` varchar(64) NOT NULL COMMENT '视频ID', - `content` longtext COMMENT '评论内容', - `create_time` bigint NOT NULL COMMENT '评论时间戳', - `sub_comment_count` varchar(16) NOT NULL COMMENT '评论回复数', - PRIMARY KEY (`id`), - KEY `idx_douyin_awem_comment_fcd7e4` (`comment_id`), - KEY `idx_douyin_awem_aweme_i_c50049` (`aweme_id`) +CREATE TABLE `douyin_aweme_comment` +( + `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', + `user_id` varchar(64) DEFAULT NULL COMMENT '用户ID', + `sec_uid` varchar(128) DEFAULT NULL COMMENT '用户sec_uid', + 
`short_user_id` varchar(64) DEFAULT NULL COMMENT '用户短ID', + `user_unique_id` varchar(64) DEFAULT NULL COMMENT '用户唯一ID', + `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', + `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', + `user_signature` varchar(500) DEFAULT NULL COMMENT '用户签名', + `ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址', + `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', + `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', + `comment_id` varchar(64) NOT NULL COMMENT '评论ID', + `aweme_id` varchar(64) NOT NULL COMMENT '视频ID', + `content` longtext COMMENT '评论内容', + `create_time` bigint NOT NULL COMMENT '评论时间戳', + `sub_comment_count` varchar(16) NOT NULL COMMENT '评论回复数', + PRIMARY KEY (`id`), + KEY `idx_douyin_awem_comment_fcd7e4` (`comment_id`), + KEY `idx_douyin_awem_aweme_i_c50049` (`aweme_id`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='抖音视频评论'; -- ---------------------------- -- Table structure for dy_creator -- ---------------------------- DROP TABLE IF EXISTS `dy_creator`; -CREATE TABLE `dy_creator` ( - `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', - `user_id` varchar(128) NOT NULL COMMENT '用户ID', - `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', - `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', - `ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址', - `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', - `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', - `desc` longtext COMMENT '用户描述', - `gender` varchar(1) DEFAULT NULL COMMENT '性别', - `follows` varchar(16) DEFAULT NULL COMMENT '关注数', - `fans` varchar(16) DEFAULT NULL COMMENT '粉丝数', - `interaction` varchar(16) DEFAULT NULL COMMENT '获赞数', - `videos_count` varchar(16) DEFAULT NULL COMMENT '作品数', - PRIMARY KEY (`id`) +CREATE TABLE `dy_creator` +( + `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', + `user_id` varchar(128) NOT NULL COMMENT '用户ID', + `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', + `avatar` varchar(255) DEFAULT NULL COMMENT 
'用户头像地址', + `ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址', + `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', + `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', + `desc` longtext COMMENT '用户描述', + `gender` varchar(1) DEFAULT NULL COMMENT '性别', + `follows` varchar(16) DEFAULT NULL COMMENT '关注数', + `fans` varchar(16) DEFAULT NULL COMMENT '粉丝数', + `interaction` varchar(16) DEFAULT NULL COMMENT '获赞数', + `videos_count` varchar(16) DEFAULT NULL COMMENT '作品数', + PRIMARY KEY (`id`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='抖音博主信息'; -- ---------------------------- -- Table structure for kuaishou_video -- ---------------------------- DROP TABLE IF EXISTS `kuaishou_video`; -CREATE TABLE `kuaishou_video` ( - `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', - `user_id` varchar(64) DEFAULT NULL COMMENT '用户ID', - `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', - `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', - `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', - `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', - `video_id` varchar(64) NOT NULL COMMENT '视频ID', - `video_type` varchar(16) NOT NULL COMMENT '视频类型', - `title` varchar(500) DEFAULT NULL COMMENT '视频标题', - `desc` longtext COMMENT '视频描述', - `create_time` bigint NOT NULL COMMENT '视频发布时间戳', - `liked_count` varchar(16) DEFAULT NULL COMMENT '视频点赞数', - `viewd_count` varchar(16) DEFAULT NULL COMMENT '视频浏览数量', - `video_url` varchar(512) DEFAULT NULL COMMENT '视频详情URL', - `video_cover_url` varchar(512) DEFAULT NULL COMMENT '视频封面图 URL', - `video_play_url` varchar(512) DEFAULT NULL COMMENT '视频播放 URL', - PRIMARY KEY (`id`), - KEY `idx_kuaishou_vi_video_i_c5c6a6` (`video_id`), - KEY `idx_kuaishou_vi_create__a10dee` (`create_time`) +CREATE TABLE `kuaishou_video` +( + `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', + `user_id` varchar(64) DEFAULT NULL COMMENT '用户ID', + `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', + `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', + 
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳', + `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', + `video_id` varchar(64) NOT NULL COMMENT '视频ID', + `video_type` varchar(16) NOT NULL COMMENT '视频类型', + `title` varchar(500) DEFAULT NULL COMMENT '视频标题', + `desc` longtext COMMENT '视频描述', + `create_time` bigint NOT NULL COMMENT '视频发布时间戳', + `liked_count` varchar(16) DEFAULT NULL COMMENT '视频点赞数', + `viewd_count` varchar(16) DEFAULT NULL COMMENT '视频浏览数量', + `video_url` varchar(512) DEFAULT NULL COMMENT '视频详情URL', + `video_cover_url` varchar(512) DEFAULT NULL COMMENT '视频封面图 URL', + `video_play_url` varchar(512) DEFAULT NULL COMMENT '视频播放 URL', + PRIMARY KEY (`id`), + KEY `idx_kuaishou_vi_video_i_c5c6a6` (`video_id`), + KEY `idx_kuaishou_vi_create__a10dee` (`create_time`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='快手视频'; -- ---------------------------- -- Table structure for kuaishou_video_comment -- ---------------------------- DROP TABLE IF EXISTS `kuaishou_video_comment`; -CREATE TABLE `kuaishou_video_comment` ( - `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', - `user_id` varchar(64) DEFAULT NULL COMMENT '用户ID', - `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', - `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', - `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', - `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', - `comment_id` varchar(64) NOT NULL COMMENT '评论ID', - `video_id` varchar(64) NOT NULL COMMENT '视频ID', - `content` longtext COMMENT '评论内容', - `create_time` bigint NOT NULL COMMENT '评论时间戳', - `sub_comment_count` varchar(16) NOT NULL COMMENT '评论回复数', - PRIMARY KEY (`id`), - KEY `idx_kuaishou_vi_comment_ed48fa` (`comment_id`), - KEY `idx_kuaishou_vi_video_i_e50914` (`video_id`) +CREATE TABLE `kuaishou_video_comment` +( + `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', + `user_id` varchar(64) DEFAULT NULL COMMENT '用户ID', + `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', + `avatar` varchar(255) DEFAULT NULL COMMENT 
'用户头像地址', + `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', + `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', + `comment_id` varchar(64) NOT NULL COMMENT '评论ID', + `video_id` varchar(64) NOT NULL COMMENT '视频ID', + `content` longtext COMMENT '评论内容', + `create_time` bigint NOT NULL COMMENT '评论时间戳', + `sub_comment_count` varchar(16) NOT NULL COMMENT '评论回复数', + PRIMARY KEY (`id`), + KEY `idx_kuaishou_vi_comment_ed48fa` (`comment_id`), + KEY `idx_kuaishou_vi_video_i_e50914` (`video_id`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='快手视频评论'; @@ -195,145 +203,175 @@ CREATE TABLE `kuaishou_video_comment` ( -- Table structure for weibo_note -- ---------------------------- DROP TABLE IF EXISTS `weibo_note`; -CREATE TABLE `weibo_note` ( - `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', - `user_id` varchar(64) DEFAULT NULL COMMENT '用户ID', - `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', - `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', - `gender` varchar(12) DEFAULT NULL COMMENT '用户性别', - `profile_url` varchar(255) DEFAULT NULL COMMENT '用户主页地址', - `ip_location` varchar(32) DEFAULT '发布微博的地理信息', - `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', - `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', - `note_id` varchar(64) NOT NULL COMMENT '帖子ID', - `content` longtext COMMENT '帖子正文内容', - `create_time` bigint NOT NULL COMMENT '帖子发布时间戳', - `create_date_time` varchar(32) NOT NULL COMMENT '帖子发布日期时间', - `liked_count` varchar(16) DEFAULT NULL COMMENT '帖子点赞数', - `comments_count` varchar(16) DEFAULT NULL COMMENT '帖子评论数量', - `shared_count` varchar(16) DEFAULT NULL COMMENT '帖子转发数量', - `note_url` varchar(512) DEFAULT NULL COMMENT '帖子详情URL', - PRIMARY KEY (`id`), - KEY `idx_weibo_note_note_id_f95b1a` (`note_id`), - KEY `idx_weibo_note_create__692709` (`create_time`), - KEY `idx_weibo_note_create__d05ed2` (`create_date_time`) +CREATE TABLE `weibo_note` +( + `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', + `user_id` varchar(64) DEFAULT NULL COMMENT 
'用户ID', + `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', + `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', + `gender` varchar(12) DEFAULT NULL COMMENT '用户性别', + `profile_url` varchar(255) DEFAULT NULL COMMENT '用户主页地址', + `ip_location` varchar(32) DEFAULT NULL COMMENT '发布微博的地理信息', + `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', + `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', + `note_id` varchar(64) NOT NULL COMMENT '帖子ID', + `content` longtext COMMENT '帖子正文内容', + `create_time` bigint NOT NULL COMMENT '帖子发布时间戳', + `create_date_time` varchar(32) NOT NULL COMMENT '帖子发布日期时间', + `liked_count` varchar(16) DEFAULT NULL COMMENT '帖子点赞数', + `comments_count` varchar(16) DEFAULT NULL COMMENT '帖子评论数量', + `shared_count` varchar(16) DEFAULT NULL COMMENT '帖子转发数量', + `note_url` varchar(512) DEFAULT NULL COMMENT '帖子详情URL', + PRIMARY KEY (`id`), + KEY `idx_weibo_note_note_id_f95b1a` (`note_id`), + KEY `idx_weibo_note_create__692709` (`create_time`), + KEY `idx_weibo_note_create__d05ed2` (`create_date_time`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='微博帖子'; -- ---------------------------- -- Table structure for weibo_note_comment -- ---------------------------- DROP TABLE IF EXISTS `weibo_note_comment`; -CREATE TABLE `weibo_note_comment` ( - `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', - `user_id` varchar(64) DEFAULT NULL COMMENT '用户ID', - `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', - `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', - `gender` varchar(12) DEFAULT NULL COMMENT '用户性别', - `profile_url` varchar(255) DEFAULT NULL COMMENT '用户主页地址', - `ip_location` varchar(32) DEFAULT '发布微博的地理信息', - `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', - `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', - `comment_id` varchar(64) NOT NULL COMMENT '评论ID', - `note_id` varchar(64) NOT NULL COMMENT '帖子ID', - `content` longtext COMMENT '评论内容', - `create_time` bigint NOT NULL COMMENT '评论时间戳', - `create_date_time` varchar(32) NOT NULL COMMENT '评论日期时间', -
`comment_like_count` varchar(16) NOT NULL COMMENT '评论点赞数量', - `sub_comment_count` varchar(16) NOT NULL COMMENT '评论回复数', - PRIMARY KEY (`id`), - KEY `idx_weibo_note__comment_c7611c` (`comment_id`), - KEY `idx_weibo_note__note_id_24f108` (`note_id`), - KEY `idx_weibo_note__create__667fe3` (`create_date_time`) +CREATE TABLE `weibo_note_comment` +( + `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', + `user_id` varchar(64) DEFAULT NULL COMMENT '用户ID', + `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', + `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', + `gender` varchar(12) DEFAULT NULL COMMENT '用户性别', + `profile_url` varchar(255) DEFAULT NULL COMMENT '用户主页地址', + `ip_location` varchar(32) DEFAULT NULL COMMENT '发布微博的地理信息', + `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', + `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', + `comment_id` varchar(64) NOT NULL COMMENT '评论ID', + `note_id` varchar(64) NOT NULL COMMENT '帖子ID', + `content` longtext COMMENT '评论内容', + `create_time` bigint NOT NULL COMMENT '评论时间戳', + `create_date_time` varchar(32) NOT NULL COMMENT '评论日期时间', + `comment_like_count` varchar(16) NOT NULL COMMENT '评论点赞数量', + `sub_comment_count` varchar(16) NOT NULL COMMENT '评论回复数', + PRIMARY KEY (`id`), + KEY `idx_weibo_note__comment_c7611c` (`comment_id`), + KEY `idx_weibo_note__note_id_24f108` (`note_id`), + KEY `idx_weibo_note__create__667fe3` (`create_date_time`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='微博帖子评论'; -- ---------------------------- -- Table structure for xhs_creator -- ---------------------------- DROP TABLE IF EXISTS `xhs_creator`; -CREATE TABLE `xhs_creator` ( - `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', - `user_id` varchar(64) NOT NULL COMMENT '用户ID', - `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', - `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', - `ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址', - `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', - `last_modify_ts` bigint NOT NULL COMMENT
'记录最后修改时间戳', - `desc` longtext COMMENT '用户描述', - `gender` varchar(1) DEFAULT NULL COMMENT '性别', - `follows` varchar(16) DEFAULT NULL COMMENT '关注数', - `fans` varchar(16) DEFAULT NULL COMMENT '粉丝数', - `interaction` varchar(16) DEFAULT NULL COMMENT '获赞和收藏数', - `tag_list` longtext COMMENT '标签列表', - PRIMARY KEY (`id`) +CREATE TABLE `xhs_creator` +( + `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', + `user_id` varchar(64) NOT NULL COMMENT '用户ID', + `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', + `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', + `ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址', + `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', + `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', + `desc` longtext COMMENT '用户描述', + `gender` varchar(1) DEFAULT NULL COMMENT '性别', + `follows` varchar(16) DEFAULT NULL COMMENT '关注数', + `fans` varchar(16) DEFAULT NULL COMMENT '粉丝数', + `interaction` varchar(16) DEFAULT NULL COMMENT '获赞和收藏数', + `tag_list` longtext COMMENT '标签列表', + PRIMARY KEY (`id`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='小红书博主'; -- ---------------------------- -- Table structure for xhs_note -- ---------------------------- DROP TABLE IF EXISTS `xhs_note`; -CREATE TABLE `xhs_note` ( - `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', - `user_id` varchar(64) NOT NULL COMMENT '用户ID', - `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', - `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', - `ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址', - `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', - `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', - `note_id` varchar(64) NOT NULL COMMENT '笔记ID', - `type` varchar(16) DEFAULT NULL COMMENT '笔记类型(normal | video)', - `title` varchar(255) DEFAULT NULL COMMENT '笔记标题', - `desc` longtext COMMENT '笔记描述', - `video_url` longtext COMMENT '视频地址', - `time` bigint NOT NULL COMMENT '笔记发布时间戳', - `last_update_time` bigint NOT NULL COMMENT '笔记最后更新时间戳', - `liked_count` 
varchar(16) DEFAULT NULL COMMENT '笔记点赞数', - `collected_count` varchar(16) DEFAULT NULL COMMENT '笔记收藏数', - `comment_count` varchar(16) DEFAULT NULL COMMENT '笔记评论数', - `share_count` varchar(16) DEFAULT NULL COMMENT '笔记分享数', - `image_list` longtext COMMENT '笔记封面图片列表', - `tag_list` longtext COMMENT '标签列表', - `note_url` varchar(255) DEFAULT NULL COMMENT '笔记详情页的URL', - PRIMARY KEY (`id`), - KEY `idx_xhs_note_note_id_209457` (`note_id`), - KEY `idx_xhs_note_time_eaa910` (`time`) +CREATE TABLE `xhs_note` +( + `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', + `user_id` varchar(64) NOT NULL COMMENT '用户ID', + `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', + `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', + `ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址', + `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', + `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', + `note_id` varchar(64) NOT NULL COMMENT '笔记ID', + `type` varchar(16) DEFAULT NULL COMMENT '笔记类型(normal | video)', + `title` varchar(255) DEFAULT NULL COMMENT '笔记标题', + `desc` longtext COMMENT '笔记描述', + `video_url` longtext COMMENT '视频地址', + `time` bigint NOT NULL COMMENT '笔记发布时间戳', + `last_update_time` bigint NOT NULL COMMENT '笔记最后更新时间戳', + `liked_count` varchar(16) DEFAULT NULL COMMENT '笔记点赞数', + `collected_count` varchar(16) DEFAULT NULL COMMENT '笔记收藏数', + `comment_count` varchar(16) DEFAULT NULL COMMENT '笔记评论数', + `share_count` varchar(16) DEFAULT NULL COMMENT '笔记分享数', + `image_list` longtext COMMENT '笔记封面图片列表', + `tag_list` longtext COMMENT '标签列表', + `note_url` varchar(255) DEFAULT NULL COMMENT '笔记详情页的URL', + PRIMARY KEY (`id`), + KEY `idx_xhs_note_note_id_209457` (`note_id`), + KEY `idx_xhs_note_time_eaa910` (`time`) ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='小红书笔记'; -- ---------------------------- -- Table structure for xhs_note_comment -- ---------------------------- DROP TABLE IF EXISTS `xhs_note_comment`; -CREATE TABLE `xhs_note_comment` ( - 
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', - `user_id` varchar(64) NOT NULL COMMENT '用户ID', - `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', - `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', - `ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址', - `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', - `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', - `comment_id` varchar(64) NOT NULL COMMENT '评论ID', - `create_time` bigint NOT NULL COMMENT '评论时间戳', - `note_id` varchar(64) NOT NULL COMMENT '笔记ID', - `content` longtext NOT NULL COMMENT '评论内容', - `sub_comment_count` int NOT NULL COMMENT '子评论数量', - `pictures` varchar(512) DEFAULT NULL, - PRIMARY KEY (`id`), - KEY `idx_xhs_note_co_comment_8e8349` (`comment_id`), - KEY `idx_xhs_note_co_create__204f8d` (`create_time`) +CREATE TABLE `xhs_note_comment` +( + `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', + `user_id` varchar(64) NOT NULL COMMENT '用户ID', + `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', + `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', + `ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址', + `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', + `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', + `comment_id` varchar(64) NOT NULL COMMENT '评论ID', + `create_time` bigint NOT NULL COMMENT '评论时间戳', + `note_id` varchar(64) NOT NULL COMMENT '笔记ID', + `content` longtext NOT NULL COMMENT '评论内容', + `sub_comment_count` int NOT NULL COMMENT '子评论数量', + `pictures` varchar(512) DEFAULT NULL, + PRIMARY KEY (`id`), + KEY `idx_xhs_note_co_comment_8e8349` (`comment_id`), + KEY `idx_xhs_note_co_create__204f8d` (`create_time`) ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='小红书笔记评论'; -- ---------------------------- -- alter table xhs_note_comment to support parent_comment_id -- ---------------------------- ALTER TABLE `xhs_note_comment` -ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID'; + ADD COLUMN `parent_comment_id` VARCHAR(64) 
DEFAULT NULL COMMENT '父评论ID'; ALTER TABLE `douyin_aweme_comment` -ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID'; + ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID'; ALTER TABLE `bilibili_video_comment` -ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID'; + ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID'; ALTER TABLE `weibo_note_comment` -ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID'; + ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID'; -SET FOREIGN_KEY_CHECKS = 1; +SET +FOREIGN_KEY_CHECKS = 1; + + +DROP TABLE IF EXISTS `tieba_note`; +CREATE TABLE `tieba_note` +( + `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', + `note_id` varchar(64) NOT NULL COMMENT '帖子ID', + `title` varchar(255) DEFAULT NULL COMMENT '笔记标题', + `desc` longtext COMMENT '笔记描述', + `time` varchar(32) NOT NULL COMMENT '笔记发布时间', + `note_url` varchar(255) DEFAULT NULL COMMENT '笔记详情页的URL', + `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', + `nickname_link` varchar(255) DEFAULT NULL COMMENT '用户主页地址', + `tieba_name` varchar(255) DEFAULT NULL COMMENT '贴吧名称', + `tieba_link` varchar(255) DEFAULT NULL COMMENT '贴吧链接地址', + `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', + `ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址', + `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', + `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', + `comment_count` varchar(16) DEFAULT NULL COMMENT '笔记评论数', + PRIMARY KEY (`id`), + KEY `idx_tieba_note_id` (`note_id`), + KEY `idx_tieba_note_time` (`time`) +) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='贴吧帖子表'; diff --git a/store/tieba/__init__.py b/store/tieba/__init__.py index 9605d58..9e47fa4 100644 --- a/store/tieba/__init__.py +++ b/store/tieba/__init__.py @@ -22,31 +22,20 @@ class TieBaStoreFactory: async def update_tieba_note(note_item: Dict): + tieba_url = "https://tieba.baidu.com"
note_id = note_item.get("note_id") - user_info = note_item.get("user", {}) - interact_info = note_item.get("interact_info", {}) - tag_list: List[Dict] = note_item.get("tag_list", []) - local_db_item = { "note_id": note_id, - "type": note_item.get("type"), "title": note_item.get("title") or note_item.get("desc", "")[:255], "desc": note_item.get("desc", ""), + "note_url": tieba_url + note_item.get("note_url", ""), "time": note_item.get("time"), - "last_update_time": note_item.get("last_update_time", 0), - "user_id": user_info.get("user_id"), - "nickname": user_info.get("nickname"), - "avatar": user_info.get("avatar"), - "liked_count": interact_info.get("liked_count"), - "collected_count": interact_info.get("collected_count"), - "comment_count": interact_info.get("comment_count"), - "share_count": interact_info.get("share_count"), + "tieba_name": note_item.get("tieba_name"), + "tieba_link": tieba_url + note_item.get("tieba_link", ""), + "nickname": note_item.get("nickname"), + "nickname_link": tieba_url + note_item.get("nickname_link", ""), "ip_location": note_item.get("ip_location", ""), - - "tag_list": ','.join([tag.get('name', '') for tag in tag_list if tag.get('type') == 'topic']), "last_modify_ts": utils.get_current_timestamp(), - # todo: add note_url - "note_url": "" } utils.logger.info(f"[store.tieba.update_tieba_note] tieba note: {local_db_item}") await TieBaStoreFactory.create_store().store_content(local_db_item) diff --git a/store/tieba/tieba_store_sql.py b/store/tieba/tieba_store_sql.py index 9ec03a4..f99f491 100644 --- a/store/tieba/tieba_store_sql.py +++ b/store/tieba/tieba_store_sql.py @@ -15,7 +15,7 @@ async def query_content_by_content_id(content_id: str) -> Dict: """ async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() - sql: str = f"select * from baidu_tieba where note_id = '{content_id}'" + sql: str = f"select * from tieba_note where note_id = '{content_id}'" rows: List[Dict] = await async_db_conn.query(sql) if len(rows) > 0: return rows[0] @@ -32,7
+32,7 @@ async def add_new_content(content_item: Dict) -> int: """ async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() - last_row_id: int = await async_db_conn.item_to_table("baidu_tieba", content_item) + last_row_id: int = await async_db_conn.item_to_table("tieba_note", content_item) return last_row_id @@ -47,7 +47,7 @@ async def update_content_by_content_id(content_id: str, content_item: Dict) -> i """ async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() - effect_row: int = await async_db_conn.update_table("baidu_tieba", content_item, "note_id", content_id) + effect_row: int = await async_db_conn.update_table("tieba_note", content_item, "note_id", content_id) return effect_row @@ -62,7 +62,7 @@ async def query_comment_by_comment_id(comment_id: str) -> Dict: """ async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() - sql: str = f"select * from baidu_tieba_comment where comment_id = '{comment_id}'" + sql: str = f"select * from tieba_comment where comment_id = '{comment_id}'" rows: List[Dict] = await async_db_conn.query(sql) if len(rows) > 0: return rows[0] @@ -79,7 +79,7 @@ async def add_new_comment(comment_item: Dict) -> int: """ async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() - last_row_id: int = await async_db_conn.item_to_table("baidu_tieba_comment", comment_item) + last_row_id: int = await async_db_conn.item_to_table("tieba_comment", comment_item) return last_row_id @@ -94,7 +94,7 @@ async def update_comment_by_comment_id(comment_id: str, comment_item: Dict) -> i """ async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() - effect_row: int = await async_db_conn.update_table("baidu_tieba_comment", comment_item, "comment_id", comment_id) + effect_row: int = await async_db_conn.update_table("tieba_comment", comment_item, "comment_id", comment_id) return effect_row @@ -108,7 +108,7 @@ async def query_creator_by_user_id(user_id: str) -> Dict: """ async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() - sql: str = f"select * from 
baidu_tieba_creator where user_id = '{user_id}'" + sql: str = f"select * from tieba_creator where user_id = '{user_id}'" rows: List[Dict] = await async_db_conn.query(sql) if len(rows) > 0: return rows[0] @@ -125,7 +125,7 @@ async def add_new_creator(creator_item: Dict) -> int: """ async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() - last_row_id: int = await async_db_conn.item_to_table("baidu_tieba_creator", creator_item) + last_row_id: int = await async_db_conn.item_to_table("tieba_creator", creator_item) return last_row_id @@ -140,5 +140,5 @@ async def update_creator_by_user_id(user_id: str, creator_item: Dict) -> int: """ async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() - effect_row: int = await async_db_conn.update_table("baidu_tieba_creator", creator_item, "user_id", user_id) + effect_row: int = await async_db_conn.update_table("tieba_creator", creator_item, "user_id", user_id) return effect_row \ No newline at end of file diff --git a/tools/crawler_util.py b/tools/crawler_util.py index 026d86a..8e37881 100644 --- a/tools/crawler_util.py +++ b/tools/crawler_util.py @@ -13,6 +13,7 @@ import httpx from PIL import Image, ImageDraw from playwright.async_api import Cookie, Page +from proxy import IpInfoModel from . import utils @@ -133,3 +134,16 @@ def match_interact_info_count(count_str: str) -> int: return int(number) else: return 0 + + +def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]: + """format proxy info for playwright and httpx""" + playwright_proxy = { + "server": f"{ip_proxy_info.protocol}{ip_proxy_info.ip}:{ip_proxy_info.port}", + "username": ip_proxy_info.user, + "password": ip_proxy_info.password, + } + httpx_proxy = { + f"{ip_proxy_info.protocol}": f"http://{ip_proxy_info.user}:{ip_proxy_info.password}@{ip_proxy_info.ip}:{ip_proxy_info.port}" + } + return playwright_proxy, httpx_proxy \ No newline at end of file