From a87094f2fdab54d138faf4c4c1ff2b1fc506c17a Mon Sep 17 00:00:00 2001
From: Relakkes
Date: Mon, 5 Aug 2024 18:51:51 +0800
Subject: [PATCH 1/8] =?UTF-8?q?feat:=20=E7=99=BE=E5=BA=A6=E8=B4=B4?=
 =?UTF-8?q?=E5=90=A7=E6=9E=B6=E5=AD=90=20&=20=E7=99=BB=E5=BD=95done?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 cmd_arg/arg.py                   |   4 +-
 config/base_config.py            |   6 +
 main.py                          |   7 +-
 media_platform/tieba/__init__.py |   2 +
 media_platform/tieba/client.py   | 169 ++++++++++++++++++++
 media_platform/tieba/core.py     | 265 +++++++++++++++++++++++++++++++
 media_platform/tieba/field.py    |  18 +++
 media_platform/tieba/login.py    | 112 +++++++++++++
 store/tieba/__init__.py          |  91 +++++++++++
 store/tieba/tieba_store_impl.py  | 244 ++++++++++++++++++++++++++++
 store/tieba/tieba_store_sql.py   | 144 +++++++++++++++++
 11 files changed, 1058 insertions(+), 4 deletions(-)
 create mode 100644 media_platform/tieba/__init__.py
 create mode 100644 media_platform/tieba/client.py
 create mode 100644 media_platform/tieba/core.py
 create mode 100644 media_platform/tieba/field.py
 create mode 100644 media_platform/tieba/login.py
 create mode 100644 store/tieba/__init__.py
 create mode 100644 store/tieba/tieba_store_impl.py
 create mode 100644 store/tieba/tieba_store_sql.py

diff --git a/cmd_arg/arg.py b/cmd_arg/arg.py
index 27854f7..65819a1 100644
--- a/cmd_arg/arg.py
+++ b/cmd_arg/arg.py
@@ -7,8 +7,8 @@ from tools.utils import str2bool
 async def parse_cmd():
     # 读取command arg
     parser = argparse.ArgumentParser(description='Media crawler program.')
-    parser.add_argument('--platform', type=str, help='Media platform select (xhs | dy | ks | bili | wb)',
-                        choices=["xhs", "dy", "ks", "bili", "wb"], default=config.PLATFORM)
+    parser.add_argument('--platform', type=str, help='Media platform select (xhs | dy | ks | bili | wb | tieba)',
+                        choices=["xhs", "dy", "ks", "bili", "wb", "tieba"], default=config.PLATFORM)
     parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)',
                         choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE)
     parser.add_argument('--type', type=str, help='crawler type (search | detail | creator)',
diff --git a/config/base_config.py b/config/base_config.py
index 076003a..08dd421 100644
--- a/config/base_config.py
+++ b/config/base_config.py
@@ -88,6 +88,12 @@ WEIBO_SPECIFIED_ID_LIST = [
     # ........................
 ]
 
+# IDs of the tieba posts to crawl when the crawler type is "detail"
+TIEBA_SPECIFIED_ID_LIST = [
+
+]
+
+
 # 指定小红书创作者ID列表
 XHS_CREATOR_ID_LIST = [
     "63e36c9a000000002703502b",
diff --git a/main.py b/main.py
index 27d84ad..e051b5e 100644
--- a/main.py
+++ b/main.py
@@ -8,6 +8,7 @@ from base.base_crawler import AbstractCrawler
 from media_platform.bilibili import BilibiliCrawler
 from media_platform.douyin import DouYinCrawler
 from media_platform.kuaishou import KuaishouCrawler
+from media_platform.tieba import TieBaCrawler
 from media_platform.weibo import WeiboCrawler
 from media_platform.xhs import XiaoHongShuCrawler
 
@@ -18,7 +19,8 @@ class CrawlerFactory:
         "dy": DouYinCrawler,
         "ks": KuaishouCrawler,
         "bili": BilibiliCrawler,
-        "wb": WeiboCrawler
+        "wb": WeiboCrawler,
+        "tieba": TieBaCrawler
     }
 
     @staticmethod
@@ -28,6 +30,7 @@ class CrawlerFactory:
             raise ValueError("Invalid Media Platform Currently only supported xhs or dy or ks or bili ...")
         return crawler_class()
 
+
 async def main():
     # parse cmd
     await cmd_arg.parse_cmd()
@@ -38,7 +41,7 @@ async def main():
 
     crawler = CrawlerFactory.create_crawler(platform=config.PLATFORM)
     await crawler.start()
-    
+
     if config.SAVE_DATA_OPTION == "db":
         await db.close()
 
diff --git a/media_platform/tieba/__init__.py b/media_platform/tieba/__init__.py
new file mode 100644
index 0000000..e7e2a44
--- /dev/null
+++ b/media_platform/tieba/__init__.py
@@ -0,0 +1,2 @@
+# -*- coding: utf-8 -*-
+from .core import TieBaCrawler
\ No newline at end of file
diff --git a/media_platform/tieba/client.py b/media_platform/tieba/client.py
new file mode 100644
index 0000000..a7ebaa1
--- /dev/null
+++ b/media_platform/tieba/client.py
@@ -0,0 +1,169 @@
+import asyncio
+import json
+import re
+from typing import Any, Callable, Dict, List, Optional, Union
+from urllib.parse import urlencode
+
+import httpx
+from playwright.async_api import BrowserContext, Page
+
+import config
+from base.base_crawler import AbstractApiClient
+from tools import utils
+
+from .field import SearchNoteType, SearchSortType
+
+
+class BaiduTieBaClient(AbstractApiClient):
+    def __init__(
+            self,
+            timeout=10,
+            proxies=None,
+            *,
+            headers: Dict[str, str],
+            playwright_page: Page,
+            cookie_dict: Dict[str, str],
+    ):
+        self.proxies = proxies
+        self.timeout = timeout
+        self.headers = headers
+        self.playwright_page = playwright_page
+        self.cookie_dict = cookie_dict
+        self._host = "https://tieba.baidu.com"
+
+    async def request(self, method, url, **kwargs) -> Union[str, Any]:
+        """
+        Send one HTTP request through a short-lived httpx.AsyncClient.
+        Args:
+            method: HTTP method name
+            url: absolute URL to request
+            **kwargs: forwarded to httpx; ``return_response=True`` selects the raw text body
+
+        Returns:
+            response.text when return_response is truthy, otherwise response.json()
+        """
+        # return_response is a private flag, so strip it before kwargs reach httpx
+        return_response = kwargs.pop('return_response', False)
+
+        async with httpx.AsyncClient(proxies=self.proxies) as client:
+            response = await client.request(
+                method, url, timeout=self.timeout,
+                **kwargs
+            )
+
+        if return_response:
+            return response.text
+
+        return response.json()
+
+    async def get(self, uri: str, params=None) -> Dict:
+        """
+        Send a GET request to the tieba host with the client's headers.
+        Args:
+            uri: path relative to the host, e.g. "/mo/q/sync"
+            params: optional dict that is url-encoded into the query string
+
+        Returns:
+            whatever ``request`` returns (parsed JSON by default)
+        """
+        final_uri = uri
+        if isinstance(params, dict):
+            final_uri = (f"{uri}?"
+                         f"{urlencode(params)}")
+        return await self.request(method="GET", url=f"{self._host}{final_uri}", headers=self.headers)
+
+    async def post(self, uri: str, data: dict) -> Dict:
+        """
+        Send a POST request whose body is the compact JSON encoding of ``data``.
+        Args:
+            uri: path relative to the host
+            data: request body, serialized with json.dumps
+
+        Returns:
+            whatever ``request`` returns (parsed JSON by default)
+        """
+        json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False)
+        return await self.request(method="POST", url=f"{self._host}{uri}",
+                                  data=json_str, headers=self.headers)
+
+    async def pong(self) -> bool:
+        """
+        Check whether the current cookies still represent a logged-in session.
+        Returns:
+            True when /mo/q/sync answers with no == 0; False otherwise or on any error
+        """
+        utils.logger.info("[BaiduTieBaClient.pong] Begin to pong tieba...")
+        try:
+            uri = "/mo/q/sync"
+            res: Dict = await self.get(uri)
+            if res and res.get("no") == 0:
+                ping_flag = True
+            else:
+                utils.logger.info(f"[BaiduTieBaClient.pong] user not login, will try to login again...")
+                ping_flag = False
+        except Exception as e:
+            utils.logger.error(f"[BaiduTieBaClient.pong] Ping tieba failed: {e}, and try to login again...")
+            ping_flag = False
+        return ping_flag
+
+    async def update_cookies(self, browser_context: BrowserContext):
+        """
+        Refresh the Cookie header and cookie_dict from the browser context (called after login).
+        Args:
+            browser_context: playwright browser context that holds the session cookies
+
+        Returns:
+            None
+        """
+        cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies())
+        self.headers["Cookie"] = cookie_str
+        self.cookie_dict = cookie_dict
+
+    async def get_note_by_keyword(
+            self, keyword: str,
+            page: int = 1,
+            page_size: int = 10,
+            sort: SearchSortType = SearchSortType.TIME_DESC,
+            note_type: SearchNoteType = SearchNoteType.FIXED_THREAD
+    ) -> Dict:
+        """
+        Search tieba posts by keyword (not implemented yet).
+        Args:
+            keyword: search keyword
+            page: page number
+            page_size: number of results per page
+            sort: result ordering
+            note_type: post type (main threads only | threads and replies mixed)
+
+        Returns:
+            currently always an empty dict
+        """
+        # todo impl it
+        return {}
+
+    async def get_note_by_id(self, note_id: str) -> Dict:
+        """
+        Fetch the detail of one post by its ID (not implemented yet).
+        Args:
+            note_id: post ID
+
+        Returns:
+            currently always an empty dict
+        """
+        # todo impl it
+        return {}
+
+    async def get_note_all_comments(self, note_id: str, crawl_interval: float = 1.0,
+                                    callback: Optional[Callable] = None) -> List[Dict]:
+        """
+        Fetch all first-level comments of one post (not implemented yet).
+        Args:
+            note_id: post ID
+            crawl_interval: delay between two crawl rounds, in seconds
+            callback: hook meant to run after each crawl round (unused until implemented)
+
+        Returns:
+            currently always an empty list
+        """
+        # todo impl it
+        return []
\ No newline at end of file
diff --git a/media_platform/tieba/core.py b/media_platform/tieba/core.py
new file mode 100644
index 0000000..c7a99d5
--- /dev/null
+++ b/media_platform/tieba/core.py
@@ -0,0 +1,265 @@
+import asyncio
+import os
+import random
+from asyncio import Task
+from typing import Dict, List, Optional, Tuple
+
+from playwright.async_api import (BrowserContext, BrowserType, Page,
+                                  async_playwright)
+
+import config
+from base.base_crawler import AbstractCrawler
+from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
+from store import tieba as tieba_store
+from tools import utils
+from var import crawler_type_var
+
+from .client import BaiduTieBaClient
+from .field import SearchNoteType, SearchSortType
+from .login import BaiduTieBaLogin
+
+
+class TieBaCrawler(AbstractCrawler):
+    context_page: Page
+    tieba_client: BaiduTieBaClient
+    browser_context: BrowserContext
+
+    def __init__(self) -> None:
+        self.index_url = "https://tieba.baidu.com"
+        self.user_agent = utils.get_user_agent()
+
+    async def start(self) -> None:
+        playwright_proxy_format, httpx_proxy_format = None, None
+        if config.ENABLE_IP_PROXY:
+            ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
+            ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
+            playwright_proxy_format, httpx_proxy_format = self.format_proxy_info(ip_proxy_info)
+
+        async with async_playwright() as playwright:
+            # Launch a browser context.
+            chromium = playwright.chromium
+            self.browser_context = await self.launch_browser(
+                chromium,
+                None,  # NOTE(review): playwright_proxy_format is computed above but not passed - confirm intent
+                self.user_agent,
+                headless=config.HEADLESS
+            )
+            # stealth.min.js is a js script to prevent the website from detecting the crawler.
+            await self.browser_context.add_init_script(path="libs/stealth.min.js")
+            self.context_page = await self.browser_context.new_page()
+            await self.context_page.goto(self.index_url)
+
+            # Create a client to interact with the baidutieba website.
+            self.tieba_client = await self.create_tieba_client(httpx_proxy_format)
+            if not await self.tieba_client.pong():
+                login_obj = BaiduTieBaLogin(
+                    login_type=config.LOGIN_TYPE,
+                    login_phone="",  # input your phone number
+                    browser_context=self.browser_context,
+                    context_page=self.context_page,
+                    cookie_str=config.COOKIES
+                )
+                await login_obj.begin()
+                await self.tieba_client.update_cookies(browser_context=self.browser_context)
+
+            crawler_type_var.set(config.CRAWLER_TYPE)
+            if config.CRAWLER_TYPE == "search":
+                # Search for notes and retrieve their comment information.
+                await self.search()
+            elif config.CRAWLER_TYPE == "detail":
+                # Get the information and comments of the specified post
+                await self.get_specified_notes()
+            else:
+                pass
+
+            utils.logger.info("[BaiduTieBaCrawler.start] Tieba Crawler finished ...")
+
+    async def search(self) -> None:
+        """Search for notes and retrieve their comment information."""
+        utils.logger.info("[BaiduTieBaCrawler.search] Begin search baidutieba keywords")
+        tieba_limit_count = 10  # tieba limit page fixed value
+        if config.CRAWLER_MAX_NOTES_COUNT < tieba_limit_count:
+            config.CRAWLER_MAX_NOTES_COUNT = tieba_limit_count
+        start_page = config.START_PAGE
+        for keyword in config.KEYWORDS.split(","):
+            utils.logger.info(f"[BaiduTieBaCrawler.search] Current search keyword: {keyword}")
+            page = 1
+            while (page - start_page + 1) * tieba_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
+                if page < start_page:
+                    utils.logger.info(f"[BaiduTieBaCrawler.search] Skip page {page}")
+                    page += 1
+                    continue
+                try:
+                    utils.logger.info(f"[BaiduTieBaCrawler.search] search tieba keyword: {keyword}, page: {page}")
+                    note_id_list: List[str] = []
+                    notes_res = await self.tieba_client.get_note_by_keyword(
+                        keyword=keyword,
+                        page=page,
+                        page_size=tieba_limit_count,
+                        sort=SearchSortType.TIME_DESC,
+                        note_type=SearchNoteType.FIXED_THREAD
+                    )
+                    utils.logger.info(f"[BaiduTieBaCrawler.search] Search notes res:{notes_res}")
+                    if not notes_res or not notes_res.get('has_more', False):
+                        utils.logger.info("No more content!")
+                        break
+                    semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
+                    task_list = [
+                        self.get_note_detail(
+                            note_id=post_item.get("id"),
+                            semaphore=semaphore
+                        )
+                        for post_item in notes_res.get("items", [])
+                        if post_item.get('model_type') not in ('rec_query', 'hot_query')
+                    ]
+                    note_details = await asyncio.gather(*task_list)
+                    for note_detail in note_details:
+                        if note_detail:
+                            await tieba_store.update_tieba_note(note_detail)
+                            note_id_list.append(note_detail.get("note_id"))
+                    page += 1
+                    utils.logger.info(f"[BaiduTieBaCrawler.search] Note details: {note_details}")
+                    await self.batch_get_note_comments(note_id_list)
+                except Exception as ex:
+                    utils.logger.error(f"[BaiduTieBaCrawler.search] Get note detail error, err: {ex}")
+                    break
+
+    async def fetch_creator_notes_detail(self, note_list: List[Dict]):
+        """
+        Concurrently obtain the specified post list and save the data
+        """
+        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
+        task_list = [
+            self.get_note_detail(
+                note_id=post_item.get("note_id"),
+                semaphore=semaphore
+            )
+            for post_item in note_list
+        ]
+
+        note_details = await asyncio.gather(*task_list)
+        for note_detail in note_details:
+            if note_detail:
+                await tieba_store.update_tieba_note(note_detail)
+
+    async def get_specified_notes(self):
+        """Get the information and comments of the specified post"""
+        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
+        task_list = [
+            self.get_note_detail(note_id=note_id, semaphore=semaphore) for note_id in config.TIEBA_SPECIFIED_ID_LIST
+        ]
+        note_details = await asyncio.gather(*task_list)
+        for note_detail in note_details:
+            if note_detail is not None:
+                await tieba_store.update_tieba_note(note_detail)
+        await self.batch_get_note_comments(config.TIEBA_SPECIFIED_ID_LIST)
+
+    async def get_note_detail(self, note_id: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
+        """Get note detail; returns None (after logging) when the lookup fails or is empty"""
+        async with semaphore:
+            try:
+                note_detail: Dict = await self.tieba_client.get_note_by_id(note_id)
+                if not note_detail:
+                    utils.logger.error(
+                        f"[BaiduTieBaCrawler.get_note_detail] Get note detail error, note_id: {note_id}")
+                    return None
+                return note_detail
+            except KeyError as ex:
+                utils.logger.error(
+                    f"[BaiduTieBaCrawler.get_note_detail] have not fund note detail note_id:{note_id}, err: {ex}")
+                return None
+            except Exception as ex:
+                utils.logger.error(f"[BaiduTieBaCrawler.get_note_detail] Get note detail error: {ex}")
+                return None
+
+    async def batch_get_note_comments(self, note_list: List[str]):
+        """Batch get note comments"""
+        if not config.ENABLE_GET_COMMENTS:
+            utils.logger.info(f"[BaiduTieBaCrawler.batch_get_note_comments] Crawling comment mode is not enabled")
+            return
+
+        utils.logger.info(
+            f"[BaiduTieBaCrawler.batch_get_note_comments] Begin batch get note comments, note list: {note_list}")
+        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
+        task_list: List[Task] = []
+        for note_id in note_list:
+            task = asyncio.create_task(self.get_comments(note_id, semaphore), name=note_id)
+            task_list.append(task)
+        await asyncio.gather(*task_list)
+
+    async def get_comments(self, note_id: str, semaphore: asyncio.Semaphore):
+        """Fetch all comments of one note and hand them to the store callback"""
+        async with semaphore:
+            utils.logger.info(f"[BaiduTieBaCrawler.get_comments] Begin get note id comments {note_id}")
+            await self.tieba_client.get_note_all_comments(
+                note_id=note_id,
+                crawl_interval=random.random(),
+                callback=tieba_store.batch_update_tieba_note_comments
+            )
+
+    @staticmethod
+    def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]:
+        """format proxy info for playwright and httpx"""
+        playwright_proxy = {
+            "server": f"{ip_proxy_info.protocol}{ip_proxy_info.ip}:{ip_proxy_info.port}",
+            "username": ip_proxy_info.user,
+            "password": ip_proxy_info.password,
+        }
+        httpx_proxy = {
+            f"{ip_proxy_info.protocol}": f"http://{ip_proxy_info.user}:{ip_proxy_info.password}@{ip_proxy_info.ip}:{ip_proxy_info.port}"
+        }
+        return playwright_proxy, httpx_proxy
+
+    async def create_tieba_client(self, httpx_proxy: Optional[Dict]) -> BaiduTieBaClient:
+        """Create tieba client"""
+        utils.logger.info("[BaiduTieBaCrawler.create_tieba_client] Begin create baidutieba API client ...")
+        cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
+        tieba_client_obj = BaiduTieBaClient(
+            proxies=httpx_proxy,
+            headers={
+                "User-Agent": self.user_agent,
+                "Cookie": cookie_str,
+                "Origin": self.index_url,
+                "Referer": self.index_url,
+                "Content-Type": "application/json;charset=UTF-8"
+            },
+            playwright_page=self.context_page,
+            cookie_dict=cookie_dict,
+        )
+        return tieba_client_obj
+
+    async def launch_browser(
+            self,
+            chromium: BrowserType,
+            playwright_proxy: Optional[Dict],
+            user_agent: Optional[str],
+            headless: bool = True
+    ) -> BrowserContext:
+        """Launch browser and create browser context"""
+        utils.logger.info("[BaiduTieBaCrawler.launch_browser] Begin create browser context ...")
+        if config.SAVE_LOGIN_STATE:
+            # feat issue #14
+            # we will save login state to avoid login every time
+            user_data_dir = os.path.join(os.getcwd(), "browser_data",
+                                         config.USER_DATA_DIR % config.PLATFORM)  # type: ignore
+            browser_context = await chromium.launch_persistent_context(
+                user_data_dir=user_data_dir,
+                accept_downloads=True,
+                headless=headless,
+                proxy=playwright_proxy,  # type: ignore
+                viewport={"width": 1920, "height": 1080},
+                user_agent=user_agent
+            )
+            return browser_context
+        else:
+            browser = await chromium.launch(headless=headless, proxy=playwright_proxy)  # type: ignore
+            browser_context = await browser.new_context(
+                viewport={"width": 1920, "height": 1080},
+                user_agent=user_agent
+            )
+            return browser_context
+
+    async def close(self):
+        """Close browser context"""
+        await self.browser_context.close()
+        utils.logger.info("[BaiduTieBaCrawler.close] Browser context closed ...")
diff --git a/media_platform/tieba/field.py b/media_platform/tieba/field.py
new file mode 100644
index 0000000..824fe88
--- /dev/null
+++ b/media_platform/tieba/field.py
@@ -0,0 +1,18 @@
+from enum import Enum
+
+
+class SearchSortType(Enum):
+    """search sort type"""
+    # newest first
+    TIME_DESC = "1"
+    # oldest first
+    TIME_ASC = "0"
+    # ordered by relevance
+    RELEVANCE_ORDER = "2"
+
+
+class SearchNoteType(Enum):
+    # main threads only
+    MAIN_THREAD = "1"
+    # mixed mode (threads + replies)
+    FIXED_THREAD = "0"
diff --git a/media_platform/tieba/login.py b/media_platform/tieba/login.py
new file mode 100644
index 0000000..8c1eb15
--- /dev/null
+++ b/media_platform/tieba/login.py
@@ -0,0 +1,112 @@
+import asyncio
+import functools
+import sys
+from typing import Optional
+
+from playwright.async_api import BrowserContext, Page
+from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
+                      wait_fixed)
+
+import config
+from base.base_crawler import AbstractLogin
+from tools import utils
+
+
+class BaiduTieBaLogin(AbstractLogin):
+
+    def __init__(self,
+                 login_type: str,
+                 browser_context: BrowserContext,
+                 context_page: Page,
+                 login_phone: Optional[str] = "",
+                 cookie_str: str = ""
+                 ):
+        config.LOGIN_TYPE = login_type
+        self.browser_context = browser_context
+        self.context_page = context_page
+        self.login_phone = login_phone
+        self.cookie_str = cookie_str
+
+    @retry(stop=stop_after_attempt(600), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
+    async def check_login_state(self) -> bool:
+        """
+        Poll the browser cookies for a login token; tenacity retries while this returns False.
+
+        Returns:
+            True once the STOKEN or PTOKEN cookie is present, otherwise False
+        """
+        current_cookie = await self.browser_context.cookies()
+        _, cookie_dict = utils.convert_cookies(current_cookie)
+        stoken = cookie_dict.get("STOKEN")
+        ptoken = cookie_dict.get("PTOKEN")
+        if stoken or ptoken:
+            return True
+        return False
+
+    async def begin(self):
+        """Start login baidutieba"""
+        utils.logger.info("[BaiduTieBaLogin.begin] Begin login baidutieba ...")
+        if config.LOGIN_TYPE == "qrcode":
+            await self.login_by_qrcode()
+        elif config.LOGIN_TYPE == "phone":
+            await self.login_by_mobile()
+        elif config.LOGIN_TYPE == "cookie":
+            await self.login_by_cookies()
+        else:
+            raise ValueError("[BaiduTieBaLogin.begin]Invalid Login Type Currently only supported qrcode or phone or cookies ...")
+
+    async def login_by_mobile(self):
+        """Login baidutieba by mobile (placeholder: currently does nothing)"""
+        pass
+
+    async def login_by_qrcode(self):
+        """login baidutieba website and keep webdriver login state"""
+        utils.logger.info("[BaiduTieBaLogin.login_by_qrcode] Begin login baidutieba by qrcode ...")
+        qrcode_img_selector = "xpath=//img[@class='tang-pass-qrcode-img']"
+        # find login qrcode
+        base64_qrcode_img = await utils.find_login_qrcode(
+            self.context_page,
+            selector=qrcode_img_selector
+        )
+        if not base64_qrcode_img:
+            utils.logger.info("[BaiduTieBaLogin.login_by_qrcode] login failed , have not found qrcode please check ....")
+            # if this website does not automatically popup login dialog box, we will manual click login button
+            await asyncio.sleep(0.5)
+            login_button_ele = self.context_page.locator("xpath=//li[@class='u_login']")
+            await login_button_ele.click()
+            base64_qrcode_img = await utils.find_login_qrcode(
+                self.context_page,
+                selector=qrcode_img_selector
+            )
+            if not base64_qrcode_img:
+                utils.logger.info("[BaiduTieBaLogin.login_by_qrcode] login failed , have not found qrcode please check ....")
+                sys.exit()
+
+        # show login qrcode
+        # fix issue #12
+        # we need to use partial function to call show_qrcode function and run in executor
+        # then current asyncio event loop will not be blocked
+        partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
+        asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)
+
+        utils.logger.info(f"[BaiduTieBaLogin.login_by_qrcode] waiting for scan code login, remaining time is 600s")
+        try:
+            await self.check_login_state()
+        except RetryError:
+            utils.logger.info("[BaiduTieBaLogin.login_by_qrcode] Login baidutieba failed by qrcode login method ...")
+            sys.exit()
+
+        wait_redirect_seconds = 5
+        utils.logger.info(f"[BaiduTieBaLogin.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
+        await asyncio.sleep(wait_redirect_seconds)
+
+    async def login_by_cookies(self):
+        """login baidutieba website by cookies"""
+        utils.logger.info("[BaiduTieBaLogin.login_by_cookies] Begin login baidutieba by cookie ...")
+        for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
+            await self.browser_context.add_cookies([{
+                'name': key,
+                'value': value,
+                'domain': ".baidu.com",
+                'path': "/"
+            }])
diff --git a/store/tieba/__init__.py b/store/tieba/__init__.py
new file mode 100644
index 0000000..9605d58
--- /dev/null
+++ b/store/tieba/__init__.py
@@ -0,0 +1,91 @@
+# -*- coding: utf-8 -*-
+from typing import List
+
+from . import tieba_store_impl
+from .tieba_store_impl import *
+
+
+class TieBaStoreFactory:
+    STORES = {
+        "csv": TieBaCsvStoreImplement,
+        "db": TieBaDbStoreImplement,
+        "json": TieBaJsonStoreImplement
+    }
+
+    @staticmethod
+    def create_store() -> AbstractStore:
+        store_class = TieBaStoreFactory.STORES.get(config.SAVE_DATA_OPTION)
+        if not store_class:
+            raise ValueError(
+                "[TieBaStoreFactory.create_store] Invalid save option only supported csv or db or json ...")
+        return store_class()
+
+
+async def update_tieba_note(note_item: Dict):
+    note_id = note_item.get("note_id")
+    user_info = note_item.get("user", {})
+    interact_info = note_item.get("interact_info", {})
+    tag_list: List[Dict] = note_item.get("tag_list", [])
+
+    local_db_item = {
+        "note_id": note_id,
+        "type": note_item.get("type"),
+        "title": note_item.get("title") or note_item.get("desc", "")[:255],
+        "desc": note_item.get("desc", ""),
+        "time": note_item.get("time"),
+        "last_update_time": note_item.get("last_update_time", 0),
+        "user_id": user_info.get("user_id"),
+        "nickname": user_info.get("nickname"),
+        "avatar": user_info.get("avatar"),
+        "liked_count": interact_info.get("liked_count"),
+        "collected_count": interact_info.get("collected_count"),
+        "comment_count": interact_info.get("comment_count"),
+        "share_count": interact_info.get("share_count"),
+        "ip_location": note_item.get("ip_location", ""),
+
+        "tag_list": ','.join([tag.get('name', '') for tag in tag_list if tag.get('type') == 'topic']),
+        "last_modify_ts": utils.get_current_timestamp(),
+        # todo: add note_url
+        "note_url": ""
+    }
+    utils.logger.info(f"[store.tieba.update_tieba_note] tieba note: {local_db_item}")
+    await TieBaStoreFactory.create_store().store_content(local_db_item)
+
+
+async def batch_update_tieba_note_comments(note_id: str, comments: List[Dict]):
+    if not comments:
+        return
+    for comment_item in comments:
+        await update_tieba_note_comment(note_id, comment_item)
+
+
+async def update_tieba_note_comment(note_id: str, comment_item: Dict):
+    """
+    Flatten one raw comment into a storage row and persist it via the configured store.
+    Args:
+        note_id: ID of the post the comment belongs to
+        comment_item: raw comment dict
+
+    Returns:
+        None
+    """
+    user_info = comment_item.get("user_info", {})
+    comment_id = comment_item.get("id")
+    comment_pictures = [item.get("url_default", "") for item in comment_item.get("pictures", [])]
+    target_comment = comment_item.get("target_comment", {})
+    local_db_item = {
+        "comment_id": comment_id,
+        "create_time": comment_item.get("create_time"),
+        "ip_location": comment_item.get("ip_location"),
+        "note_id": note_id,
+        "content": comment_item.get("content"),
+        "user_id": user_info.get("user_id"),
+        "nickname": user_info.get("nickname"),
+        "avatar": user_info.get("image"),
+        "sub_comment_count": comment_item.get("sub_comment_count", 0),
+        "pictures": ",".join(comment_pictures),
+        "parent_comment_id": target_comment.get("id", 0),
+        "last_modify_ts": utils.get_current_timestamp(),
+    }
+    utils.logger.info(f"[store.tieba.update_tieba_note_comment] tieba note comment:{local_db_item}")
+    await TieBaStoreFactory.create_store().store_comment(local_db_item)
diff --git a/store/tieba/tieba_store_impl.py b/store/tieba/tieba_store_impl.py
new file mode 100644
index 0000000..fe0ccbc
--- /dev/null
+++ b/store/tieba/tieba_store_impl.py
@@ -0,0 +1,244 @@
+# -*- coding: utf-8 -*-
+import asyncio
+import csv
+import json
+import os
+import pathlib
+from typing import Dict, Tuple
+
+import aiofiles
+
+import config
+from base.base_crawler import AbstractStore
+from tools import utils, words
+from var import crawler_type_var
+
+
+def calculate_number_of_files(file_store_path: str) -> int:
+    """Return the numeric file-name prefix for this run, so each run writes to new files.
+    Args:
+        file_store_path: directory that holds previously saved data files
+    Returns:
+        1 + the largest numeric "<n>_" prefix found, or 1 if the directory is missing or has none
+    """
+    if not os.path.exists(file_store_path):
+        return 1
+    prefixes = (file_name.split("_")[0] for file_name in os.listdir(file_store_path))
+    # Skip entries without a numeric prefix (e.g. the json/ and words/ sub-directories)
+    # instead of letting a single one reset the counter to 1.
+    return max((int(prefix) for prefix in prefixes if prefix.isdecimal()), default=0) + 1
+
+
+class TieBaCsvStoreImplement(AbstractStore):
+    csv_store_path: str = "data/tieba"
+    file_count:int=calculate_number_of_files(csv_store_path)
+
+    def make_save_file_name(self, store_type: str) -> str:
+        """
+        make save file name by store type
+        Args:
+            store_type: contents or comments
+
+        Returns: eg: data/tieba/1_search_comments_20240114.csv ...
+
+        """
+        return f"{self.csv_store_path}/{self.file_count}_{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.csv"
+
+    async def save_data_to_csv(self, save_item: Dict, store_type: str):
+        """
+        Append one row to the CSV file for store_type, writing the header first if the file is empty.
+        Args:
+            save_item: save content dict info
+            store_type: Save type contains content and comments(contents | comments)
+
+        Returns: no returns
+
+        """
+        pathlib.Path(self.csv_store_path).mkdir(parents=True, exist_ok=True)
+        save_file_name = self.make_save_file_name(store_type=store_type)
+        async with aiofiles.open(save_file_name, mode='a+', encoding="utf-8-sig", newline="") as f:
+            f.fileno()  # NOTE(review): result is unused - looks like a leftover, confirm before removing
+            writer = csv.writer(f)  # writerow() returns what f.write() returns, hence the awaits below
+            if await f.tell() == 0:
+                await writer.writerow(save_item.keys())
+            await writer.writerow(save_item.values())
+
+    async def store_content(self, content_item: Dict):
+        """
+        Tieba content CSV storage implementation
+        Args:
+            content_item: note item dict
+
+        Returns:
+            None
+        """
+        await self.save_data_to_csv(save_item=content_item, store_type="contents")
+
+    async def store_comment(self, comment_item: Dict):
+        """
+        Tieba comment CSV storage implementation
+        Args:
+            comment_item: comment item dict
+
+        Returns:
+            None
+        """
+        await self.save_data_to_csv(save_item=comment_item, store_type="comments")
+
+    async def store_creator(self, creator: Dict):
+        """
+        Tieba creator CSV storage implementation
+        Args:
+            creator: creator dict
+
+        Returns:
+            None
+        """
+        await self.save_data_to_csv(save_item=creator, store_type="creator")
+
+
+class TieBaDbStoreImplement(AbstractStore):
+    async def store_content(self, content_item: Dict):
+        """
+        Tieba content DB storage implementation (insert when new, update when the note_id exists)
+        Args:
+            content_item: content item dict
+
+        Returns:
+            None
+        """
+        from .tieba_store_sql import (add_new_content,
+                                      query_content_by_content_id,
+                                      update_content_by_content_id)
+        note_id = content_item.get("note_id")
+        note_detail: Dict = await query_content_by_content_id(content_id=note_id)
+        if not note_detail:
+            content_item["add_ts"] = utils.get_current_timestamp()
+            await add_new_content(content_item)
+        else:
+            await update_content_by_content_id(note_id, content_item=content_item)
+
+    async def store_comment(self, comment_item: Dict):
+        """
+        Tieba comment DB storage implementation (insert when new, update when the comment_id exists)
+        Args:
+            comment_item: comment item dict
+
+        Returns:
+            None
+        """
+        from .tieba_store_sql import (add_new_comment,
+                                      query_comment_by_comment_id,
+                                      update_comment_by_comment_id)
+        comment_id = comment_item.get("comment_id")
+        comment_detail: Dict = await query_comment_by_comment_id(comment_id=comment_id)
+        if not comment_detail:
+            comment_item["add_ts"] = utils.get_current_timestamp()
+            await add_new_comment(comment_item)
+        else:
+            await update_comment_by_comment_id(comment_id, comment_item=comment_item)
+
+    async def store_creator(self, creator: Dict):
+        """
+        Tieba creator DB storage implementation (insert when new, update when the user_id exists)
+        Args:
+            creator: creator dict
+
+        Returns:
+            None
+        """
+        from .tieba_store_sql import (add_new_creator,
+                                      query_creator_by_user_id,
+                                      update_creator_by_user_id)
+        user_id = creator.get("user_id")
+        user_detail: Dict = await query_creator_by_user_id(user_id)
+        if not user_detail:
+            creator["add_ts"] = utils.get_current_timestamp()
+            await add_new_creator(creator)
+        else:
+            await update_creator_by_user_id(user_id, creator)
+
+
+class TieBaJsonStoreImplement(AbstractStore):
+    json_store_path: str = "data/tieba/json"
+    words_store_path: str = "data/tieba/words"
+    lock = asyncio.Lock()
+    file_count:int=calculate_number_of_files(json_store_path)
+    WordCloud = words.AsyncWordCloudGenerator()
+
+    def make_save_file_name(self, store_type: str) -> Tuple[str, str]:
+        """
+        make save file name by store type
+        Args:
+            store_type: Save type contains content and comments(contents | comments)
+
+        Returns:
+            (path of the json data file, path prefix for the word-cloud output files)
+        """
+        return (
+            f"{self.json_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json",
+            f"{self.words_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}"
+        )
+
+    async def save_data_to_json(self, save_item: Dict, store_type: str):
+        """
+        Append save_item to the JSON array file for store_type (read, append, rewrite under a lock).
+        Args:
+            save_item: save content dict info
+            store_type: Save type contains content and comments(contents | comments)
+
+        Returns:
+            None
+        """
+        pathlib.Path(self.json_store_path).mkdir(parents=True, exist_ok=True)
+        pathlib.Path(self.words_store_path).mkdir(parents=True, exist_ok=True)
+        save_file_name,words_file_name_prefix = self.make_save_file_name(store_type=store_type)
+        save_data = []
+
+        async with self.lock:
+            if os.path.exists(save_file_name):
+                async with aiofiles.open(save_file_name, 'r', encoding='utf-8') as file:
+                    save_data = json.loads(await file.read())
+
+            save_data.append(save_item)
+            async with aiofiles.open(save_file_name, 'w', encoding='utf-8') as file:
+                await file.write(json.dumps(save_data, ensure_ascii=False))
+
+            if config.ENABLE_GET_COMMENTS and config.ENABLE_GET_WORDCLOUD:
+                try:
+                    await self.WordCloud.generate_word_frequency_and_cloud(save_data, words_file_name_prefix)
+                except Exception as e:  # word cloud is best-effort: log it, never fail the save
+                    utils.logger.error(f"[TieBaJsonStoreImplement.save_data_to_json] word cloud failed: {e}")
+
+    async def store_content(self, content_item: Dict):
+        """
+        content JSON storage implementation
+        Args:
+            content_item: content item dict
+
+        Returns:
+            None
+        """
+        await self.save_data_to_json(content_item, "contents")
+
+    async def store_comment(self, comment_item: Dict):
+        """
+        comment JSON storage implementation
+        Args:
+            comment_item: comment item dict
+
+        Returns:
+            None
+        """
+        await self.save_data_to_json(comment_item, "comments")
+
+    async def store_creator(self, creator: Dict):
+        """
+        Tieba creator JSON storage implementation
+        Args:
+            creator: creator dict
+
+        Returns:
+            None
+        """
+        await self.save_data_to_json(creator, "creator")
diff --git a/store/tieba/tieba_store_sql.py
b/store/tieba/tieba_store_sql.py new file mode 100644 index 0000000..9ec03a4 --- /dev/null +++ b/store/tieba/tieba_store_sql.py @@ -0,0 +1,144 @@ +# -*- coding: utf-8 -*- +from typing import Dict, List + +from db import AsyncMysqlDB +from var import media_crawler_db_var + + +async def query_content_by_content_id(content_id: str) -> Dict: + """ + 查询一条内容记录(xhs的帖子 | 抖音的视频 | 微博 | 快手视频 ...) + Args: + content_id: + + Returns: + + """ + async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() + sql: str = f"select * from baidu_tieba where note_id = '{content_id}'" + rows: List[Dict] = await async_db_conn.query(sql) + if len(rows) > 0: + return rows[0] + return dict() + + +async def add_new_content(content_item: Dict) -> int: + """ + 新增一条内容记录(xhs的帖子 | 抖音的视频 | 微博 | 快手视频 ...) + Args: + content_item: + + Returns: + + """ + async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() + last_row_id: int = await async_db_conn.item_to_table("baidu_tieba", content_item) + return last_row_id + + +async def update_content_by_content_id(content_id: str, content_item: Dict) -> int: + """ + 更新一条记录(xhs的帖子 | 抖音的视频 | 微博 | 快手视频 ...) 
+ Args: + content_id: + content_item: + + Returns: + + """ + async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() + effect_row: int = await async_db_conn.update_table("baidu_tieba", content_item, "note_id", content_id) + return effect_row + + + +async def query_comment_by_comment_id(comment_id: str) -> Dict: + """ + 查询一条评论内容 + Args: + comment_id: + + Returns: + + """ + async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() + sql: str = f"select * from baidu_tieba_comment where comment_id = '{comment_id}'" + rows: List[Dict] = await async_db_conn.query(sql) + if len(rows) > 0: + return rows[0] + return dict() + + +async def add_new_comment(comment_item: Dict) -> int: + """ + 新增一条评论记录 + Args: + comment_item: + + Returns: + + """ + async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() + last_row_id: int = await async_db_conn.item_to_table("baidu_tieba_comment", comment_item) + return last_row_id + + +async def update_comment_by_comment_id(comment_id: str, comment_item: Dict) -> int: + """ + 更新增一条评论记录 + Args: + comment_id: + comment_item: + + Returns: + + """ + async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() + effect_row: int = await async_db_conn.update_table("baidu_tieba_comment", comment_item, "comment_id", comment_id) + return effect_row + + +async def query_creator_by_user_id(user_id: str) -> Dict: + """ + 查询一条创作者记录 + Args: + user_id: + + Returns: + + """ + async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() + sql: str = f"select * from baidu_tieba_creator where user_id = '{user_id}'" + rows: List[Dict] = await async_db_conn.query(sql) + if len(rows) > 0: + return rows[0] + return dict() + + +async def add_new_creator(creator_item: Dict) -> int: + """ + 新增一条创作者信息 + Args: + creator_item: + + Returns: + + """ + async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() + last_row_id: int = await async_db_conn.item_to_table("baidu_tieba_creator", creator_item) + return last_row_id + + +async def update_creator_by_user_id(user_id: str, 
creator_item: Dict) -> int: + """ + 更新一条创作者信息 + Args: + user_id: + creator_item: + + Returns: + + """ + async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() + effect_row: int = await async_db_conn.update_table("baidu_tieba_creator", creator_item, "user_id", user_id) + return effect_row \ No newline at end of file From d347cf5a2c171573be40c63e3226c34010b1de3c Mon Sep 17 00:00:00 2001 From: Relakkes Date: Tue, 6 Aug 2024 03:37:55 +0800 Subject: [PATCH 2/8] =?UTF-8?q?feat:=20=E5=B8=96=E5=AD=90=E6=90=9C?= =?UTF-8?q?=E7=B4=A2=20&=20=E7=A7=BB=E9=99=A4=E7=99=BB=E5=BD=95=E4=BB=A3?= =?UTF-8?q?=E7=A0=81=E4=BD=BF=E7=94=A8IP=E4=BB=A3=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- media_platform/tieba/client.py | 104 ++-- media_platform/tieba/core.py | 129 ++--- media_platform/tieba/help.py | 69 +++ .../tieba/test_data/search_keyword_notes.html | 96 ++++ schema/tables.sql | 536 ++++++++++-------- store/tieba/__init__.py | 23 +- store/tieba/tieba_store_sql.py | 18 +- tools/crawler_util.py | 14 + 8 files changed, 600 insertions(+), 389 deletions(-) create mode 100644 media_platform/tieba/help.py create mode 100644 media_platform/tieba/test_data/search_keyword_notes.html diff --git a/media_platform/tieba/client.py b/media_platform/tieba/client.py index a7ebaa1..a02e243 100644 --- a/media_platform/tieba/client.py +++ b/media_platform/tieba/client.py @@ -1,67 +1,77 @@ -import asyncio import json -import re +import random from typing import Any, Callable, Dict, List, Optional, Union from urllib.parse import urlencode import httpx -from playwright.async_api import BrowserContext, Page +from playwright.async_api import BrowserContext +from tenacity import (RetryError, retry, stop_after_attempt, + wait_fixed) -import config from base.base_crawler import AbstractApiClient +from proxy.proxy_ip_pool import ProxyIpPool from tools import utils from .field import SearchNoteType, SearchSortType +from .help import TieBaExtractor 
class BaiduTieBaClient(AbstractApiClient): def __init__( self, timeout=10, - proxies=None, - *, - headers: Dict[str, str], - playwright_page: Page, - cookie_dict: Dict[str, str], + ip_pool=None, + default_ip_proxy=None, ): - self.proxies = proxies + self.ip_pool: Optional[ProxyIpPool] = ip_pool self.timeout = timeout - self.headers = headers - self.playwright_page = playwright_page - self.cookie_dict = cookie_dict + self.headers = utils.get_user_agent() self._host = "https://tieba.baidu.com" + self._page_extractor = TieBaExtractor() + self.default_ip_proxy = default_ip_proxy - async def request(self, method, url, **kwargs) -> Union[str, Any]: + @retry(stop=stop_after_attempt(3), wait=wait_fixed(1)) + async def request(self, method, url, return_ori_content=False, proxies=None, **kwargs) -> Union[str, Any]: """ 封装httpx的公共请求方法,对请求响应做一些处理 Args: method: 请求方法 url: 请求的URL + return_ori_content: 是否返回原始内容 + proxies: 代理IP **kwargs: 其他请求参数,例如请求头、请求体等 Returns: """ - # return response.text - return_response = kwargs.pop('return_response', False) - - async with httpx.AsyncClient(proxies=self.proxies) as client: + actual_proxies = proxies if proxies else self.default_ip_proxy + async with httpx.AsyncClient(proxies=actual_proxies) as client: response = await client.request( method, url, timeout=self.timeout, **kwargs ) - if return_response: + if response.status_code != 200: + utils.logger.error(f"Request failed, method: {method}, url: {url}, status code: {response.status_code}") + utils.logger.error(f"Request failed, response: {response.text}") + raise Exception(f"Request failed, method: {method}, url: {url}, status code: {response.status_code}") + + if response.text == "" or response.text == "blocked": + utils.logger.error(f"request params incrr, response.text: {response.text}") + raise Exception("account blocked") + + if return_ori_content: return response.text return response.json() - async def get(self, uri: str, params=None) -> Dict: + async def get(self, uri: str, 
params=None, return_ori_content=False, **kwargs) -> Any: """ GET请求,对请求头签名 Args: uri: 请求路由 params: 请求参数 + return_ori_content: 是否返回原始内容 Returns: @@ -70,9 +80,25 @@ class BaiduTieBaClient(AbstractApiClient): if isinstance(params, dict): final_uri = (f"{uri}?" f"{urlencode(params)}") - return await self.request(method="GET", url=f"{self._host}{final_uri}", headers=self.headers) + try: + res = await self.request(method="GET", url=f"{self._host}{final_uri}", + return_ori_content=return_ori_content, + **kwargs) + return res + except RetryError as e: + if self.ip_pool: + proxie_model = await self.ip_pool.get_proxy() + _, proxies = utils.format_proxy_info(proxie_model) + res = await self.request(method="GET", url=f"{self._host}{final_uri}", + return_ori_content=return_ori_content, + proxies=proxies, + **kwargs) + self.default_ip_proxy = proxies + return res - async def post(self, uri: str, data: dict) -> Dict: + utils.logger.error(f"[BaiduTieBaClient.get] 达到了最大重试次数,请尝试更换新的IP代理: {e}") + + async def post(self, uri: str, data: dict, **kwargs) -> Dict: """ POST请求,对请求头签名 Args: @@ -84,7 +110,7 @@ class BaiduTieBaClient(AbstractApiClient): """ json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False) return await self.request(method="POST", url=f"{self._host}{uri}", - data=json_str, headers=self.headers) + data=json_str, **kwargs) async def pong(self) -> bool: """ @@ -96,6 +122,7 @@ class BaiduTieBaClient(AbstractApiClient): try: uri = "/mo/q/sync" res: Dict = await self.get(uri) + utils.logger.info(f"[BaiduTieBaClient.pong] res: {res}") if res and res.get("no") == 0: ping_flag = True else: @@ -115,31 +142,42 @@ class BaiduTieBaClient(AbstractApiClient): Returns: """ - cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies()) - self.headers["Cookie"] = cookie_str - self.cookie_dict = cookie_dict + pass - async def get_note_by_keyword( + async def get_notes_by_keyword( self, keyword: str, page: int = 1, page_size: int = 10, sort: SearchSortType = 
SearchSortType.TIME_DESC, - note_type: SearchNoteType = SearchNoteType.FIXED_THREAD - ) -> Dict: + note_type: SearchNoteType = SearchNoteType.FIXED_THREAD, + random_sleep: bool = True + ) -> List[Dict]: """ 根据关键词搜索贴吧帖子 Args: keyword: 关键词 page: 分页第几页 - page_size: 每页肠病毒 + page_size: 每页大小 sort: 结果排序方式 note_type: 帖子类型(主题贴|主题+回复混合模式) + random_sleep: 是否随机休眠 Returns: """ - # todo impl it - return {} + uri = "/f/search/res" + params = { + "isnew": 1, + "qw": keyword, + "rn": page_size, + "pn": page, + "sm": sort.value, + "only_thread": note_type.value + } + page_content = await self.get(uri, params=params, return_ori_content=True) + if random_sleep: + random.randint(1, 5) + return self._page_extractor.extract_search_note_list(page_content) async def get_note_by_id(self, note_id: str) -> Dict: """ @@ -166,4 +204,4 @@ class BaiduTieBaClient(AbstractApiClient): """ # todo impl it - return [] \ No newline at end of file + return [] diff --git a/media_platform/tieba/core.py b/media_platform/tieba/core.py index c7a99d5..91795a4 100644 --- a/media_platform/tieba/core.py +++ b/media_platform/tieba/core.py @@ -9,9 +9,10 @@ from playwright.async_api import (BrowserContext, BrowserType, Page, import config from base.base_crawler import AbstractCrawler -from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool +from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool, ProxyIpPool from store import tieba as tieba_store from tools import utils +from tools.crawler_util import format_proxy_info from var import crawler_type_var from .client import BaiduTieBaClient @@ -29,53 +30,43 @@ class TieBaCrawler(AbstractCrawler): self.user_agent = utils.get_user_agent() async def start(self) -> None: - playwright_proxy_format, httpx_proxy_format = None, None + """ + Start the crawler + Returns: + + """ + ip_proxy_pool, httpx_proxy_format = None, None if config.ENABLE_IP_PROXY: + utils.logger.info("[BaiduTieBaCrawler.start] Begin create ip proxy pool ...") ip_proxy_pool = await 
create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True) ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy() - playwright_proxy_format, httpx_proxy_format = self.format_proxy_info(ip_proxy_info) + _, httpx_proxy_format = format_proxy_info(ip_proxy_info) + utils.logger.info(f"[BaiduTieBaCrawler.start] Init default ip proxy, value: {httpx_proxy_format}") - async with async_playwright() as playwright: - # Launch a browser context. - chromium = playwright.chromium - self.browser_context = await self.launch_browser( - chromium, - None, - self.user_agent, - headless=config.HEADLESS - ) - # stealth.min.js is a js script to prevent the website from detecting the crawler. - await self.browser_context.add_init_script(path="libs/stealth.min.js") - self.context_page = await self.browser_context.new_page() - await self.context_page.goto(self.index_url) + # Create a client to interact with the baidutieba website. + self.tieba_client = BaiduTieBaClient( + ip_pool=ip_proxy_pool, + default_ip_proxy=httpx_proxy_format, + ) + crawler_type_var.set(config.CRAWLER_TYPE) + if config.CRAWLER_TYPE == "search": + # Search for notes and retrieve their comment information. + await self.search() + elif config.CRAWLER_TYPE == "detail": + # Get the information and comments of the specified post + await self.get_specified_notes() + else: + pass - # Create a client to interact with the baidutieba website. - self.tieba_client = await self.create_tieba_client(httpx_proxy_format) - if not await self.tieba_client.pong(): - login_obj = BaiduTieBaLogin( - login_type=config.LOGIN_TYPE, - login_phone="", # input your phone number - browser_context=self.browser_context, - context_page=self.context_page, - cookie_str=config.COOKIES - ) - await login_obj.begin() - await self.tieba_client.update_cookies(browser_context=self.browser_context) - - crawler_type_var.set(config.CRAWLER_TYPE) - if config.CRAWLER_TYPE == "search": - # Search for notes and retrieve their comment information. 
- await self.search() - elif config.CRAWLER_TYPE == "detail": - # Get the information and comments of the specified post - await self.get_specified_notes() - else: - pass - - utils.logger.info("[BaiduTieBaCrawler.start] Xhs Crawler finished ...") + utils.logger.info("[BaiduTieBaCrawler.start] Tieba Crawler finished ...") async def search(self) -> None: - """Search for notes and retrieve their comment information.""" + """ + Search for notes and retrieve their comment information. + Returns: + + """ + utils.logger.info("[BaiduTieBaCrawler.search] Begin search baidutieba keywords") tieba_limit_count = 10 # tieba limit page fixed value if config.CRAWLER_MAX_NOTES_COUNT < tieba_limit_count: @@ -92,36 +83,26 @@ class TieBaCrawler(AbstractCrawler): try: utils.logger.info(f"[BaiduTieBaCrawler.search] search tieba keyword: {keyword}, page: {page}") note_id_list: List[str] = [] - notes_res = await self.tieba_client.get_note_by_keyword( + notes_list_res = await self.tieba_client.get_notes_by_keyword( keyword=keyword, page=page, page_size=tieba_limit_count, sort=SearchSortType.TIME_DESC, note_type=SearchNoteType.FIXED_THREAD ) - utils.logger.info(f"[BaiduTieBaCrawler.search] Search notes res:{notes_res}") - if not notes_res or not notes_res.get('has_more', False): - utils.logger.info("No more content!") + utils.logger.info(f"[BaiduTieBaCrawler.search] Search notes res:{notes_list_res}") + if not notes_list_res: break - semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) - task_list = [ - self.get_note_detail( - note_id=post_item.get("id"), - semaphore=semaphore - ) - for post_item in notes_res.get("items", {}) - if post_item.get('model_type') not in ('rec_query', 'hot_query') - ] - note_details = await asyncio.gather(*task_list) - for note_detail in note_details: + + for note_detail in notes_list_res: if note_detail: await tieba_store.update_tieba_note(note_detail) note_id_list.append(note_detail.get("note_id")) page += 1 - utils.logger.info(f"[BaiduTieBaCrawler.search] 
Note details: {note_details}") + utils.logger.info(f"[BaiduTieBaCrawler.search] Note details: {notes_list_res}") await self.batch_get_note_comments(note_id_list) except Exception as ex: - utils.logger.error(f"[BaiduTieBaCrawler.search] Get note detail error, err: {ex}") + utils.logger.error(f"[BaiduTieBaCrawler.search] Search note list error, err: {ex}") break async def fetch_creator_notes_detail(self, note_list: List[Dict]): @@ -197,34 +178,20 @@ class TieBaCrawler(AbstractCrawler): callback=tieba_store.batch_update_tieba_note_comments ) - @staticmethod - def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]: - """format proxy info for playwright and httpx""" - playwright_proxy = { - "server": f"{ip_proxy_info.protocol}{ip_proxy_info.ip}:{ip_proxy_info.port}", - "username": ip_proxy_info.user, - "password": ip_proxy_info.password, - } - httpx_proxy = { - f"{ip_proxy_info.protocol}": f"http://{ip_proxy_info.user}:{ip_proxy_info.password}@{ip_proxy_info.ip}:{ip_proxy_info.port}" - } - return playwright_proxy, httpx_proxy + async def create_tieba_client(self, ip_pool: ProxyIpPool) -> BaiduTieBaClient: + """ + Create tieba client + Args: + ip_pool: - async def create_tieba_client(self, httpx_proxy: Optional[str]) -> BaiduTieBaClient: + Returns: + + """ """Create tieba client""" utils.logger.info("[BaiduTieBaCrawler.create_tieba_client] Begin create baidutieba API client ...") cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies()) tieba_client_obj = BaiduTieBaClient( - proxies=httpx_proxy, - headers={ - "User-Agent": self.user_agent, - "Cookie": cookie_str, - "Origin": "https://www.baidutieba.com", - "Referer": "https://www.baidutieba.com", - "Content-Type": "application/json;charset=UTF-8" - }, - playwright_page=self.context_page, - cookie_dict=cookie_dict, + ip_pool=ip_pool, ) return tieba_client_obj diff --git a/media_platform/tieba/help.py b/media_platform/tieba/help.py new file mode 100644 index 
0000000..59eabdb --- /dev/null +++ b/media_platform/tieba/help.py @@ -0,0 +1,69 @@ +# -*- coding: utf-8 -*- + +from typing import List, Dict + +from parsel import Selector + + +class TieBaExtractor: + def __init__(self): + pass + + @staticmethod + def extract_search_note_list(page_content: str) -> List[Dict]: + """ + 提取贴吧帖子列表 + Args: + page_content: 页面内容的HTML字符串 + + Returns: + 包含帖子信息的字典列表 + """ + xpath_selector = "//div[@class='s_post']" + post_list = Selector(text=page_content).xpath(xpath_selector) + result = [] + for post in post_list: + post_id = post.xpath(".//span[@class='p_title']/a/@data-tid").get(default='').strip() + title = post.xpath(".//span[@class='p_title']/a/text()").get(default='').strip() + link = post.xpath(".//span[@class='p_title']/a/@href").get(default='') + description = post.xpath(".//div[@class='p_content']/text()").get(default='').strip() + forum = post.xpath(".//a[@class='p_forum']/font/text()").get(default='').strip() + forum_link = post.xpath(".//a[@class='p_forum']/@href").get(default='') + author = post.xpath(".//a[starts-with(@href, '/home/main')]/font/text()").get(default='').strip() + author_link = post.xpath(".//a[starts-with(@href, '/home/main')]/@href").get(default='') + date = post.xpath(".//font[@class='p_green p_date']/text()").get(default='').strip() + + result.append({ + "note_id": post_id, + "title": title, + "desc": description, + "note_url": link, + "time": date, + "tieba_name": forum, + "tieba_link": forum_link, + "nickname": author, + "nickname_link": author_link, + }) + + return result + + @staticmethod + def extract_tieba_note_comments(page_content: str) -> List[Dict]: + """ + 提取贴吧帖子评论 + Args: + page_content: + + Returns: + + """ + pass + + +if __name__ == '__main__': + with open("test_data/search_keyword_notes.html", "r", encoding="utf-8") as f: + content = f.read() + extractor = TieBaExtractor() + _result = extractor.extract_search_note_list(content) + print(_result) + print(f"Total: {len(_result)}") diff --git 
a/media_platform/tieba/test_data/search_keyword_notes.html b/media_platform/tieba/test_data/search_keyword_notes.html new file mode 100644 index 0000000..d15d8ce --- /dev/null +++ b/media_platform/tieba/test_data/search_keyword_notes.html @@ -0,0 +1,96 @@ +
+
武汉交互空间科技:富士康10亿加码中国大陆,印度为何逐渐“失宠 +
+ 全球知名的电子制造服务巨头富士康的母公司鸿海精密工业股份有限公司正式对外发布了一则重大投资公告,富士康将在郑州投资建设新事业总部大楼,承载新事业总部功能。这一战略举措不仅彰显了富士康对中国市场持续深化的承诺与信心,也预示着该集团业务版图的新一轮扩张与升级。 + 项目一期选址位于郑东新区,建筑面积约700公亩,总投资约10亿元人民币。主要建设总部管理中心、研发中心和工程中心、战略产业发展中心、战略产业金融平台、 +
+ 贴吧:武汉交互空间作者:VR虚拟达人 + 2024-08-05 16:45
+
请各位急用玛尼的小心,骗子最多 +
+ 这里面到处是骗子,大家小心。特别那些叫出村背货的,基本是卖园区,天下没有那么好的事。就是有这好事,我们在边境上的人,比你们最清楚,轮不到你们,边境上比你们胆子大的人大把,你一不熟悉小路,为什么叫你带货。东南亚带货的集结地,一般在南宁,防城港,昆明,西双版纳,临沧然后师机接了走小路出去,南宁,防城港坐船出去。好多都是二十几手的中介,之前卖园区一个三十万,现在不知道行情,但好多园区不收 +
+ 贴吧:背包客作者:贴吧用户_GC64AUS + 2024-08-03 07:35
+
*2025泰国冷链制冷运输展*东南亚外贸出口 +
**2025泰国曼谷国际冷库、空调制冷、仓储暨冷链运输展 *2025泰国冷链制冷运输展*东南亚外贸出口-观展游览考察 + 展出时间:2025-7月(具体时间待定) 展出地点:泰国曼谷会展中心 展会周期:一年一届 组展单位:北京励航国际商务会展有限公司 + 人员跟团观展补贴!为您节省成本,寻找适合您的市场: + 本公司为您提供观展考察机会,让您在大型展会上获得世界同行**科技的资料同时,感受异域文化气息。展会现场走展考察→→当地游览→→当地相关市 +
+ 贴吧:国际展会作者:zhaot_188 2024-07-19 15:44
+
京湘楼创始人肖鑫:创立于北京,植根长沙,百年美食传承 +
来源标题:京湘楼创始人肖鑫:创立于北京,植根长沙,百年美食传承 京湘楼(KING HERO)品牌创始人:肖鑫 + 京湘楼,KING + HERO,集酱板鸭、肥肠、鸭头、鸭脖、鸭肠、小龙虾、牛蛙、捆鸡、鸡爪、鱼嘴巴、鱼尾、鱿鱼、牛肉、猪头肉等特色食品卤制,加工、包装与生产经营。2022年3月在北京朝阳区双井开设了第一家“京湘楼·鲜卤集市”卤味熟食快餐店,2023年5月在湖南省长沙市开福区注册成立了“长沙京湘楼品牌管理有限公司”,以“京湘楼”作为品 +
+ 贴吧:京湘楼作者:天神渡尘 2024-07-17 23:43
+
广州能争取到迪士尼与环球落户吗? +
+ 不是二选一,而是全都要。上一组数据,上海迪士尼2016年开业就接待游客超过1.2亿人次,香港迪士尼2023全年游客人数才640万人次,约等于无,这么低的入园人次已经引来迪士尼方面的不悦。 + 美国有两个迪士尼,说实话迪士尼的门票并不高,普通人都去的起,中国完全有能力建两到三个迪士尼,欧洲只有第一个迪士尼,因为它的人口只有中国的一半,假设中国人一年吃一包盐,一年就是14包,那么欧洲就是七亿包盐,盐再便宜,欧洲人也不可能一人吃 +
+ 贴吧:地理作者:SeaRoutes 2024-07-13 20:17
+
#城市GDP#广州应该全力去争取迪士尼和环球影城 +
+ 不是二选一,而是全都要。上一组数据,上海迪士尼2016年开业就接待游客超过1.2亿人次,香港迪士尼2023全年游客人数才640万人次,约等于无,这么低的入园人次已经引来迪士尼方面的不悦。 + 美国有两个迪士尼,说实话迪士尼的门票并不高,普通人都去的起,中国完全有能力建两到三个迪士尼,欧洲只有第一个迪士尼,因为它的人口只有中国的一半,假设中国人一年吃一包盐,一年就是14包,那么欧洲就是七亿包盐,盐再便宜,欧洲人也不可能一人吃 +
+ 贴吧:城市gdp作者:SeaRoutes 2024-07-13 20:14
+
云南省首批《云南日报》昆明新闻头条聚焦阳宗海省级物流枢纽建设 +
+ 7月11日《云南日报》昆明新闻头条刊发文章《阳宗海风景名胜区立足“衔接西部陆海新通道与中老铁路”优势——加速28个物流枢纽设施建设》聚焦昆明阳宗海风景名胜区系统推进省级物流枢纽建设和功能提升深挖比较优势壮大物流产业据云南省发展和改革委员会在昆明召开的新闻发布会上公布,今年全省共有5地纳入云南省第一批省级物流枢纽和省级骨干冷链物流基地建设名单,其中,昆明市有两家获批,阳宗海物流枢纽上榜!一起来看近日,云南省 +
+ 贴吧:昆明作者: 2024-07-12 23:04
+
寻找弟弟,很久没跟家里联系 +
Kk四期世纪园区,寻找弟弟,外号大佐,F3 2楼,公司cj集团
+ 贴吧:东南亚作者:贴吧用户_GC2CtRa + 2024-07-11 07:53
+
拉美 非洲 东南亚 南亚等发展中国家不太可能普及八小时双休吧? +
拉美 和 东南亚的泰国 之类的连毒枭和黑色产业都管不好感觉普及八小时双休不太可能 缅甸和非洲军阀林立 + 跟军阀谈八小时双休那么不开玩笑?缅北诈骗园区就能看出来。 +
+ 贴吧:历史作者:yoursagain 2024-07-10 09:00
+
东南亚,园区【 工 价 低 】 +
+ 贴吧:园区招商作者:QQ59052966 2024-06-30 12:09
+
\ No newline at end of file diff --git a/schema/tables.sql b/schema/tables.sql index 3530189..88828b7 100644 --- a/schema/tables.sql +++ b/schema/tables.sql @@ -2,192 +2,200 @@ -- Table structure for bilibili_video -- ---------------------------- DROP TABLE IF EXISTS `bilibili_video`; -CREATE TABLE `bilibili_video` ( - `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', - `user_id` varchar(64) DEFAULT NULL COMMENT '用户ID', - `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', - `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', - `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', - `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', - `video_id` varchar(64) NOT NULL COMMENT '视频ID', - `video_type` varchar(16) NOT NULL COMMENT '视频类型', - `title` varchar(500) DEFAULT NULL COMMENT '视频标题', - `desc` longtext COMMENT '视频描述', - `create_time` bigint NOT NULL COMMENT '视频发布时间戳', - `liked_count` varchar(16) DEFAULT NULL COMMENT '视频点赞数', - `video_play_count` varchar(16) DEFAULT NULL COMMENT '视频播放数量', - `video_danmaku` varchar(16) DEFAULT NULL COMMENT '视频弹幕数量', - `video_comment` varchar(16) DEFAULT NULL COMMENT '视频评论数量', - `video_url` varchar(512) DEFAULT NULL COMMENT '视频详情URL', - `video_cover_url` varchar(512) DEFAULT NULL COMMENT '视频封面图 URL', - PRIMARY KEY (`id`), - KEY `idx_bilibili_vi_video_i_31c36e` (`video_id`), - KEY `idx_bilibili_vi_create__73e0ec` (`create_time`) +CREATE TABLE `bilibili_video` +( + `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', + `user_id` varchar(64) DEFAULT NULL COMMENT '用户ID', + `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', + `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', + `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', + `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', + `video_id` varchar(64) NOT NULL COMMENT '视频ID', + `video_type` varchar(16) NOT NULL COMMENT '视频类型', + `title` varchar(500) DEFAULT NULL COMMENT '视频标题', + `desc` longtext COMMENT '视频描述', + `create_time` bigint NOT NULL COMMENT '视频发布时间戳', + `liked_count` varchar(16) DEFAULT 
NULL COMMENT '视频点赞数', + `video_play_count` varchar(16) DEFAULT NULL COMMENT '视频播放数量', + `video_danmaku` varchar(16) DEFAULT NULL COMMENT '视频弹幕数量', + `video_comment` varchar(16) DEFAULT NULL COMMENT '视频评论数量', + `video_url` varchar(512) DEFAULT NULL COMMENT '视频详情URL', + `video_cover_url` varchar(512) DEFAULT NULL COMMENT '视频封面图 URL', + PRIMARY KEY (`id`), + KEY `idx_bilibili_vi_video_i_31c36e` (`video_id`), + KEY `idx_bilibili_vi_create__73e0ec` (`create_time`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='B站视频'; -- ---------------------------- -- Table structure for bilibili_video_comment -- ---------------------------- DROP TABLE IF EXISTS `bilibili_video_comment`; -CREATE TABLE `bilibili_video_comment` ( - `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', - `user_id` varchar(64) DEFAULT NULL COMMENT '用户ID', - `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', - `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', - `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', - `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', - `comment_id` varchar(64) NOT NULL COMMENT '评论ID', - `video_id` varchar(64) NOT NULL COMMENT '视频ID', - `content` longtext COMMENT '评论内容', - `create_time` bigint NOT NULL COMMENT '评论时间戳', - `sub_comment_count` varchar(16) NOT NULL COMMENT '评论回复数', - PRIMARY KEY (`id`), - KEY `idx_bilibili_vi_comment_41c34e` (`comment_id`), - KEY `idx_bilibili_vi_video_i_f22873` (`video_id`) +CREATE TABLE `bilibili_video_comment` +( + `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', + `user_id` varchar(64) DEFAULT NULL COMMENT '用户ID', + `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', + `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', + `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', + `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', + `comment_id` varchar(64) NOT NULL COMMENT '评论ID', + `video_id` varchar(64) NOT NULL COMMENT '视频ID', + `content` longtext COMMENT '评论内容', + `create_time` bigint NOT NULL COMMENT '评论时间戳', + 
`sub_comment_count` varchar(16) NOT NULL COMMENT '评论回复数', + PRIMARY KEY (`id`), + KEY `idx_bilibili_vi_comment_41c34e` (`comment_id`), + KEY `idx_bilibili_vi_video_i_f22873` (`video_id`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='B 站视频评论'; -- ---------------------------- -- Table structure for bilibili_up_info -- ---------------------------- DROP TABLE IF EXISTS `bilibili_up_info`; -CREATE TABLE `bilibili_up_info` ( - `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', - `user_id` varchar(64) DEFAULT NULL COMMENT '用户ID', - `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', - `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', - `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', - `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', - `total_fans` bigint DEFAULT NULL COMMENT '粉丝数', - `total_liked` bigint DEFAULT NULL COMMENT '总获赞数', - `user_rank` int DEFAULT NULL COMMENT '用户等级', - `is_official` int DEFAULT NULL COMMENT '是否官号', - PRIMARY KEY (`id`), - KEY `idx_bilibili_vi_user_123456` (`user_id`) +CREATE TABLE `bilibili_up_info` +( + `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', + `user_id` varchar(64) DEFAULT NULL COMMENT '用户ID', + `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', + `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', + `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', + `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', + `total_fans` bigint DEFAULT NULL COMMENT '粉丝数', + `total_liked` bigint DEFAULT NULL COMMENT '总获赞数', + `user_rank` int DEFAULT NULL COMMENT '用户等级', + `is_official` int DEFAULT NULL COMMENT '是否官号', + PRIMARY KEY (`id`), + KEY `idx_bilibili_vi_user_123456` (`user_id`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='B 站UP主信息'; -- ---------------------------- -- Table structure for douyin_aweme -- ---------------------------- DROP TABLE IF EXISTS `douyin_aweme`; -CREATE TABLE `douyin_aweme` ( - `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', - `user_id` varchar(64) DEFAULT NULL 
COMMENT '用户ID', - `sec_uid` varchar(128) DEFAULT NULL COMMENT '用户sec_uid', - `short_user_id` varchar(64) DEFAULT NULL COMMENT '用户短ID', - `user_unique_id` varchar(64) DEFAULT NULL COMMENT '用户唯一ID', - `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', - `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', - `user_signature` varchar(500) DEFAULT NULL COMMENT '用户签名', - `ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址', - `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', - `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', - `aweme_id` varchar(64) NOT NULL COMMENT '视频ID', - `aweme_type` varchar(16) NOT NULL COMMENT '视频类型', - `title` varchar(500) DEFAULT NULL COMMENT '视频标题', - `desc` longtext COMMENT '视频描述', - `create_time` bigint NOT NULL COMMENT '视频发布时间戳', - `liked_count` varchar(16) DEFAULT NULL COMMENT '视频点赞数', - `comment_count` varchar(16) DEFAULT NULL COMMENT '视频评论数', - `share_count` varchar(16) DEFAULT NULL COMMENT '视频分享数', - `collected_count` varchar(16) DEFAULT NULL COMMENT '视频收藏数', - `aweme_url` varchar(255) DEFAULT NULL COMMENT '视频详情页URL', - PRIMARY KEY (`id`), - KEY `idx_douyin_awem_aweme_i_6f7bc6` (`aweme_id`), - KEY `idx_douyin_awem_create__299dfe` (`create_time`) +CREATE TABLE `douyin_aweme` +( + `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', + `user_id` varchar(64) DEFAULT NULL COMMENT '用户ID', + `sec_uid` varchar(128) DEFAULT NULL COMMENT '用户sec_uid', + `short_user_id` varchar(64) DEFAULT NULL COMMENT '用户短ID', + `user_unique_id` varchar(64) DEFAULT NULL COMMENT '用户唯一ID', + `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', + `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', + `user_signature` varchar(500) DEFAULT NULL COMMENT '用户签名', + `ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址', + `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', + `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', + `aweme_id` varchar(64) NOT NULL COMMENT '视频ID', + `aweme_type` varchar(16) NOT NULL COMMENT '视频类型', + `title` varchar(500) DEFAULT NULL COMMENT 
'视频标题', + `desc` longtext COMMENT '视频描述', + `create_time` bigint NOT NULL COMMENT '视频发布时间戳', + `liked_count` varchar(16) DEFAULT NULL COMMENT '视频点赞数', + `comment_count` varchar(16) DEFAULT NULL COMMENT '视频评论数', + `share_count` varchar(16) DEFAULT NULL COMMENT '视频分享数', + `collected_count` varchar(16) DEFAULT NULL COMMENT '视频收藏数', + `aweme_url` varchar(255) DEFAULT NULL COMMENT '视频详情页URL', + PRIMARY KEY (`id`), + KEY `idx_douyin_awem_aweme_i_6f7bc6` (`aweme_id`), + KEY `idx_douyin_awem_create__299dfe` (`create_time`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='抖音视频'; -- ---------------------------- -- Table structure for douyin_aweme_comment -- ---------------------------- DROP TABLE IF EXISTS `douyin_aweme_comment`; -CREATE TABLE `douyin_aweme_comment` ( - `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', - `user_id` varchar(64) DEFAULT NULL COMMENT '用户ID', - `sec_uid` varchar(128) DEFAULT NULL COMMENT '用户sec_uid', - `short_user_id` varchar(64) DEFAULT NULL COMMENT '用户短ID', - `user_unique_id` varchar(64) DEFAULT NULL COMMENT '用户唯一ID', - `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', - `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', - `user_signature` varchar(500) DEFAULT NULL COMMENT '用户签名', - `ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址', - `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', - `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', - `comment_id` varchar(64) NOT NULL COMMENT '评论ID', - `aweme_id` varchar(64) NOT NULL COMMENT '视频ID', - `content` longtext COMMENT '评论内容', - `create_time` bigint NOT NULL COMMENT '评论时间戳', - `sub_comment_count` varchar(16) NOT NULL COMMENT '评论回复数', - PRIMARY KEY (`id`), - KEY `idx_douyin_awem_comment_fcd7e4` (`comment_id`), - KEY `idx_douyin_awem_aweme_i_c50049` (`aweme_id`) +CREATE TABLE `douyin_aweme_comment` +( + `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', + `user_id` varchar(64) DEFAULT NULL COMMENT '用户ID', + `sec_uid` varchar(128) DEFAULT NULL COMMENT '用户sec_uid', + 
`short_user_id` varchar(64) DEFAULT NULL COMMENT '用户短ID', + `user_unique_id` varchar(64) DEFAULT NULL COMMENT '用户唯一ID', + `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', + `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', + `user_signature` varchar(500) DEFAULT NULL COMMENT '用户签名', + `ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址', + `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', + `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', + `comment_id` varchar(64) NOT NULL COMMENT '评论ID', + `aweme_id` varchar(64) NOT NULL COMMENT '视频ID', + `content` longtext COMMENT '评论内容', + `create_time` bigint NOT NULL COMMENT '评论时间戳', + `sub_comment_count` varchar(16) NOT NULL COMMENT '评论回复数', + PRIMARY KEY (`id`), + KEY `idx_douyin_awem_comment_fcd7e4` (`comment_id`), + KEY `idx_douyin_awem_aweme_i_c50049` (`aweme_id`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='抖音视频评论'; -- ---------------------------- -- Table structure for dy_creator -- ---------------------------- DROP TABLE IF EXISTS `dy_creator`; -CREATE TABLE `dy_creator` ( - `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', - `user_id` varchar(128) NOT NULL COMMENT '用户ID', - `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', - `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', - `ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址', - `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', - `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', - `desc` longtext COMMENT '用户描述', - `gender` varchar(1) DEFAULT NULL COMMENT '性别', - `follows` varchar(16) DEFAULT NULL COMMENT '关注数', - `fans` varchar(16) DEFAULT NULL COMMENT '粉丝数', - `interaction` varchar(16) DEFAULT NULL COMMENT '获赞数', - `videos_count` varchar(16) DEFAULT NULL COMMENT '作品数', - PRIMARY KEY (`id`) +CREATE TABLE `dy_creator` +( + `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', + `user_id` varchar(128) NOT NULL COMMENT '用户ID', + `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', + `avatar` varchar(255) DEFAULT NULL COMMENT 
'用户头像地址', + `ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址', + `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', + `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', + `desc` longtext COMMENT '用户描述', + `gender` varchar(1) DEFAULT NULL COMMENT '性别', + `follows` varchar(16) DEFAULT NULL COMMENT '关注数', + `fans` varchar(16) DEFAULT NULL COMMENT '粉丝数', + `interaction` varchar(16) DEFAULT NULL COMMENT '获赞数', + `videos_count` varchar(16) DEFAULT NULL COMMENT '作品数', + PRIMARY KEY (`id`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='抖音博主信息'; -- ---------------------------- -- Table structure for kuaishou_video -- ---------------------------- DROP TABLE IF EXISTS `kuaishou_video`; -CREATE TABLE `kuaishou_video` ( - `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', - `user_id` varchar(64) DEFAULT NULL COMMENT '用户ID', - `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', - `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', - `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', - `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', - `video_id` varchar(64) NOT NULL COMMENT '视频ID', - `video_type` varchar(16) NOT NULL COMMENT '视频类型', - `title` varchar(500) DEFAULT NULL COMMENT '视频标题', - `desc` longtext COMMENT '视频描述', - `create_time` bigint NOT NULL COMMENT '视频发布时间戳', - `liked_count` varchar(16) DEFAULT NULL COMMENT '视频点赞数', - `viewd_count` varchar(16) DEFAULT NULL COMMENT '视频浏览数量', - `video_url` varchar(512) DEFAULT NULL COMMENT '视频详情URL', - `video_cover_url` varchar(512) DEFAULT NULL COMMENT '视频封面图 URL', - `video_play_url` varchar(512) DEFAULT NULL COMMENT '视频播放 URL', - PRIMARY KEY (`id`), - KEY `idx_kuaishou_vi_video_i_c5c6a6` (`video_id`), - KEY `idx_kuaishou_vi_create__a10dee` (`create_time`) +CREATE TABLE `kuaishou_video` +( + `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', + `user_id` varchar(64) DEFAULT NULL COMMENT '用户ID', + `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', + `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', + 
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳', + `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', + `video_id` varchar(64) NOT NULL COMMENT '视频ID', + `video_type` varchar(16) NOT NULL COMMENT '视频类型', + `title` varchar(500) DEFAULT NULL COMMENT '视频标题', + `desc` longtext COMMENT '视频描述', + `create_time` bigint NOT NULL COMMENT '视频发布时间戳', + `liked_count` varchar(16) DEFAULT NULL COMMENT '视频点赞数', + `viewd_count` varchar(16) DEFAULT NULL COMMENT '视频浏览数量', + `video_url` varchar(512) DEFAULT NULL COMMENT '视频详情URL', + `video_cover_url` varchar(512) DEFAULT NULL COMMENT '视频封面图 URL', + `video_play_url` varchar(512) DEFAULT NULL COMMENT '视频播放 URL', + PRIMARY KEY (`id`), + KEY `idx_kuaishou_vi_video_i_c5c6a6` (`video_id`), + KEY `idx_kuaishou_vi_create__a10dee` (`create_time`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='快手视频'; -- ---------------------------- -- Table structure for kuaishou_video_comment -- ---------------------------- DROP TABLE IF EXISTS `kuaishou_video_comment`; -CREATE TABLE `kuaishou_video_comment` ( - `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', - `user_id` varchar(64) DEFAULT NULL COMMENT '用户ID', - `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', - `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', - `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', - `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', - `comment_id` varchar(64) NOT NULL COMMENT '评论ID', - `video_id` varchar(64) NOT NULL COMMENT '视频ID', - `content` longtext COMMENT '评论内容', - `create_time` bigint NOT NULL COMMENT '评论时间戳', - `sub_comment_count` varchar(16) NOT NULL COMMENT '评论回复数', - PRIMARY KEY (`id`), - KEY `idx_kuaishou_vi_comment_ed48fa` (`comment_id`), - KEY `idx_kuaishou_vi_video_i_e50914` (`video_id`) +CREATE TABLE `kuaishou_video_comment` +( + `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', + `user_id` varchar(64) DEFAULT NULL COMMENT '用户ID', + `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', + `avatar` varchar(255) DEFAULT NULL COMMENT 
'用户头像地址', + `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', + `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', + `comment_id` varchar(64) NOT NULL COMMENT '评论ID', + `video_id` varchar(64) NOT NULL COMMENT '视频ID', + `content` longtext COMMENT '评论内容', + `create_time` bigint NOT NULL COMMENT '评论时间戳', + `sub_comment_count` varchar(16) NOT NULL COMMENT '评论回复数', + PRIMARY KEY (`id`), + KEY `idx_kuaishou_vi_comment_ed48fa` (`comment_id`), + KEY `idx_kuaishou_vi_video_i_e50914` (`video_id`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='快手视频评论'; @@ -195,145 +203,175 @@ CREATE TABLE `kuaishou_video_comment` ( -- Table structure for weibo_note -- ---------------------------- DROP TABLE IF EXISTS `weibo_note`; -CREATE TABLE `weibo_note` ( - `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', - `user_id` varchar(64) DEFAULT NULL COMMENT '用户ID', - `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', - `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', - `gender` varchar(12) DEFAULT NULL COMMENT '用户性别', - `profile_url` varchar(255) DEFAULT NULL COMMENT '用户主页地址', - `ip_location` varchar(32) DEFAULT '发布微博的地理信息', - `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', - `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', - `note_id` varchar(64) NOT NULL COMMENT '帖子ID', - `content` longtext COMMENT '帖子正文内容', - `create_time` bigint NOT NULL COMMENT '帖子发布时间戳', - `create_date_time` varchar(32) NOT NULL COMMENT '帖子发布日期时间', - `liked_count` varchar(16) DEFAULT NULL COMMENT '帖子点赞数', - `comments_count` varchar(16) DEFAULT NULL COMMENT '帖子评论数量', - `shared_count` varchar(16) DEFAULT NULL COMMENT '帖子转发数量', - `note_url` varchar(512) DEFAULT NULL COMMENT '帖子详情URL', - PRIMARY KEY (`id`), - KEY `idx_weibo_note_note_id_f95b1a` (`note_id`), - KEY `idx_weibo_note_create__692709` (`create_time`), - KEY `idx_weibo_note_create__d05ed2` (`create_date_time`) +CREATE TABLE `weibo_note` +( + `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', + `user_id` varchar(64) DEFAULT NULL COMMENT 
'用户ID', + `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', + `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', + `gender` varchar(12) DEFAULT NULL COMMENT '用户性别', + `profile_url` varchar(255) DEFAULT NULL COMMENT '用户主页地址', + `ip_location` varchar(32) DEFAULT '发布微博的地理信息', + `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', + `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', + `note_id` varchar(64) NOT NULL COMMENT '帖子ID', + `content` longtext COMMENT '帖子正文内容', + `create_time` bigint NOT NULL COMMENT '帖子发布时间戳', + `create_date_time` varchar(32) NOT NULL COMMENT '帖子发布日期时间', + `liked_count` varchar(16) DEFAULT NULL COMMENT '帖子点赞数', + `comments_count` varchar(16) DEFAULT NULL COMMENT '帖子评论数量', + `shared_count` varchar(16) DEFAULT NULL COMMENT '帖子转发数量', + `note_url` varchar(512) DEFAULT NULL COMMENT '帖子详情URL', + PRIMARY KEY (`id`), + KEY `idx_weibo_note_note_id_f95b1a` (`note_id`), + KEY `idx_weibo_note_create__692709` (`create_time`), + KEY `idx_weibo_note_create__d05ed2` (`create_date_time`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='微博帖子'; -- ---------------------------- -- Table structure for weibo_note_comment -- ---------------------------- DROP TABLE IF EXISTS `weibo_note_comment`; -CREATE TABLE `weibo_note_comment` ( - `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', - `user_id` varchar(64) DEFAULT NULL COMMENT '用户ID', - `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', - `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', - `gender` varchar(12) DEFAULT NULL COMMENT '用户性别', - `profile_url` varchar(255) DEFAULT NULL COMMENT '用户主页地址', - `ip_location` varchar(32) DEFAULT '发布微博的地理信息', - `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', - `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', - `comment_id` varchar(64) NOT NULL COMMENT '评论ID', - `note_id` varchar(64) NOT NULL COMMENT '帖子ID', - `content` longtext COMMENT '评论内容', - `create_time` bigint NOT NULL COMMENT '评论时间戳', - `create_date_time` varchar(32) NOT NULL COMMENT '评论日期时间', - 
`comment_like_count` varchar(16) NOT NULL COMMENT '评论点赞数量', - `sub_comment_count` varchar(16) NOT NULL COMMENT '评论回复数', - PRIMARY KEY (`id`), - KEY `idx_weibo_note__comment_c7611c` (`comment_id`), - KEY `idx_weibo_note__note_id_24f108` (`note_id`), - KEY `idx_weibo_note__create__667fe3` (`create_date_time`) +CREATE TABLE `weibo_note_comment` +( + `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', + `user_id` varchar(64) DEFAULT NULL COMMENT '用户ID', + `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', + `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', + `gender` varchar(12) DEFAULT NULL COMMENT '用户性别', + `profile_url` varchar(255) DEFAULT NULL COMMENT '用户主页地址', + `ip_location` varchar(32) DEFAULT '发布微博的地理信息', + `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', + `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', + `comment_id` varchar(64) NOT NULL COMMENT '评论ID', + `note_id` varchar(64) NOT NULL COMMENT '帖子ID', + `content` longtext COMMENT '评论内容', + `create_time` bigint NOT NULL COMMENT '评论时间戳', + `create_date_time` varchar(32) NOT NULL COMMENT '评论日期时间', + `comment_like_count` varchar(16) NOT NULL COMMENT '评论点赞数量', + `sub_comment_count` varchar(16) NOT NULL COMMENT '评论回复数', + PRIMARY KEY (`id`), + KEY `idx_weibo_note__comment_c7611c` (`comment_id`), + KEY `idx_weibo_note__note_id_24f108` (`note_id`), + KEY `idx_weibo_note__create__667fe3` (`create_date_time`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='微博帖子评论'; -- ---------------------------- -- Table structure for xhs_creator -- ---------------------------- DROP TABLE IF EXISTS `xhs_creator`; -CREATE TABLE `xhs_creator` ( - `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', - `user_id` varchar(64) NOT NULL COMMENT '用户ID', - `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', - `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', - `ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址', - `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', - `last_modify_ts` bigint NOT NULL COMMENT 
'记录最后修改时间戳', - `desc` longtext COMMENT '用户描述', - `gender` varchar(1) DEFAULT NULL COMMENT '性别', - `follows` varchar(16) DEFAULT NULL COMMENT '关注数', - `fans` varchar(16) DEFAULT NULL COMMENT '粉丝数', - `interaction` varchar(16) DEFAULT NULL COMMENT '获赞和收藏数', - `tag_list` longtext COMMENT '标签列表', - PRIMARY KEY (`id`) +CREATE TABLE `xhs_creator` +( + `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', + `user_id` varchar(64) NOT NULL COMMENT '用户ID', + `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', + `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', + `ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址', + `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', + `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', + `desc` longtext COMMENT '用户描述', + `gender` varchar(1) DEFAULT NULL COMMENT '性别', + `follows` varchar(16) DEFAULT NULL COMMENT '关注数', + `fans` varchar(16) DEFAULT NULL COMMENT '粉丝数', + `interaction` varchar(16) DEFAULT NULL COMMENT '获赞和收藏数', + `tag_list` longtext COMMENT '标签列表', + PRIMARY KEY (`id`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='小红书博主'; -- ---------------------------- -- Table structure for xhs_note -- ---------------------------- DROP TABLE IF EXISTS `xhs_note`; -CREATE TABLE `xhs_note` ( - `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', - `user_id` varchar(64) NOT NULL COMMENT '用户ID', - `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', - `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', - `ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址', - `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', - `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', - `note_id` varchar(64) NOT NULL COMMENT '笔记ID', - `type` varchar(16) DEFAULT NULL COMMENT '笔记类型(normal | video)', - `title` varchar(255) DEFAULT NULL COMMENT '笔记标题', - `desc` longtext COMMENT '笔记描述', - `video_url` longtext COMMENT '视频地址', - `time` bigint NOT NULL COMMENT '笔记发布时间戳', - `last_update_time` bigint NOT NULL COMMENT '笔记最后更新时间戳', - `liked_count` 
varchar(16) DEFAULT NULL COMMENT '笔记点赞数', - `collected_count` varchar(16) DEFAULT NULL COMMENT '笔记收藏数', - `comment_count` varchar(16) DEFAULT NULL COMMENT '笔记评论数', - `share_count` varchar(16) DEFAULT NULL COMMENT '笔记分享数', - `image_list` longtext COMMENT '笔记封面图片列表', - `tag_list` longtext COMMENT '标签列表', - `note_url` varchar(255) DEFAULT NULL COMMENT '笔记详情页的URL', - PRIMARY KEY (`id`), - KEY `idx_xhs_note_note_id_209457` (`note_id`), - KEY `idx_xhs_note_time_eaa910` (`time`) +CREATE TABLE `xhs_note` +( + `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', + `user_id` varchar(64) NOT NULL COMMENT '用户ID', + `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', + `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', + `ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址', + `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', + `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', + `note_id` varchar(64) NOT NULL COMMENT '笔记ID', + `type` varchar(16) DEFAULT NULL COMMENT '笔记类型(normal | video)', + `title` varchar(255) DEFAULT NULL COMMENT '笔记标题', + `desc` longtext COMMENT '笔记描述', + `video_url` longtext COMMENT '视频地址', + `time` bigint NOT NULL COMMENT '笔记发布时间戳', + `last_update_time` bigint NOT NULL COMMENT '笔记最后更新时间戳', + `liked_count` varchar(16) DEFAULT NULL COMMENT '笔记点赞数', + `collected_count` varchar(16) DEFAULT NULL COMMENT '笔记收藏数', + `comment_count` varchar(16) DEFAULT NULL COMMENT '笔记评论数', + `share_count` varchar(16) DEFAULT NULL COMMENT '笔记分享数', + `image_list` longtext COMMENT '笔记封面图片列表', + `tag_list` longtext COMMENT '标签列表', + `note_url` varchar(255) DEFAULT NULL COMMENT '笔记详情页的URL', + PRIMARY KEY (`id`), + KEY `idx_xhs_note_note_id_209457` (`note_id`), + KEY `idx_xhs_note_time_eaa910` (`time`) ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='小红书笔记'; -- ---------------------------- -- Table structure for xhs_note_comment -- ---------------------------- DROP TABLE IF EXISTS `xhs_note_comment`; -CREATE TABLE `xhs_note_comment` ( - 
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', - `user_id` varchar(64) NOT NULL COMMENT '用户ID', - `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', - `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', - `ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址', - `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', - `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', - `comment_id` varchar(64) NOT NULL COMMENT '评论ID', - `create_time` bigint NOT NULL COMMENT '评论时间戳', - `note_id` varchar(64) NOT NULL COMMENT '笔记ID', - `content` longtext NOT NULL COMMENT '评论内容', - `sub_comment_count` int NOT NULL COMMENT '子评论数量', - `pictures` varchar(512) DEFAULT NULL, - PRIMARY KEY (`id`), - KEY `idx_xhs_note_co_comment_8e8349` (`comment_id`), - KEY `idx_xhs_note_co_create__204f8d` (`create_time`) +CREATE TABLE `xhs_note_comment` +( + `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', + `user_id` varchar(64) NOT NULL COMMENT '用户ID', + `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', + `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', + `ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址', + `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', + `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', + `comment_id` varchar(64) NOT NULL COMMENT '评论ID', + `create_time` bigint NOT NULL COMMENT '评论时间戳', + `note_id` varchar(64) NOT NULL COMMENT '笔记ID', + `content` longtext NOT NULL COMMENT '评论内容', + `sub_comment_count` int NOT NULL COMMENT '子评论数量', + `pictures` varchar(512) DEFAULT NULL, + PRIMARY KEY (`id`), + KEY `idx_xhs_note_co_comment_8e8349` (`comment_id`), + KEY `idx_xhs_note_co_create__204f8d` (`create_time`) ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='小红书笔记评论'; -- ---------------------------- -- alter table xhs_note_comment to support parent_comment_id -- ---------------------------- ALTER TABLE `xhs_note_comment` -ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID'; + ADD COLUMN `parent_comment_id` VARCHAR(64) 
DEFAULT NULL COMMENT '父评论ID'; ALTER TABLE `douyin_aweme_comment` -ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID'; + ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID'; ALTER TABLE `bilibili_video_comment` -ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID'; + ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID'; ALTER TABLE `weibo_note_comment` -ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID'; + ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID'; -SET FOREIGN_KEY_CHECKS = 1; +SET +FOREIGN_KEY_CHECKS = 1; + + +DROP TABLE IF EXISTS `tieba_note`; +CREATE TABLE `tieba_note` +( + `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', + `note_id` varchar(64) NOT NULL COMMENT '帖子ID', + `title` varchar(255) DEFAULT NULL COMMENT '笔记标题', + `desc` longtext COMMENT '笔记描述', + `time` varchar NOT NULL COMMENT '笔记发布时间', + `note_url` varchar(255) DEFAULT NULL COMMENT '笔记详情页的URL', + `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', + `nickname_link` varchar(255) DEFAULT NULL COMMENT '用户主页地址', + `tieba_name` varchar(255) DEFAULT NULL COMMENT '贴吧名称', + `tieba_link` varchar(255) DEFAULT NULL COMMENT '贴吧链接地址', + `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', + `ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址', + `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', + `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', + `comment_count` varchar(16) DEFAULT NULL COMMENT '笔记评论数', + PRIMARY KEY (`id`), + KEY `idx_tieba_note_id` (`note_id`), + KEY `idx_tieba_note_time` (`time`) +) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='贴吧帖子表'; diff --git a/store/tieba/__init__.py b/store/tieba/__init__.py index 9605d58..9e47fa4 100644 --- a/store/tieba/__init__.py +++ b/store/tieba/__init__.py @@ -22,31 +22,20 @@ class TieBaStoreFactory: async def update_tieba_note(note_item: Dict): + tieba_url = "https://tieba.baidu.com" 
note_id = note_item.get("note_id") - user_info = note_item.get("user", {}) - interact_info = note_item.get("interact_info", {}) - tag_list: List[Dict] = note_item.get("tag_list", []) - local_db_item = { "note_id": note_id, - "type": note_item.get("type"), "title": note_item.get("title") or note_item.get("desc", "")[:255], "desc": note_item.get("desc", ""), + "note_url": tieba_url + note_item.get("note_url"), "time": note_item.get("time"), - "last_update_time": note_item.get("last_update_time", 0), - "user_id": user_info.get("user_id"), - "nickname": user_info.get("nickname"), - "avatar": user_info.get("avatar"), - "liked_count": interact_info.get("liked_count"), - "collected_count": interact_info.get("collected_count"), - "comment_count": interact_info.get("comment_count"), - "share_count": interact_info.get("share_count"), + "tieba_name": note_item.get("tieba_name"), + "tieba_link": tieba_url + note_item.get("tieba_link", ""), + "nickname": note_item.get("nickname"), + "nickname_link": tieba_url + note_item.get("nickname_link", ""), "ip_location": note_item.get("ip_location", ""), - - "tag_list": ','.join([tag.get('name', '') for tag in tag_list if tag.get('type') == 'topic']), "last_modify_ts": utils.get_current_timestamp(), - # todo: add note_url - "note_url": "" } utils.logger.info(f"[store.tieba.update_tieba_note] tieba note: {local_db_item}") await TieBaStoreFactory.create_store().store_content(local_db_item) diff --git a/store/tieba/tieba_store_sql.py b/store/tieba/tieba_store_sql.py index 9ec03a4..f99f491 100644 --- a/store/tieba/tieba_store_sql.py +++ b/store/tieba/tieba_store_sql.py @@ -15,7 +15,7 @@ async def query_content_by_content_id(content_id: str) -> Dict: """ async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() - sql: str = f"select * from baidu_tieba where note_id = '{content_id}'" + sql: str = f"select * from tieba_note where note_id = '{content_id}'" rows: List[Dict] = await async_db_conn.query(sql) if len(rows) > 0: return rows[0] @@ -32,7 
+32,7 @@ async def add_new_content(content_item: Dict) -> int: """ async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() - last_row_id: int = await async_db_conn.item_to_table("baidu_tieba", content_item) + last_row_id: int = await async_db_conn.item_to_table("tieba_note", content_item) return last_row_id @@ -47,7 +47,7 @@ async def update_content_by_content_id(content_id: str, content_item: Dict) -> i """ async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() - effect_row: int = await async_db_conn.update_table("baidu_tieba", content_item, "note_id", content_id) + effect_row: int = await async_db_conn.update_table("tieba_note", content_item, "note_id", content_id) return effect_row @@ -62,7 +62,7 @@ async def query_comment_by_comment_id(comment_id: str) -> Dict: """ async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() - sql: str = f"select * from baidu_tieba_comment where comment_id = '{comment_id}'" + sql: str = f"select * from tieba_comment where comment_id = '{comment_id}'" rows: List[Dict] = await async_db_conn.query(sql) if len(rows) > 0: return rows[0] @@ -79,7 +79,7 @@ async def add_new_comment(comment_item: Dict) -> int: """ async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() - last_row_id: int = await async_db_conn.item_to_table("baidu_tieba_comment", comment_item) + last_row_id: int = await async_db_conn.item_to_table("tieba_comment", comment_item) return last_row_id @@ -94,7 +94,7 @@ async def update_comment_by_comment_id(comment_id: str, comment_item: Dict) -> i """ async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() - effect_row: int = await async_db_conn.update_table("baidu_tieba_comment", comment_item, "comment_id", comment_id) + effect_row: int = await async_db_conn.update_table("tieba_comment", comment_item, "comment_id", comment_id) return effect_row @@ -108,7 +108,7 @@ async def query_creator_by_user_id(user_id: str) -> Dict: """ async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() - sql: str = f"select * from 
baidu_tieba_creator where user_id = '{user_id}'" + sql: str = f"select * from tieba_creator where user_id = '{user_id}'" rows: List[Dict] = await async_db_conn.query(sql) if len(rows) > 0: return rows[0] @@ -125,7 +125,7 @@ async def add_new_creator(creator_item: Dict) -> int: """ async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() - last_row_id: int = await async_db_conn.item_to_table("baidu_tieba_creator", creator_item) + last_row_id: int = await async_db_conn.item_to_table("tieba_creator", creator_item) return last_row_id @@ -140,5 +140,5 @@ async def update_creator_by_user_id(user_id: str, creator_item: Dict) -> int: """ async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() - effect_row: int = await async_db_conn.update_table("baidu_tieba_creator", creator_item, "user_id", user_id) + effect_row: int = await async_db_conn.update_table("tieba_creator", creator_item, "user_id", user_id) return effect_row \ No newline at end of file diff --git a/tools/crawler_util.py b/tools/crawler_util.py index 026d86a..8e37881 100644 --- a/tools/crawler_util.py +++ b/tools/crawler_util.py @@ -13,6 +13,7 @@ import httpx from PIL import Image, ImageDraw from playwright.async_api import Cookie, Page +from proxy import IpInfoModel from . 
import utils @@ -133,3 +134,16 @@ def match_interact_info_count(count_str: str) -> int: return int(number) else: return 0 + + +def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]: + """format proxy info for playwright and httpx""" + playwright_proxy = { + "server": f"{ip_proxy_info.protocol}{ip_proxy_info.ip}:{ip_proxy_info.port}", + "username": ip_proxy_info.user, + "password": ip_proxy_info.password, + } + httpx_proxy = { + f"{ip_proxy_info.protocol}": f"http://{ip_proxy_info.user}:{ip_proxy_info.password}@{ip_proxy_info.ip}:{ip_proxy_info.port}" + } + return playwright_proxy, httpx_proxy \ No newline at end of file From 1b585cb215e82a6f2a0fda2de26a279627b6981f Mon Sep 17 00:00:00 2001 From: Relakkes Date: Tue, 6 Aug 2024 19:21:34 +0800 Subject: [PATCH 3/8] temp commit --- config/base_config.py | 6 +- media_platform/tieba/client.py | 48 +- media_platform/tieba/help.py | 39 +- .../test_data/note_detail_and_comments.html | 7558 +++++++++++++++++ 4 files changed, 7644 insertions(+), 7 deletions(-) create mode 100644 media_platform/tieba/test_data/note_detail_and_comments.html diff --git a/config/base_config.py b/config/base_config.py index 08dd421..2985d40 100644 --- a/config/base_config.py +++ b/config/base_config.py @@ -1,6 +1,6 @@ # 基础配置 PLATFORM = "xhs" -KEYWORDS = "编程副业,编程兼职" +KEYWORDS = "缅甸边境,缅北边境,缅北边境线,缅甸边境线" LOGIN_TYPE = "qrcode" # qrcode or phone or cookie COOKIES = "" # 具体值参见media_platform.xxx.field下的枚举值,暂时只支持小红书 @@ -28,7 +28,7 @@ HEADLESS = False SAVE_LOGIN_STATE = True # 数据保存类型选项配置,支持三种类型:csv、db、json -SAVE_DATA_OPTION = "json" # csv or db or json +SAVE_DATA_OPTION = "db" # csv or db or json # 用户浏览器缓存的浏览器文件配置 USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name @@ -37,7 +37,7 @@ USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name START_PAGE = 1 # 爬取视频/帖子的数量控制 -CRAWLER_MAX_NOTES_COUNT = 20 +CRAWLER_MAX_NOTES_COUNT = 100 # 并发爬虫数量控制 MAX_CONCURRENCY_NUM = 1 diff --git 
a/media_platform/tieba/client.py b/media_platform/tieba/client.py index a02e243..f9e0375 100644 --- a/media_platform/tieba/client.py +++ b/media_platform/tieba/client.py @@ -1,3 +1,4 @@ +import asyncio import json import random from typing import Any, Callable, Dict, List, Optional, Union @@ -188,6 +189,9 @@ class BaiduTieBaClient(AbstractApiClient): Returns: """ + uri = f"/p/{note_id}" + page_content = await self.get(uri, return_ori_content=True) + return self._page_extractor.extract_note_detail(page_content) # todo impl it return {} @@ -203,5 +207,45 @@ class BaiduTieBaClient(AbstractApiClient): Returns: """ - # todo impl it - return [] + uri = f"/p/{note_id}" + result = [] + comments_has_more = True + comments_cursor = 1 + while comments_has_more: + comments_res = await self.get(uri, params={"pn": comments_cursor}) + comments_has_more = comments_res.get("has_more", False) + comments_cursor = comments_res.get("cursor", "") + if "comments" not in comments_res: + utils.logger.info( + f"[XiaoHongShuClient.get_note_all_comments] No 'comments' key found in response: {comments_res}") + break + comments = comments_res["comments"] + if callback: + await callback(note_id, comments) + await asyncio.sleep(crawl_interval) + result.extend(comments) + sub_comments = await self.get_comments_all_sub_comments(comments, crawl_interval, callback) + result.extend(sub_comments) + return result + + async def get_comments_all_sub_comments(self, comments: List[Dict], crawl_interval: float = 1.0, + callback: Optional[Callable] = None) -> List[Dict]: + """ + 获取指定评论下的所有子评论 + Args: + comments: 评论列表 + crawl_interval: 爬取一次笔记的延迟单位(秒) + callback: 一次笔记爬取结束后 + + Returns: + + """ + result = [] + for comment in comments: + sub_comments = comment.get("comments") + if sub_comments: + if callback: + await callback(comment.get("id"), sub_comments) + await asyncio.sleep(crawl_interval) + result.extend(sub_comments) + return result diff --git a/media_platform/tieba/help.py b/media_platform/tieba/help.py 
index 59eabdb..2c1144d 100644 --- a/media_platform/tieba/help.py +++ b/media_platform/tieba/help.py @@ -32,7 +32,6 @@ class TieBaExtractor: author = post.xpath(".//a[starts-with(@href, '/home/main')]/font/text()").get(default='').strip() author_link = post.xpath(".//a[starts-with(@href, '/home/main')]/@href").get(default='') date = post.xpath(".//font[@class='p_green p_date']/text()").get(default='').strip() - result.append({ "note_id": post_id, "title": title, @@ -47,6 +46,25 @@ class TieBaExtractor: return result + @staticmethod + def extract_note_detail(page_content: str) -> Dict: + """ + 提取贴吧帖子详情 + Args: + page_content: + + Returns: + + """ + content_selector = Selector(text=page_content) + # 查看楼主的链接: only_view_author_link: / p / 9117905169?see_lz = 1 + only_view_author_link = content_selector.xpath("//*[@id='lzonly_cntn']/@href").get(default='').strip() # + note_id = only_view_author_link.split("?")[0].split("/")[-1] + title = content_selector.xpath("//*[@id='j_core_title_wrap']/h3").get(default='').strip() + desc = content_selector.xpath("//meta[@name='description']").get(default='').strip() + note_url = f"/p/{note_id}" + pass + @staticmethod def extract_tieba_note_comments(page_content: str) -> List[Dict]: """ @@ -57,7 +75,24 @@ class TieBaExtractor: Returns: """ - pass + xpath_selector = "//div[@id='j_p_postlist']/div[@class='l_post l_post_bright j_l_post clearfix']" + comment_list = Selector(text=page_content).xpath(xpath_selector) + result = [] + for comment in comment_list: + comment_id = comment.xpath(".//@data-pid").get(default='').strip() + author = comment.xpath(".//a[@data-field]/text()").get(default='').strip() + author_link = comment.xpath(".//a[@data-field]/@href").get(default='') + content = comment.xpath(".//div[@class='d_post_content j_d_post_content ']/text()").get(default='').strip() + date = comment.xpath(".//span[@class='tail-info']/text()").get(default='').strip() + + result.append({ + "comment_id": comment_id, + "author": author, + 
"author_link": author_link, + "content": content, + "time": date, + }) + if __name__ == '__main__': diff --git a/media_platform/tieba/test_data/note_detail_and_comments.html b/media_platform/tieba/test_data/note_detail_and_comments.html new file mode 100644 index 0000000..132068a --- /dev/null +++ b/media_platform/tieba/test_data/note_detail_and_comments.html @@ -0,0 +1,7558 @@ +
+
+
+
+
+
+ +
+ +
+
+
+ +
+ 以太比特吧 关注:309,572贴子:5,386,110
+
+
+ +
+
+
    +
  • 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 + 下一页 + 尾页 +
  • +
  • 760回复贴,共12页 +
  • +
  • ,跳到 页   +
  • +
+ +
+
+ + +
+ 贴子管理 +
    + +
+
+
+
+ +
+
+
+
+
+

对于一个父亲来说,这个女儿14岁就死了

只看楼主收藏回复 +

+
+
+
+
+
+
+
+ +
+
+ +
+
+
+
+
+
+
+ + + +
+
+
点击展开,查看完整图片 +
+
+
+
+
+ +
+ +
+
+
+
IP属地:广东来自Android客户端1楼2024-08-05 16:56回复 +
+
    +
    + +
    +
    +
    + +
    +
    + +
    +
    +
    +
    + + +
    + 本来觉得就凭14岁的这点叛逆父亲不再理她觉得这个这个父亲是有点问题的,后来看到母亲也不理了,我就知道这女的肯定隐藏了很多自己干得垃圾事没说,她活该
    +
    +
    + +
    + +
    +
    +
    IP属地:广东来自Android客户端2楼2024-08-05 17:07 +
    收起回复
    +
    +
      +
      +
      +
      +
      +
        +
      • +
        铭寒号废了重练一个而已,只是她妈后来才明白这一点 + +
        +
      • +
      • +
        youxi卡米糯小错一般都能包容,能这样多半是原则上大是大非 + +
        +
      • +
      • +
        你的隔壁王哥十四岁能把人逼到没有一点犹豫的跳楼,有多大的学习压力想过没?这种家庭内为了子女成才会不记一切代价,甚至是以折磨的方式,而之后的一切变故都是由于这次跳楼父亲不闻不问的态度,换作是你心灰意泠后只会做的比他更过分,亲情破裂会让最后一丝克制也一同丧失。 + +
        +
      • +
      • +
        +
        + 快拉黑尔父回复 你的隔壁王哥 :闷油瓶的话还能理解一下,小太妹为了得到什么说跳就跳我是一点也不怀疑也不同情的。你现在同情小心以后糟老罪咯。真要对她不好也不至于长大了好多事想明白了反而一直想修复关系。 + +
        +
      • +
      • +
        你的隔壁王哥回复 快拉黑尔父 :十四岁第一次逃学,还在担心父母会不会打他,说明在此之前完全就是个乖乖女。初三才逃第一次学,如果是太妹初二就已经插着翅膀到处飞了,而且跳楼母亲没有任何心里准备,就说明在以往的形象里是不可能做出这事,说明从一开始就只是正常女学生。 + +
        +
      • +
          +
        • +
          +
          + 快拉黑尔父回复 你的隔壁王哥 :人变成太妹,性格一完全变成了很难理解吗?初中时代常有的事 + +
          +
        • +
        • +
          你的隔壁王哥回复 快拉黑尔父 :如果说是太妹,那么跳楼之前必然会有各种前车之鉴,换句话说为了得到某样需求常用跳楼作为威胁。这种头也不回没有任何犹豫的跳楼,显然不是为了得到什么,就是单纯的寻死,你觉得太妹会这么纯粹的寻死吗?太妹的心理承受能力可高多了,只有未经世事的小白心里破防了才会这么干脆。 + +
          +
        • +
        • +
          +
          + 快拉黑尔父回复 你的隔壁王哥 :完全的一面之词,结果可以看到的是什么?14岁钱的好父亲当她死了。对她一直很好的母亲也断了联系。想修复关系的反而是她。告诉你一个众所周知的事,人发言,一定,一定会下意识的美化自己。这是下意识。然后你再看看这个故事。 + +
          +
        • +
        • +
          +
          + 快拉黑尔父回复 你的隔壁王哥 :而你所说的这个想索求什么,全包含在了一句叛逆期懂得都懂这一句话里面隐藏了。这就是她下意识的掩盖的事了。 + +
          +
        • +
        • +
          你的隔壁王哥回复 快拉黑尔父 :你要分析心理啊,纯粹的寻死只会在心里破防的时候才会存在,你如果接触过混社会的太妹,你就会发现他们会以寻求刺激为炫耀的资本,在这种群体内心理承受能力高的离谱。要想让一个学生心里破防,只能让她的天塌了,脆弱的心里才会在极短时间内崩溃,只有长期压抑才会产生这种心理。 + +
          +
        • +
        +
      • 我也说一句 +

        还有113条回复,点击查看 +

        +
      • +
      + +
      +
      +
      +
      +
      +
      +
      + + +
      +
      +
      + +
      +
      +
      +
      + + +
      + 这女的晚上不回家她爹去找她,被黄毛打进医院,也没来医院看过,最后和黄毛结婚也不来往。想起三套房想爆她爹金币,结果找不到求助平台。幸好她爹跑得快。 +
      +
      +
      + +
      + +
      +
      +
      IP属地:福建来自Android客户端4楼2024-08-05 17:38 +
      收起回复
      +
      +
        +
        +
        +
        +
        + + +
        +
        +
        +
        +
        +
        +
        +
        + +
        +
        +
        +
        + + +
        我知道,可怜之人,必有() +
        +
        +
        + +
        + +
        +
        +
        IP属地:浙江来自Android客户端7楼2024-08-05 18:38 +
        收起回复
        +
        +
          +
          +
          +
          +
          + + +
          +
          +
          +
          +
          +
          +
          +
          + +
          +
          +
          +
          + + +
          太假了,混社会不良太妹,还考高中,选专业。当没有大专么 +
          +
          +
          + +
          +
          +
          +
          IP属地:天津来自Android客户端8楼2024-08-05 18:43 +
          收起回复
          +
          +
            +
            +
            +
            +
            + + +
            +
            +
            +
            +
            +
            +
            +
            + +
            +
            +
            +
            + + +
            边倪m蓖 +
            +
            +
            + +
            +
            +
            +
            IP属地:广东来自Android客户端9楼2024-08-05 18:52 +
            回复 +
            +
            +
              +
              + +
              +
              +
              +
              +
              + +
              +
              +
              +
              + + +
              父亲问题很大,应该在14岁那年再生一个或者领养一个 +
              +
              +
              + +
              + +
              +
              +
              IP属地:河北来自Android客户端10楼2024-08-05 18:59 +
              收起回复
              +
              +
                +
                +
                +
                +
                + + +
                +
                +
                +
                +
                +
                +
                +
                + +
                +
                +
                +
                + + +
                她爸怎么忍住不创小号的 +
                +
                +
                + +
                + +
                +
                +
                IP属地:浙江来自Android客户端12楼2024-08-05 19:09 +
                收起回复
                +
                +
                  +
                  +
                  +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  +
                  + +
                  +
                  +
                  + +
                  +
                  +
                  +
                  + + +
                  站在作者的角度来看,肯定都是挑了对自己及其有利的东西来说了,然而 +
                  +
                  +
                  + +
                  +
                  +
                  +
                  IP属地:四川来自Android客户端13楼2024-08-05 19:11 +
                  收起回复
                  +
                  +
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    +
                    +
                    + +
                    +
                    +
                    +
                    + + +
                    这个好像是之前新闻里的 +
                    +
                    +
                    + +
                    +
                    +
                    +
                    IP属地:江苏来自Android客户端17楼2024-08-05 19:31 +
                    收起回复
                    +
                    +
                      +
                      +
                      +
                      +
                      + + +
                      +
                      +
                      +
                      +
                      +
                      +
                      +
                      + +
                      +
                      +
                      +
                      + + +
                      叛逆期你懂的这6个字包含了不知道多少事父母没对他发火而是耐心劝导也不知道包含了多少,我不好说,而且14岁逃学混社会初三高一的学生这么弄基本也是烂了 +
                      +
                      +
                      + +
                      +
                      +
                      +
                      IP属地:黑龙江来自Android客户端21楼2024-08-05 20:06 +
                      收起回复
                      +
                      +
                        +
                        +
                        +
                        +
                        + + +
                        +
                        +
                        +
                        +
                        +
                        +
                        +
                        + +
                        +
                        +
                        +
                        + + +
                        + 我们群有个女的。。。他说他爹家暴。。。喝点酒打她跟他妈。。她还轻生过。。。慢慢的的了解了。。。。他爹好像没那么不堪。。。一个月4000多生活费给她。。。她上学都打出租车。。。他爹还不怎么喝酒。。。他有抑郁症他爹还带她去看病。。。。还学了中医给她食补。。但是他就记得他爹喝酒打她跟他妈,。。。。我就纳了闷了。。。。这两个版本的故事不大对。。。。女人嘴里没实话啊。。。。。她说她爹喝酒打他妈,他直接拿水果刀给他爹捅了。淌了好多血,所以他爹送她进精神病院 + 反正挺混乱的。。。挺漂亮的一个高中女孩,就喜欢酒吧喝酒。。蹦迪。。。说全班男的都给她表过白。。。但是就喜欢小混混。。。
                        我得出一个结论。这家伙真有病。。。。她爹绝对对她不错。。。。。也是贱高中家庭好,还喜欢混混很蹦迪。。。。高考才两百还是三百多让同学骂了一顿。。。。破防了在群里哭跑路了。。。 +
                        +
                        +
                        + +
                        + +
                        +
                        +
                        IP属地:山东来自Android客户端22楼2024-08-05 20:19 +
                        收起回复
                        +
                        +
                          +
                          +
                          +
                          +
                          +
                            +
                          • +
                            小学森有的人记吃不记打,有的则相反 其实想想,谁都是第一次做父母,也都是第一次做儿女,我真觉得家庭关系挺难处的。有时候我能感受到父母的爱,但是说不了几句话,我就会有一股莫名的戾气,很容易发火,但其实我是个脾气很好的人 + +
                            +
                          • +
                          • +
                            小学森回复 萌新龍傲天 :最近看了一些文章,就有提到这种情绪,大概是因为小时候父母的情感投射导致的原因,每一次父母的好,都是在加深我的愧疚(比如说,赚钱都是为了你,怎么怎么滴,宣扬牺牲和奉献以及苦难),所以我拒绝父母对我的好,所以我逆反,似乎这样可以减轻我的负面情绪 + +
                            +
                          • +
                          • +
                            小学森记得有次电视上在放一个情绪很浓烈的视频,可能是近代史之类的,我爸问我有什么感想,其实我心里波澜壮阔,但面无表情,并且淡淡回了一句,一般,没什么感受。。然后我爸是我是个冷血动物,我笑了,说,没错,我确实是个冷血动物 + +
                            +
                          • +
                          • +
                            萌新龍傲天回复 小学森 :看个人跟家庭环境被 + +
                            +
                          • +
                          • +
                            小学森所以,如果我在这个故事里面,我大概率也会这样,甚至别说还能“抬头不见低头见了”,可能我早快活去了 + +
                            +
                          • + +
                          • 我也说一句 +

                            还有34条回复,点击查看 +

                            +
                          • +
                          + +
                          +
                          +
                          +
                          +
                          +
                          +
                          +
                          + +
                          +
                          +
                          +
                          + + +
                          女的独生,八成是结婚嫁了混混日子不如意,想着爆父母金币3套房,后来连母亲都躲着她足以说明一切 +
                          +
                          +
                          + +
                          +
                          +
                          +
                          IP属地:广东来自Android客户端23楼2024-08-05 20:35 +
                          收起回复
                          +
                          +
                            +
                            +
                            +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            +
                            +
                            + +
                            +
                            +
                            +
                            + + +
                            活该,早点死别耽误别人 +
                            +
                            +
                            + +
                            +
                            +
                            +
                            IP属地:江西来自Android客户端24楼2024-08-05 20:37 +
                            回复 +
                            +
                            +
                              +
                              + +
                              +
                              +
                              + +
                              +
                              + +
                              +
                              +
                              +
                              + + +
                              对自己闭口不谈,不好评价 +
                              +
                              +
                              + +
                              +
                              +
                              +
                              IP属地:安徽来自Android客户端25楼2024-08-05 22:54 +
                              回复 +
                              +
                              +
                                +
                                + +
                                +
                                +
                                +
                                +
                                + +
                                +
                                +
                                +
                                +
                                + + +
                                再叛逆也不至于寻死
                                硬要死那就满足你当你死了 +
                                +
                                +
                                + +
                                +
                                +
                                +
                                IP属地:广西来自Android客户端26楼2024-08-05 22:57 +
                                回复 +
                                +
                                +
                                  +
                                  + +
                                  +
                                  +
                                  +
                                  +
                                  + +
                                  +
                                  +
                                  +
                                  + + +
                                  xxn的话一个标点符号都不能信 +
                                  +
                                  +
                                  + +
                                  +
                                  +
                                  +
                                  IP属地:广西来自Android客户端27楼2024-08-05 23:03 +
                                  回复 +
                                  +
                                  +
                                    +
                                    + +
                                    +
                                    +
                                    +
                                    +
                                    + +
                                    +
                                    +
                                    +
                                    + + +
                                    故事太过于离谱,是没讲完还是编的 +
                                    +
                                    +
                                    + +
                                    +
                                    +
                                    +
                                    IP属地:湖北来自Android客户端28楼2024-08-05 23:05 +
                                    收起回复
                                    +
                                    +
                                      +
                                      +
                                      +
                                      +
                                      + + +
                                      +
                                      +
                                      +
                                      +
                                      +
                                      +
                                      +
                                      + +
                                      +
                                      +
                                      +
                                      + + +
                                      她的母亲从前那么希望这个家和好,对女儿也很好,结果突然也躲着她 +
                                      +
                                      +
                                      + +
                                      +
                                      +
                                      +
                                      IP属地:新疆来自Android客户端30楼2024-08-05 23:32 +
                                      回复 +
                                      +
                                      +
                                        +
                                        + +
                                        +
                                        +
                                        +
                                        +
                                        + +
                                        +
                                        +
                                        +
                                        + + +
                                        一眼就是避重就轻,能说的都是最轻的了 +
                                        +
                                        +
                                        + +
                                        +
                                        +
                                        +
                                        IP属地:广东来自iPhone客户端31楼2024-08-05 23:45 +
                                        收起回复
                                        +
                                        +
                                          +
                                          +
                                          +
                                          +
                                          + + +
                                          +
                                          +
                                          +
                                          +
                                          +
                                          + +
                                          +
                                          + +
                                          +
                                          +
                                          +
                                          + + +
                                          网传的被隐瞒的另一部分故事,不保真
                                          +
                                          +
                                          +
                                          + +
                                          + +
                                          +
                                          +
                                          IP属地:湖南来自Android客户端32楼2024-08-06 00:08 +
                                          收起回复
                                          +
                                          +
                                            +
                                            +
                                            +
                                            +
                                            + + +
                                            +
                                            +
                                            +
                                            +
                                            +
                                            +
                                            +
                                            + +
                                            +
                                            +
                                            +
                                            + + +
                                            一般人做不到的绝情,可疑 +
                                            +
                                            +
                                            + +
                                            +
                                            +
                                            +
                                            IP属地:陕西来自Android客户端33楼2024-08-06 00:08 +
                                            收起回复
                                            +
                                            +
                                              +
                                              +
                                              +
                                              +
                                              + + +
                                              +
                                              +
                                              +
                                              +
                                              +
                                              +
                                              +
                                              + +
                                              +
                                              +
                                              +
                                              + + +
                                              快马加编 +
                                              +
                                              +
                                              + +
                                              +
                                              +
                                              +
                                              IP属地:四川来自Android客户端35楼2024-08-06 00:13 +
                                              回复 +
                                              +
                                              +
                                                +
                                                + +
                                                +
                                                +
                                                +
                                                +
                                                + +
                                                +
                                                +
                                                +
                                                + + +
                                                默认信xxn说的话已经很反映现在的环境了 +
                                                +
                                                +
                                                + +
                                                +
                                                +
                                                +
                                                IP属地:上海来自iPhone客户端36楼2024-08-06 00:30 +
                                                回复 +
                                                +
                                                +
                                                  +
                                                  + +
                                                  +
                                                  +
                                                  +
                                                  +
                                                  + +
                                                  +
                                                  +
                                                  +
                                                  + + +
                                                  這是最後一個教訓了
                                                  父親給的最後一個教訓,停止了你的反叛期,永久有效 +
                                                  +
                                                  +
                                                  + +
                                                  +
                                                  +
                                                  +
                                                  IP属地:中国香港来自Android客户端37楼2024-08-06 00:39 +
                                                  回复 +
                                                  +
                                                  +
                                                    +
                                                    + +
                                                    +
                                                    +
                                                    +
                                                    +
                                                    + +
                                                    +
                                                    +
                                                    +
                                                    + + +
                                                    编 +
                                                    +
                                                    +
                                                    + +
                                                    +
                                                    +
                                                    +
                                                    IP属地:河北来自Android客户端38楼2024-08-06 00:39 +
                                                    回复 +
                                                    +
                                                    +
                                                      +
                                                      + +
                                                      +
                                                      +
                                                      + +
                                                      +
                                                      + +
                                                      +
                                                      +
                                                      +
                                                      + + +
                                                      哇,是没头没尾的讲故事,甚至比聊天记录还干净,这下不得不信了 +
                                                      +
                                                      +
                                                      + +
                                                      +
                                                      +
                                                      +
                                                      IP属地:湖北来自Android客户端39楼2024-08-06 00:40 +
                                                      收起回复
                                                      +
                                                      +
                                                        +
                                                        +
                                                        +
                                                        +
                                                        + + +
                                                        +
                                                        +
                                                        +
                                                        +
                                                        +
                                                        +
                                                        +
                                                        + +
                                                        +
                                                        +
                                                        +
                                                        + + +
                                                        +
                                                        +
                                                        + +
                                                        +
                                                        +
                                                        +
                                                        IP属地:湖北来自iPhone客户端40楼2024-08-06 00:46 +
                                                        收起回复
                                                        +
                                                        +
                                                          +
                                                          +
                                                          +
                                                          +
                                                          + + +
                                                          +
                                                          +
                                                          +
                                                          +
                                                          +
                                                          +
                                                          +
                                                          + +
                                                          +
                                                          +
                                                          +
                                                          + + +
                                                          叛逆期,是我懂的那个吗?
                                                          就是咒他爸要死还找烂仔来对付他爸,给人当街一顿打自己跑路了那个吗?
                                                          要我说,父母都体现出最大的斯文和忍让了,换作素质低点的可能牙齿都给人干碎了。 +
                                                          +
                                                          +
                                                          + +
                                                          +
                                                          +
                                                          +
                                                          IP属地:广西来自Android客户端41楼2024-08-06 00:52 +
                                                          收起回复
                                                          +
                                                          +
                                                            +
                                                            +
                                                            +
                                                            +
                                                            + + +
                                                            +
                                                            +
                                                            +
                                                            +
                                                            +
                                                            +
                                                            +
                                                            + +
                                                            +
                                                            +
                                                            +
                                                            + + +
                                                            自己犯贱能怪谁呢 +
                                                            +
                                                            +
                                                            + +
                                                            +
                                                            +
                                                            +
                                                            IP属地:浙江来自Android客户端42楼2024-08-06 00:55 +
                                                            回复 +
                                                            +
                                                            +
                                                              +
                                                              + +
                                                              +
                                                              +
                                                              +
                                                              +
                                                              +
                                                              +

                                                              +
                                                              +
                                                              + +
                                                              +
                                                                +
                                                              • 发贴红色标题
                                                              • +
                                                              • 显示红名
                                                              • +
                                                              • 签到六倍经验
                                                              • +
                                                              +
                                                              +
                                                              +
                                                              +
                                                              + +
                                                              +
                                                              +

                                                              赠送补签卡1张,获得[经验书购买权]

                                                              + +
                                                              +

                                                              我在贴吧

                                                              +
                                                              + +
                                                              +
                                                              +
                                                              +
                                                              + +
                                                              +

                                                              扫二维码下载贴吧客户端

                                                              +
                                                              +
                                                              +
                                                              下载贴吧APP
                                                              看高清直播、视频!
                                                              +
                                                              +
                                                              + + +
                                                              +
                                                              +
                                                              + +
                                                              +
                                                              \ No newline at end of file From 3c98808409324cc5114dc3bad9da33d7a60c06de Mon Sep 17 00:00:00 2001 From: Relakkes Date: Wed, 7 Aug 2024 01:01:21 +0800 Subject: [PATCH 4/8] =?UTF-8?q?feat:=20=E8=B4=B4=E5=90=A7=E6=90=9C?= =?UTF-8?q?=E7=B4=A2=E9=87=8D=E6=9E=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- constant/__init__.py | 1 + constant/baidu_tieba.py | 3 + db.py | 2 +- media_platform/tieba/client.py | 10 +- media_platform/tieba/core.py | 131 +- media_platform/tieba/help.py | 118 +- .../tieba/test_data/note_detail.html | 839 ++ .../test_data/note_detail_and_comments.html | 7558 ----------------- model/__init__.py | 1 + model/m_baidu_tieba.py | 19 + model/m_douyin.py | 1 + model/m_kuaishou.py | 1 + model/m_weibo.py | 1 + model/m_xiaohongshu.py | 1 + requirements.txt | 2 +- schema/tables.sql | 43 +- store/tieba/__init__.py | 33 +- 17 files changed, 1057 insertions(+), 7707 deletions(-) create mode 100644 constant/__init__.py create mode 100644 constant/baidu_tieba.py create mode 100644 media_platform/tieba/test_data/note_detail.html delete mode 100644 media_platform/tieba/test_data/note_detail_and_comments.html create mode 100644 model/__init__.py create mode 100644 model/m_baidu_tieba.py create mode 100644 model/m_douyin.py create mode 100644 model/m_kuaishou.py create mode 100644 model/m_weibo.py create mode 100644 model/m_xiaohongshu.py diff --git a/constant/__init__.py b/constant/__init__.py new file mode 100644 index 0000000..40a96af --- /dev/null +++ b/constant/__init__.py @@ -0,0 +1 @@ +# -*- coding: utf-8 -*- diff --git a/constant/baidu_tieba.py b/constant/baidu_tieba.py new file mode 100644 index 0000000..cfd15e1 --- /dev/null +++ b/constant/baidu_tieba.py @@ -0,0 +1,3 @@ +# -*- coding: utf-8 -*- + +TIEBA_URL = 'https://tieba.baidu.com' \ No newline at end of file diff --git a/db.py b/db.py index 335d8ae..13777a6 100644 --- a/db.py +++ b/db.py @@ 
-85,7 +85,7 @@ async def init_table_schema(): utils.logger.info("[init_table_schema] begin init mysql table schema ...") await init_mediacrawler_db() async_db_obj: AsyncMysqlDB = media_crawler_db_var.get() - async with aiofiles.open("schema/tables.sql", mode="r") as f: + async with aiofiles.open("schema/tables.sql", mode="r", encoding="utf-8") as f: schema_sql = await f.read() await async_db_obj.execute(schema_sql) utils.logger.info("[init_table_schema] mediacrawler table schema init successful") diff --git a/media_platform/tieba/client.py b/media_platform/tieba/client.py index f9e0375..edbbf19 100644 --- a/media_platform/tieba/client.py +++ b/media_platform/tieba/client.py @@ -10,6 +10,7 @@ from tenacity import (RetryError, retry, stop_after_attempt, wait_fixed) from base.base_crawler import AbstractApiClient +from model.m_baidu_tieba import TiebaNote from proxy.proxy_ip_pool import ProxyIpPool from tools import utils @@ -98,6 +99,7 @@ class BaiduTieBaClient(AbstractApiClient): return res utils.logger.error(f"[BaiduTieBaClient.get] 达到了最大重试次数,请尝试更换新的IP代理: {e}") + raise e async def post(self, uri: str, data: dict, **kwargs) -> Dict: """ @@ -152,7 +154,7 @@ class BaiduTieBaClient(AbstractApiClient): sort: SearchSortType = SearchSortType.TIME_DESC, note_type: SearchNoteType = SearchNoteType.FIXED_THREAD, random_sleep: bool = True - ) -> List[Dict]: + ) -> List[TiebaNote]: """ 根据关键词搜索贴吧帖子 Args: @@ -180,7 +182,7 @@ class BaiduTieBaClient(AbstractApiClient): random.randint(1, 5) return self._page_extractor.extract_search_note_list(page_content) - async def get_note_by_id(self, note_id: str) -> Dict: + async def get_note_by_id(self, note_id: str) -> TiebaNote: """ 根据帖子ID获取帖子详情 Args: @@ -192,8 +194,6 @@ class BaiduTieBaClient(AbstractApiClient): uri = f"/p/{note_id}" page_content = await self.get(uri, return_ori_content=True) return self._page_extractor.extract_note_detail(page_content) - # todo impl it - return {} async def get_note_all_comments(self, note_id: str, 
crawl_interval: float = 1.0, callback: Optional[Callable] = None) -> List[Dict]: @@ -229,7 +229,7 @@ class BaiduTieBaClient(AbstractApiClient): return result async def get_comments_all_sub_comments(self, comments: List[Dict], crawl_interval: float = 1.0, - callback: Optional[Callable] = None) -> List[Dict]: + callback: Optional[Callable] = None) -> List[Dict]: """ 获取指定评论下的所有子评论 Args: diff --git a/media_platform/tieba/core.py b/media_platform/tieba/core.py index 91795a4..a03f0ad 100644 --- a/media_platform/tieba/core.py +++ b/media_platform/tieba/core.py @@ -9,7 +9,8 @@ from playwright.async_api import (BrowserContext, BrowserType, Page, import config from base.base_crawler import AbstractCrawler -from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool, ProxyIpPool +from model.m_baidu_tieba import TiebaNote +from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool from store import tieba as tieba_store from tools import utils from tools.crawler_util import format_proxy_info @@ -66,8 +67,7 @@ class TieBaCrawler(AbstractCrawler): Returns: """ - - utils.logger.info("[BaiduTieBaCrawler.search] Begin search baidutieba keywords") + utils.logger.info("[BaiduTieBaCrawler.search] Begin search baidu tieba keywords") tieba_limit_count = 10 # tieba limit page fixed value if config.CRAWLER_MAX_NOTES_COUNT < tieba_limit_count: config.CRAWLER_MAX_NOTES_COUNT = tieba_limit_count @@ -82,52 +82,36 @@ class TieBaCrawler(AbstractCrawler): continue try: utils.logger.info(f"[BaiduTieBaCrawler.search] search tieba keyword: {keyword}, page: {page}") - note_id_list: List[str] = [] - notes_list_res = await self.tieba_client.get_notes_by_keyword( + notes_list: List[TiebaNote] = await self.tieba_client.get_notes_by_keyword( keyword=keyword, page=page, page_size=tieba_limit_count, sort=SearchSortType.TIME_DESC, note_type=SearchNoteType.FIXED_THREAD ) - utils.logger.info(f"[BaiduTieBaCrawler.search] Search notes res:{notes_list_res}") - if not notes_list_res: + if not notes_list: + 
utils.logger.info(f"[BaiduTieBaCrawler.search] Search note list is empty") break - - for note_detail in notes_list_res: - if note_detail: - await tieba_store.update_tieba_note(note_detail) - note_id_list.append(note_detail.get("note_id")) + utils.logger.info(f"[BaiduTieBaCrawler.search] Note List: {notes_list}") + await self.get_specified_notes(note_id_list=[note_detail.note_id for note_detail in notes_list]) page += 1 - utils.logger.info(f"[BaiduTieBaCrawler.search] Note details: {notes_list_res}") - await self.batch_get_note_comments(note_id_list) except Exception as ex: - utils.logger.error(f"[BaiduTieBaCrawler.search] Search note list error, err: {ex}") + utils.logger.error( + f"[BaiduTieBaCrawler.search] Search keywords error, current page: {page}, current keyword: {keyword}, err: {ex}") break - async def fetch_creator_notes_detail(self, note_list: List[Dict]): + async def get_specified_notes(self, note_id_list: List[str] = config.TIEBA_SPECIFIED_ID_LIST): """ - Concurrently obtain the specified post list and save the data + Get the information and comments of the specified post + Args: + note_id_list: + + Returns: + """ semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) task_list = [ - self.get_note_detail( - note_id=post_item.get("note_id"), - semaphore=semaphore - ) - for post_item in note_list - ] - - note_details = await asyncio.gather(*task_list) - for note_detail in note_details: - if note_detail: - await tieba_store.update_tieba_note(note_detail) - - async def get_specified_notes(self): - """Get the information and comments of the specified post""" - semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) - task_list = [ - self.get_note_detail(note_id=note_id, semaphore=semaphore) for note_id in config.TIEBA_SPECIFIED_ID_LIST + self.get_note_detail_async_task(note_id=note_id, semaphore=semaphore) for note_id in note_id_list ] note_details = await asyncio.gather(*task_list) for note_detail in note_details: @@ -135,11 +119,20 @@ class 
TieBaCrawler(AbstractCrawler): await tieba_store.update_tieba_note(note_detail) await self.batch_get_note_comments(config.TIEBA_SPECIFIED_ID_LIST) - async def get_note_detail(self, note_id: str, semaphore: asyncio.Semaphore) -> Optional[Dict]: - """Get note detail""" + async def get_note_detail_async_task(self, note_id: str, semaphore: asyncio.Semaphore) -> Optional[TiebaNote]: + """ + Get note detail + Args: + note_id: baidu tieba note id + semaphore: asyncio semaphore + + Returns: + + """ async with semaphore: try: - note_detail: Dict = await self.tieba_client.get_note_by_id(note_id) + utils.logger.info(f"[BaiduTieBaCrawler.get_note_detail] Begin get note detail, note_id: {note_id}") + note_detail: TiebaNote = await self.tieba_client.get_note_by_id(note_id) if not note_detail: utils.logger.error( f"[BaiduTieBaCrawler.get_note_detail] Get note detail error, note_id: {note_id}") @@ -153,23 +146,38 @@ class TieBaCrawler(AbstractCrawler): f"[BaiduTieBaCrawler.get_note_detail] have not fund note detail note_id:{note_id}, err: {ex}") return None - async def batch_get_note_comments(self, note_list: List[str]): - """Batch get note comments""" + async def batch_get_note_comments(self, note_id_list: List[str]): + """ + Batch get note comments + Args: + note_id_list: + + Returns: + + """ if not config.ENABLE_GET_COMMENTS: utils.logger.info(f"[BaiduTieBaCrawler.batch_get_note_comments] Crawling comment mode is not enabled") return utils.logger.info( - f"[BaiduTieBaCrawler.batch_get_note_comments] Begin batch get note comments, note list: {note_list}") + f"[BaiduTieBaCrawler.batch_get_note_comments] Begin batch get note comments, note list: {note_id_list}") semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) task_list: List[Task] = [] - for note_id in note_list: - task = asyncio.create_task(self.get_comments(note_id, semaphore), name=note_id) + for note_id in note_id_list: + task = asyncio.create_task(self.get_comments_async_task(note_id, semaphore), name=note_id) 
task_list.append(task) await asyncio.gather(*task_list) - async def get_comments(self, note_id: str, semaphore: asyncio.Semaphore): - """Get note comments with keyword filtering and quantity limitation""" + async def get_comments_async_task(self, note_id: str, semaphore: asyncio.Semaphore): + """ + Get comments async task + Args: + note_id: + semaphore: + + Returns: + + """ async with semaphore: utils.logger.info(f"[BaiduTieBaCrawler.get_comments] Begin get note id comments {note_id}") await self.tieba_client.get_note_all_comments( @@ -178,23 +186,6 @@ class TieBaCrawler(AbstractCrawler): callback=tieba_store.batch_update_tieba_note_comments ) - async def create_tieba_client(self, ip_pool: ProxyIpPool) -> BaiduTieBaClient: - """ - Create tieba client - Args: - ip_pool: - - Returns: - - """ - """Create tieba client""" - utils.logger.info("[BaiduTieBaCrawler.create_tieba_client] Begin create baidutieba API client ...") - cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies()) - tieba_client_obj = BaiduTieBaClient( - ip_pool=ip_pool, - ) - return tieba_client_obj - async def launch_browser( self, chromium: BrowserType, @@ -202,7 +193,17 @@ class TieBaCrawler(AbstractCrawler): user_agent: Optional[str], headless: bool = True ) -> BrowserContext: - """Launch browser and create browser context""" + """ + Launch browser and create browser + Args: + chromium: + playwright_proxy: + user_agent: + headless: + + Returns: + + """ utils.logger.info("[BaiduTieBaCrawler.launch_browser] Begin create browser context ...") if config.SAVE_LOGIN_STATE: # feat issue #14 @@ -227,6 +228,10 @@ class TieBaCrawler(AbstractCrawler): return browser_context async def close(self): - """Close browser context""" + """ + Close browser context + Returns: + + """ await self.browser_context.close() utils.logger.info("[BaiduTieBaCrawler.close] Browser context closed ...") diff --git a/media_platform/tieba/help.py b/media_platform/tieba/help.py index 2c1144d..efba258 100644 
--- a/media_platform/tieba/help.py +++ b/media_platform/tieba/help.py @@ -1,18 +1,21 @@ # -*- coding: utf-8 -*- - -from typing import List, Dict +import re +from typing import List, Dict, Tuple from parsel import Selector +from model.m_baidu_tieba import TiebaNote +from constant import baidu_tieba as const + class TieBaExtractor: def __init__(self): pass @staticmethod - def extract_search_note_list(page_content: str) -> List[Dict]: + def extract_search_note_list(page_content: str) -> List[TiebaNote]: """ - 提取贴吧帖子列表 + 提取贴吧帖子列表,这里提取的关键词搜索结果页的数据,还缺少帖子的回复数和回复页等数据 Args: page_content: 页面内容的HTML字符串 @@ -21,33 +24,24 @@ class TieBaExtractor: """ xpath_selector = "//div[@class='s_post']" post_list = Selector(text=page_content).xpath(xpath_selector) - result = [] + result: List[TiebaNote] = [] for post in post_list: - post_id = post.xpath(".//span[@class='p_title']/a/@data-tid").get(default='').strip() - title = post.xpath(".//span[@class='p_title']/a/text()").get(default='').strip() - link = post.xpath(".//span[@class='p_title']/a/@href").get(default='') - description = post.xpath(".//div[@class='p_content']/text()").get(default='').strip() - forum = post.xpath(".//a[@class='p_forum']/font/text()").get(default='').strip() - forum_link = post.xpath(".//a[@class='p_forum']/@href").get(default='') - author = post.xpath(".//a[starts-with(@href, '/home/main')]/font/text()").get(default='').strip() - author_link = post.xpath(".//a[starts-with(@href, '/home/main')]/@href").get(default='') - date = post.xpath(".//font[@class='p_green p_date']/text()").get(default='').strip() - result.append({ - "note_id": post_id, - "title": title, - "desc": description, - "note_url": link, - "time": date, - "tieba_name": forum, - "tieba_link": forum_link, - "nickname": author, - "nickname_link": author_link, - }) - + tieba_note = TiebaNote( + note_id=post.xpath(".//span[@class='p_title']/a/@data-tid").get(default='').strip(), + 
title=post.xpath(".//span[@class='p_title']/a/text()").get(default='').strip(), + desc=post.xpath(".//div[@class='p_content']/text()").get(default='').strip(), + note_url=const.TIEBA_URL + post.xpath(".//span[@class='p_title']/a/@href").get(default=''), + user_nickname=post.xpath(".//a[starts-with(@href, '/home/main')]/font/text()").get(default='').strip(), + user_link=const.TIEBA_URL + post.xpath(".//a[starts-with(@href, '/home/main')]/@href").get(default=''), + tieba_name=post.xpath(".//a[@class='p_forum']/font/text()").get(default='').strip(), + tieba_link=const.TIEBA_URL + post.xpath(".//a[@class='p_forum']/@href").get(default=''), + publish_time=post.xpath(".//font[@class='p_green p_date']/text()").get(default='').strip(), + ) + result.append(tieba_note) return result - @staticmethod - def extract_note_detail(page_content: str) -> Dict: + + def extract_note_detail(self, page_content: str) -> TiebaNote: """ 提取贴吧帖子详情 Args: @@ -57,13 +51,33 @@ class TieBaExtractor: """ content_selector = Selector(text=page_content) - # 查看楼主的链接: only_view_author_link: / p / 9117905169?see_lz = 1 - only_view_author_link = content_selector.xpath("//*[@id='lzonly_cntn']/@href").get(default='').strip() # + first_floor_selector = content_selector.xpath("//div[@class='p_postlist'][1]") + only_view_author_link = content_selector.xpath("//*[@id='lzonly_cntn']/@href").get(default='').strip() note_id = only_view_author_link.split("?")[0].split("/")[-1] - title = content_selector.xpath("//*[@id='j_core_title_wrap']/h3").get(default='').strip() - desc = content_selector.xpath("//meta[@name='description']").get(default='').strip() - note_url = f"/p/{note_id}" - pass + # 帖子回复数、回复页数 + thread_num_infos = content_selector.xpath( + "//div[@id='thread_theme_5']//li[@class='l_reply_num']//span[@class='red']" + ) + # IP地理位置、发表时间 + other_info_content = content_selector.xpath(".//div[@class='post-tail-wrap']").get(default="").strip() + ip_location, publish_time = 
self.extract_ip_and_pub_time(other_info_content) + note = TiebaNote( + note_id=note_id, + title=content_selector.xpath("//title/text()").get(default='').strip(), + desc=content_selector.xpath("//meta[@name='description']/@content").get(default='').strip(), + note_url=const.TIEBA_URL + f"/p/{note_id}", + user_link=const.TIEBA_URL + first_floor_selector.xpath(".//a[@class='p_author_face ']/@href").get(default='').strip(), + user_nickname=first_floor_selector.xpath(".//a[@class='p_author_name j_user_card']/text()").get(default='').strip(), + user_avatar=first_floor_selector.xpath(".//a[@class='p_author_face ']/img/@src").get(default='').strip(), + tieba_name=content_selector.xpath("//a[@class='card_title_fname']/text()").get(default='').strip(), + tieba_link=const.TIEBA_URL + content_selector.xpath("//a[@class='card_title_fname']/@href").get(default=''), + ip_location=ip_location, + publish_time=publish_time, + total_replay_num=thread_num_infos[0].xpath("./text()").get(default='').strip(), + total_replay_page=thread_num_infos[1].xpath("./text()").get(default='').strip(), + ) + note.title = note.title.replace(f"【{note.tieba_name}】_百度贴吧", "") + return note @staticmethod def extract_tieba_note_comments(page_content: str) -> List[Dict]: @@ -93,12 +107,40 @@ class TieBaExtractor: "time": date, }) + @staticmethod + def extract_ip_and_pub_time(html_content: str) -> Tuple[str, str]: + """ + 提取IP位置和发布时间 + Args: + html_content: + Returns: -if __name__ == '__main__': + """ + pattern_ip = re.compile(r'IP属地:(\S+)') + pattern_pub_time = re.compile(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2})') + ip_match = pattern_ip.search(html_content) + time_match = pattern_pub_time.search(html_content) + ip = ip_match.group(1) if ip_match else "" + pub_time = time_match.group(1) if time_match else "" + return ip, pub_time + +def test_extract_search_note_list(): with open("test_data/search_keyword_notes.html", "r", encoding="utf-8") as f: content = f.read() extractor = TieBaExtractor() - _result = 
extractor.extract_search_note_list(content) - print(_result) - print(f"Total: {len(_result)}") + result = extractor.extract_search_note_list(content) + print(result) + + +def test_extract_note_detail(): + with open("test_data/note_detail.html", "r", encoding="utf-8") as f: + content = f.read() + extractor = TieBaExtractor() + result = extractor.extract_note_detail(content) + print(result.model_dump()) + + +if __name__ == '__main__': + test_extract_search_note_list() + test_extract_note_detail() diff --git a/media_platform/tieba/test_data/note_detail.html b/media_platform/tieba/test_data/note_detail.html new file mode 100644 index 0000000..e4ecae6 --- /dev/null +++ b/media_platform/tieba/test_data/note_detail.html @@ -0,0 +1,839 @@ +对于一个父亲来说,这个女儿14岁就死了【以太比特吧】_百度贴吧 + + + + + + + + +
                                                              以太比特吧 关注:309,573贴子:5,368,434
                                                              +
                                                              + + +
                                                              + 贴子管理 +
                                                                + +
                                                              +
                                                              +
                                                              +
                                                              +

                                                              对于一个父亲来说,这个女儿14岁就死了

                                                              只看楼主收藏回复

                                                              +
                                                              + +
                                                              +
                                                              +
                                                              +
                                                              +
                                                              点击展开,查看完整图片


                                                              IP属地:广东来自Android客户端1楼2024-08-05 16:56回复
                                                                本来觉得就凭14岁的这点叛逆父亲不再理她觉得这个这个父亲是有点问题的,后来看到母亲也不理了,我就知道这女的肯定隐藏了很多自己干得垃圾事没说,她活该


                                                                IP属地:广东来自Android客户端2楼2024-08-05 17:07
                                                                收起回复
                                                                  • 铭寒号废了重练一个而已,只是她妈后来才明白这一点
                                                                  • youxi卡米糯小错一般都能包容,能这样多半是原则上大是大非
                                                                  • 你的隔壁王哥十四岁能把人逼到没有一点犹豫的跳楼,有多大的学习压力想过没?这种家庭内为了子女成才会不记一切代价,甚至是以折磨的方式,而之后的一切变故都是由于这次跳楼父亲不闻不问的态度,换作是你心灰意泠后只会做的比他更过分,亲情破裂会让最后一丝克制也一同丧失。
                                                                  • 快拉黑尔父回复 你的隔壁王哥 :闷油瓶的话还能理解一下,小太妹为了得到什么说跳就跳我是一点也不怀疑也不同情的。你现在同情小心以后糟老罪咯。真要对她不好也不至于长大了好多事想明白了反而一直想修复关系。
                                                                  • 你的隔壁王哥回复 快拉黑尔父 :十四岁第一次逃学,还在担心父母会不会打他,说明在此之前完全就是个乖乖女。初三才逃第一次学,如果是太妹初二就已经插着翅膀到处飞了,而且跳楼母亲没有任何心里准备,就说明在以往的形象里是不可能做出这事,说明从一开始就只是正常女学生。
                                                                    • 快拉黑尔父回复 你的隔壁王哥 :人变成太妹,性格一完全变成了很难理解吗?初中时代常有的事
                                                                    • 你的隔壁王哥回复 快拉黑尔父 :如果说是太妹,那么跳楼之前必然会有各种前车之鉴,换句话说为了得到某样需求常用跳楼作为威胁。这种头也不回没有任何犹豫的跳楼,显然不是为了得到什么,就是单纯的寻死,你觉得太妹会这么纯粹的寻死吗?太妹的心理承受能力可高多了,只有未经世事的小白心里破防了才会这么干脆。
                                                                    • 快拉黑尔父回复 你的隔壁王哥 :完全的一面之词,结果可以看到的是什么?14岁钱的好父亲当她死了。对她一直很好的母亲也断了联系。想修复关系的反而是她。告诉你一个众所周知的事,人发言,一定,一定会下意识的美化自己。这是下意识。然后你再看看这个故事。
                                                                    • 快拉黑尔父回复 你的隔壁王哥 :而你所说的这个想索求什么,全包含在了一句叛逆期懂得都懂这一句话里面隐藏了。这就是她下意识的掩盖的事了。
                                                                    • 你的隔壁王哥回复 快拉黑尔父 :你要分析心理啊,纯粹的寻死只会在心里破防的时候才会存在,你如果接触过混社会的太妹,你就会发现他们会以寻求刺激为炫耀的资本,在这种群体内心理承受能力高的离谱。要想让一个学生心里破防,只能让她的天塌了,脆弱的心里才会在极短时间内崩溃,只有长期压抑才会产生这种心理。
                                                                  • 我也说一句

                                                                    还有118条回复,点击查看

                                                                  这女的晚上不回家她爹去找她,被黄毛打进医院,也没来医院看过,最后和黄毛结婚也不来往。想起三套房想爆她爹金币,结果找不到求助平台。幸好她爹跑得快。


                                                                  IP属地:福建来自Android客户端4楼2024-08-05 17:38
                                                                  收起回复
                                                                    我知道,可怜之人,必有()


                                                                    IP属地:浙江来自Android客户端7楼2024-08-05 18:38
                                                                    收起回复
                                                                      太假了,混社会不良太妹,还考高中,选专业。当没有大专么


                                                                      IP属地:天津来自Android客户端8楼2024-08-05 18:43
                                                                      收起回复
                                                                        边倪m蓖


                                                                        IP属地:广东来自Android客户端9楼2024-08-05 18:52
                                                                        回复
                                                                          父亲问题很大,应该在14岁那年再生一个或者领养一个


                                                                          IP属地:河北来自Android客户端10楼2024-08-05 18:59
                                                                          收起回复
                                                                            她爸怎么忍住不创小号的


                                                                            IP属地:浙江来自Android客户端12楼2024-08-05 19:09
                                                                            收起回复
                                                                              站在作者的角度来看,肯定都是挑了对自己及其有利的东西来说了,然而


                                                                              IP属地:四川来自Android客户端13楼2024-08-05 19:11
                                                                              收起回复
                                                                                这个好像是之前新闻里的


                                                                                IP属地:江苏来自Android客户端17楼2024-08-05 19:31
                                                                                收起回复
                                                                                  叛逆期你懂的这6个字包含了不知道多少事父母没对他发火而是耐心劝导也不知道包含了多少,我不好说,而且14岁逃学混社会初三高一的学生这么弄基本也是烂了


                                                                                  IP属地:黑龙江来自Android客户端21楼2024-08-05 20:06
                                                                                  收起回复
                                                                                    我们群有个女的。。。他说他爹家暴。。。喝点酒打她跟他妈。。她还轻生过。。。慢慢的的了解了。。。。他爹好像没那么不堪。。。一个月4000多生活费给她。。。她上学都打出租车。。。他爹还不怎么喝酒。。。他有抑郁症他爹还带她去看病。。。。还学了中医给她食补。。但是他就记得他爹喝酒打她跟他妈,。。。。我就纳了闷了。。。。这两个版本的故事不大对。。。。女人嘴里没实话啊。。。。。她说她爹喝酒打他妈,他直接拿水果刀给他爹捅了。淌了好多血,所以他爹送她进精神病院 反正挺混乱的。。。挺漂亮的一个高中女孩,就喜欢酒吧喝酒。。蹦迪。。。说全班男的都给她表过白。。。但是就喜欢小混混。。。
                                                                                    我得出一个结论。这家伙真有病。。。。她爹绝对对她不错。。。。。也是贱高中家庭好,还喜欢混混很蹦迪。。。。高考才两百还是三百多让同学骂了一顿。。。。破防了在群里哭跑路了。。。


                                                                                    IP属地:山东来自Android客户端22楼2024-08-05 20:19
                                                                                    收起回复
                                                                                      女的独生,八成是结婚嫁了混混日子不如意,想着爆父母金币3套房,后来连母亲都躲着她足以说明一切


                                                                                      IP属地:广东来自Android客户端23楼2024-08-05 20:35
                                                                                      收起回复
                                                                                        活该,早点死别耽误别人


                                                                                        IP属地:江西来自Android客户端24楼2024-08-05 20:37
                                                                                        回复
                                                                                          对自己闭口不谈,不好评价


                                                                                          IP属地:安徽来自Android客户端25楼2024-08-05 22:54
                                                                                          回复
                                                                                            再叛逆也不至于寻死
                                                                                            硬要死那就满足你当你死了


                                                                                            IP属地:广西来自Android客户端26楼2024-08-05 22:57
                                                                                            回复
                                                                                              xxn的话一个标点符号都不能信


                                                                                              IP属地:广西来自Android客户端27楼2024-08-05 23:03
                                                                                              回复
                                                                                                故事太过于离谱,是没讲完还是编的


                                                                                                IP属地:湖北来自Android客户端28楼2024-08-05 23:05
                                                                                                收起回复
                                                                                                  她的母亲从前那么希望这个家和好,对女儿也很好,结果突然也躲着她


                                                                                                  IP属地:新疆来自Android客户端30楼2024-08-05 23:32
                                                                                                  回复
                                                                                                    一眼就是避重就轻,能说的都是最轻的了


                                                                                                    IP属地:广东来自iPhone客户端31楼2024-08-05 23:45
                                                                                                    收起回复
                                                                                                      网传的被隐瞒的另一部分故事,不保真


                                                                                                      IP属地:湖南来自Android客户端32楼2024-08-06 00:08
                                                                                                      收起回复
                                                                                                        一般人做不到的绝情,可疑


                                                                                                        IP属地:陕西来自Android客户端33楼2024-08-06 00:08
                                                                                                        收起回复
                                                                                                          快马加编


                                                                                                          IP属地:四川来自Android客户端35楼2024-08-06 00:13
                                                                                                          回复
                                                                                                            默认信xxn说的话已经很反映现在的环境了


                                                                                                            IP属地:上海来自iPhone客户端36楼2024-08-06 00:30
                                                                                                            回复
                                                                                                              這是最後一個教訓了
                                                                                                              父親給的最後一個教訓,停止了你的反叛期,永久有效


                                                                                                              IP属地:中国香港来自Android客户端37楼2024-08-06 00:39
                                                                                                              回复


                                                                                                                IP属地:河北来自Android客户端38楼2024-08-06 00:39
                                                                                                                回复
                                                                                                                  哇,是没头没尾的讲故事,甚至比聊天记录还干净,这下不得不信了


                                                                                                                  IP属地:湖北来自Android客户端39楼2024-08-06 00:40
                                                                                                                  收起回复


                                                                                                                    IP属地:湖北来自iPhone客户端40楼2024-08-06 00:46
                                                                                                                    收起回复
                                                                                                                      叛逆期,是我懂的那个吗?
                                                                                                                      就是咒他爸要死还找烂仔来对付他爸,给人当街一顿打自己跑路了那个吗?
                                                                                                                      要我说,父母都体现出最大的斯文和忍让了,换作素质低点的可能牙齿都给人干碎了。


                                                                                                                      IP属地:广西来自Android客户端41楼2024-08-06 00:52
                                                                                                                      收起回复
                                                                                                                        自己犯贱能怪谁呢


                                                                                                                        IP属地:浙江来自Android客户端42楼2024-08-06 00:55
                                                                                                                        回复
                                                                                                                          +
                                                                                                                          + + + + + + + + + + + + + + + + +
                                                                                                                          \ No newline at end of file diff --git a/media_platform/tieba/test_data/note_detail_and_comments.html b/media_platform/tieba/test_data/note_detail_and_comments.html deleted file mode 100644 index 132068a..0000000 --- a/media_platform/tieba/test_data/note_detail_and_comments.html +++ /dev/null @@ -1,7558 +0,0 @@ -
                                                                                                                          -
                                                                                                                          -
                                                                                                                          -
                                                                                                                          -
                                                                                                                          -
                                                                                                                          - -
                                                                                                                          - -
                                                                                                                          -
                                                                                                                          -
                                                                                                                          - -
                                                                                                                          - 以太比特吧 关注:309,572贴子:5,386,110
                                                                                                                          -
                                                                                                                          -
                                                                                                                          - -
                                                                                                                          -
                                                                                                                          -
                                                                                                                            -
                                                                                                                          • 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 下一页 - 尾页 -
                                                                                                                          • -
                                                                                                                          • 760回复贴,共12页 -
                                                                                                                          • -
                                                                                                                          • ,跳到 页   -
                                                                                                                          • -
                                                                                                                          - -
                                                                                                                          -
                                                                                                                          - - -
                                                                                                                          - 贴子管理 -
                                                                                                                            - -
                                                                                                                          -
                                                                                                                          -
                                                                                                                          -
                                                                                                                          - -
                                                                                                                          -
                                                                                                                          -
                                                                                                                          -
                                                                                                                          -
                                                                                                                          -

                                                                                                                          对于一个父亲来说,这个女儿14岁就死了

                                                                                                                          只看楼主收藏回复 -

                                                                                                                          -
                                                                                                                          -
                                                                                                                          -
                                                                                                                          -
                                                                                                                          -
                                                                                                                          -
                                                                                                                          -
                                                                                                                          - -
                                                                                                                          -
                                                                                                                          - -
                                                                                                                          -
                                                                                                                          -
                                                                                                                          -
                                                                                                                          -
                                                                                                                          -
                                                                                                                          -
                                                                                                                          - - - -
                                                                                                                          -
                                                                                                                          -
                                                                                                                          点击展开,查看完整图片 -
                                                                                                                          -
                                                                                                                          -
                                                                                                                          -
                                                                                                                          -
                                                                                                                          - -
                                                                                                                          - -
                                                                                                                          -
                                                                                                                          -
                                                                                                                          -
                                                                                                                          IP属地:广东来自Android客户端1楼2024-08-05 16:56回复 -
                                                                                                                          -
                                                                                                                            -
                                                                                                                            - -
                                                                                                                            -
                                                                                                                            -
                                                                                                                            - -
                                                                                                                            -
                                                                                                                            - -
                                                                                                                            -
                                                                                                                            -
                                                                                                                            -
                                                                                                                            - - -
                                                                                                                            - 本来觉得就凭14岁的这点叛逆父亲不再理她觉得这个这个父亲是有点问题的,后来看到母亲也不理了,我就知道这女的肯定隐藏了很多自己干得垃圾事没说,她活该
                                                                                                                            -
                                                                                                                            -
                                                                                                                            - -
                                                                                                                            - -
                                                                                                                            -
                                                                                                                            -
                                                                                                                            IP属地:广东来自Android客户端2楼2024-08-05 17:07 -
                                                                                                                            收起回复
                                                                                                                            -
                                                                                                                            -
                                                                                                                              -
                                                                                                                              -
                                                                                                                              -
                                                                                                                              -
                                                                                                                              -
                                                                                                                                -
                                                                                                                              • -
                                                                                                                                铭寒号废了重练一个而已,只是她妈后来才明白这一点 - -
                                                                                                                                -
                                                                                                                              • -
                                                                                                                              • -
                                                                                                                                youxi卡米糯小错一般都能包容,能这样多半是原则上大是大非 - -
                                                                                                                                -
                                                                                                                              • -
                                                                                                                              • -
                                                                                                                                你的隔壁王哥十四岁能把人逼到没有一点犹豫的跳楼,有多大的学习压力想过没?这种家庭内为了子女成才会不记一切代价,甚至是以折磨的方式,而之后的一切变故都是由于这次跳楼父亲不闻不问的态度,换作是你心灰意泠后只会做的比他更过分,亲情破裂会让最后一丝克制也一同丧失。 - -
                                                                                                                                -
                                                                                                                              • -
                                                                                                                              • -
                                                                                                                                -
                                                                                                                                - 快拉黑尔父回复 你的隔壁王哥 :闷油瓶的话还能理解一下,小太妹为了得到什么说跳就跳我是一点也不怀疑也不同情的。你现在同情小心以后糟老罪咯。真要对她不好也不至于长大了好多事想明白了反而一直想修复关系。 - -
                                                                                                                                -
                                                                                                                              • -
                                                                                                                              • -
                                                                                                                                你的隔壁王哥回复 快拉黑尔父 :十四岁第一次逃学,还在担心父母会不会打他,说明在此之前完全就是个乖乖女。初三才逃第一次学,如果是太妹初二就已经插着翅膀到处飞了,而且跳楼母亲没有任何心里准备,就说明在以往的形象里是不可能做出这事,说明从一开始就只是正常女学生。 - -
                                                                                                                                -
                                                                                                                              • -
                                                                                                                                  -
                                                                                                                                • -
                                                                                                                                  -
                                                                                                                                  - 快拉黑尔父回复 你的隔壁王哥 :人变成太妹,性格一完全变成了很难理解吗?初中时代常有的事 - -
                                                                                                                                  -
                                                                                                                                • -
                                                                                                                                • -
                                                                                                                                  你的隔壁王哥回复 快拉黑尔父 :如果说是太妹,那么跳楼之前必然会有各种前车之鉴,换句话说为了得到某样需求常用跳楼作为威胁。这种头也不回没有任何犹豫的跳楼,显然不是为了得到什么,就是单纯的寻死,你觉得太妹会这么纯粹的寻死吗?太妹的心理承受能力可高多了,只有未经世事的小白心里破防了才会这么干脆。 - -
                                                                                                                                  -
                                                                                                                                • -
                                                                                                                                • -
                                                                                                                                  -
                                                                                                                                  - 快拉黑尔父回复 你的隔壁王哥 :完全的一面之词,结果可以看到的是什么?14岁钱的好父亲当她死了。对她一直很好的母亲也断了联系。想修复关系的反而是她。告诉你一个众所周知的事,人发言,一定,一定会下意识的美化自己。这是下意识。然后你再看看这个故事。 - -
                                                                                                                                  -
                                                                                                                                • -
                                                                                                                                • -
                                                                                                                                  -
                                                                                                                                  - 快拉黑尔父回复 你的隔壁王哥 :而你所说的这个想索求什么,全包含在了一句叛逆期懂得都懂这一句话里面隐藏了。这就是她下意识的掩盖的事了。 - -
                                                                                                                                  -
                                                                                                                                • -
                                                                                                                                • -
                                                                                                                                  你的隔壁王哥回复 快拉黑尔父 :你要分析心理啊,纯粹的寻死只会在心里破防的时候才会存在,你如果接触过混社会的太妹,你就会发现他们会以寻求刺激为炫耀的资本,在这种群体内心理承受能力高的离谱。要想让一个学生心里破防,只能让她的天塌了,脆弱的心里才会在极短时间内崩溃,只有长期压抑才会产生这种心理。 - -
                                                                                                                                  -
                                                                                                                                • -
                                                                                                                                -
                                                                                                                              • 我也说一句 -

                                                                                                                                还有113条回复,点击查看 -

                                                                                                                                -
                                                                                                                              • -
                                                                                                                              - -
                                                                                                                              -
                                                                                                                              -
                                                                                                                              -
                                                                                                                              -
                                                                                                                              -
                                                                                                                              -
                                                                                                                              - - -
                                                                                                                              -
                                                                                                                              -
                                                                                                                              - -
                                                                                                                              -
                                                                                                                              -
                                                                                                                              -
                                                                                                                              - - -
                                                                                                                              - 这女的晚上不回家她爹去找她,被黄毛打进医院,也没来医院看过,最后和黄毛结婚也不来往。想起三套房想爆她爹金币,结果找不到求助平台。幸好她爹跑得快。 -
                                                                                                                              -
                                                                                                                              -
                                                                                                                              - -
                                                                                                                              - -
                                                                                                                              -
                                                                                                                              -
                                                                                                                              IP属地:福建来自Android客户端4楼2024-08-05 17:38 -
                                                                                                                              收起回复
                                                                                                                              -
                                                                                                                              -
                                                                                                                                -
                                                                                                                                -
                                                                                                                                -
                                                                                                                                -
                                                                                                                                - - -
                                                                                                                                -
                                                                                                                                -
                                                                                                                                -
                                                                                                                                -
                                                                                                                                -
                                                                                                                                -
                                                                                                                                -
                                                                                                                                - -
                                                                                                                                -
                                                                                                                                -
                                                                                                                                -
                                                                                                                                - - -
                                                                                                                                我知道,可怜之人,必有() -
                                                                                                                                -
                                                                                                                                -
                                                                                                                                - -
                                                                                                                                - -
                                                                                                                                -
                                                                                                                                -
                                                                                                                                IP属地:浙江来自Android客户端7楼2024-08-05 18:38 -
                                                                                                                                收起回复
                                                                                                                                -
                                                                                                                                -
                                                                                                                                  -
                                                                                                                                  -
                                                                                                                                  -
                                                                                                                                  -
                                                                                                                                  - - -
                                                                                                                                  -
                                                                                                                                  -
                                                                                                                                  -
                                                                                                                                  -
                                                                                                                                  -
                                                                                                                                  -
                                                                                                                                  -
                                                                                                                                  - -
                                                                                                                                  -
                                                                                                                                  -
                                                                                                                                  -
                                                                                                                                  - - -
                                                                                                                                  太假了,混社会不良太妹,还考高中,选专业。当没有大专么 -
                                                                                                                                  -
                                                                                                                                  -
                                                                                                                                  - -
                                                                                                                                  -
                                                                                                                                  -
                                                                                                                                  -
                                                                                                                                  IP属地:天津来自Android客户端8楼2024-08-05 18:43 -
                                                                                                                                  收起回复
                                                                                                                                  -
                                                                                                                                  -
                                                                                                                                    -
                                                                                                                                    -
                                                                                                                                    -
                                                                                                                                    -
                                                                                                                                    - - -
                                                                                                                                    -
                                                                                                                                    -
                                                                                                                                    -
                                                                                                                                    -
                                                                                                                                    -
                                                                                                                                    -
                                                                                                                                    -
                                                                                                                                    - -
                                                                                                                                    -
                                                                                                                                    -
                                                                                                                                    -
                                                                                                                                    - - -
                                                                                                                                    边倪m蓖 -
                                                                                                                                    -
                                                                                                                                    -
                                                                                                                                    - -
                                                                                                                                    -
                                                                                                                                    -
                                                                                                                                    -
                                                                                                                                    IP属地:广东来自Android客户端9楼2024-08-05 18:52 -
                                                                                                                                    回复 -
                                                                                                                                    -
                                                                                                                                    -
                                                                                                                                      -
                                                                                                                                      - -
                                                                                                                                      -
                                                                                                                                      -
                                                                                                                                      -
                                                                                                                                      -
                                                                                                                                      - -
                                                                                                                                      -
                                                                                                                                      -
                                                                                                                                      -
                                                                                                                                      - - -
                                                                                                                                      父亲问题很大,应该在14岁那年再生一个或者领养一个 -
                                                                                                                                      -
                                                                                                                                      -
                                                                                                                                      - -
                                                                                                                                      - -
                                                                                                                                      -
                                                                                                                                      -
                                                                                                                                      IP属地:河北来自Android客户端10楼2024-08-05 18:59 -
                                                                                                                                      收起回复
                                                                                                                                      -
                                                                                                                                      -
                                                                                                                                        -
                                                                                                                                        -
                                                                                                                                        -
                                                                                                                                        -
                                                                                                                                        - - -
                                                                                                                                        -
                                                                                                                                        -
                                                                                                                                        -
                                                                                                                                        -
                                                                                                                                        -
                                                                                                                                        -
                                                                                                                                        -
                                                                                                                                        - -
                                                                                                                                        -
                                                                                                                                        -
                                                                                                                                        -
                                                                                                                                        - - -
                                                                                                                                        她爸怎么忍住不创小号的 -
                                                                                                                                        -
                                                                                                                                        -
                                                                                                                                        - -
                                                                                                                                        - -
                                                                                                                                        -
                                                                                                                                        -
                                                                                                                                        IP属地:浙江来自Android客户端12楼2024-08-05 19:09 -
                                                                                                                                        收起回复
                                                                                                                                        -
                                                                                                                                        -
                                                                                                                                          -
                                                                                                                                          -
                                                                                                                                          -
                                                                                                                                          -
                                                                                                                                          - - -
                                                                                                                                          -
                                                                                                                                          -
                                                                                                                                          -
                                                                                                                                          -
                                                                                                                                          -
                                                                                                                                          -
                                                                                                                                          - -
                                                                                                                                          -
                                                                                                                                          -
                                                                                                                                          - -
                                                                                                                                          -
                                                                                                                                          -
                                                                                                                                          -
                                                                                                                                          - - -
                                                                                                                                          站在作者的角度来看,肯定都是挑了对自己及其有利的东西来说了,然而 -
                                                                                                                                          -
                                                                                                                                          -
                                                                                                                                          - -
                                                                                                                                          -
                                                                                                                                          -
                                                                                                                                          -
                                                                                                                                          IP属地:四川来自Android客户端13楼2024-08-05 19:11 -
                                                                                                                                          收起回复
                                                                                                                                          -
                                                                                                                                          -
                                                                                                                                            -
                                                                                                                                            -
                                                                                                                                            -
                                                                                                                                            -
                                                                                                                                            - - -
                                                                                                                                            -
                                                                                                                                            -
                                                                                                                                            -
                                                                                                                                            -
                                                                                                                                            -
                                                                                                                                            -
                                                                                                                                            -
                                                                                                                                            - -
                                                                                                                                            -
                                                                                                                                            -
                                                                                                                                            -
                                                                                                                                            - - -
                                                                                                                                            这个好像是之前新闻里的 -
                                                                                                                                            -
                                                                                                                                            -
                                                                                                                                            - -
                                                                                                                                            -
                                                                                                                                            -
                                                                                                                                            -
                                                                                                                                            IP属地:江苏来自Android客户端17楼2024-08-05 19:31 -
                                                                                                                                            收起回复
                                                                                                                                            -
                                                                                                                                            -
                                                                                                                                              -
                                                                                                                                              -
                                                                                                                                              -
                                                                                                                                              -
                                                                                                                                              - - -
                                                                                                                                              -
                                                                                                                                              -
                                                                                                                                              -
                                                                                                                                              -
                                                                                                                                              -
                                                                                                                                              -
                                                                                                                                              -
                                                                                                                                              - -
                                                                                                                                              -
                                                                                                                                              -
                                                                                                                                              -
                                                                                                                                              - - -
                                                                                                                                              叛逆期你懂的这6个字包含了不知道多少事父母没对他发火而是耐心劝导也不知道包含了多少,我不好说,而且14岁逃学混社会初三高一的学生这么弄基本也是烂了 -
                                                                                                                                              -
                                                                                                                                              -
                                                                                                                                              - -
                                                                                                                                              -
                                                                                                                                              -
                                                                                                                                              -
                                                                                                                                              IP属地:黑龙江来自Android客户端21楼2024-08-05 20:06 -
                                                                                                                                              收起回复
                                                                                                                                              -
                                                                                                                                              -
                                                                                                                                                -
                                                                                                                                                -
                                                                                                                                                -
                                                                                                                                                -
                                                                                                                                                - - -
                                                                                                                                                -
                                                                                                                                                -
                                                                                                                                                -
                                                                                                                                                -
                                                                                                                                                -
                                                                                                                                                -
                                                                                                                                                -
                                                                                                                                                - -
                                                                                                                                                -
                                                                                                                                                -
                                                                                                                                                -
                                                                                                                                                - - -
                                                                                                                                                - 我们群有个女的。。。他说他爹家暴。。。喝点酒打她跟他妈。。她还轻生过。。。慢慢的的了解了。。。。他爹好像没那么不堪。。。一个月4000多生活费给她。。。她上学都打出租车。。。他爹还不怎么喝酒。。。他有抑郁症他爹还带她去看病。。。。还学了中医给她食补。。但是他就记得他爹喝酒打她跟他妈,。。。。我就纳了闷了。。。。这两个版本的故事不大对。。。。女人嘴里没实话啊。。。。。她说她爹喝酒打他妈,他直接拿水果刀给他爹捅了。淌了好多血,所以他爹送她进精神病院 - 反正挺混乱的。。。挺漂亮的一个高中女孩,就喜欢酒吧喝酒。。蹦迪。。。说全班男的都给她表过白。。。但是就喜欢小混混。。。
                                                                                                                                                我得出一个结论。这家伙真有病。。。。她爹绝对对她不错。。。。。也是贱高中家庭好,还喜欢混混很蹦迪。。。。高考才两百还是三百多让同学骂了一顿。。。。破防了在群里哭跑路了。。。 -
                                                                                                                                                -
                                                                                                                                                -
                                                                                                                                                - -
                                                                                                                                                - -
                                                                                                                                                -
                                                                                                                                                -
                                                                                                                                                IP属地:山东来自Android客户端22楼2024-08-05 20:19 -
                                                                                                                                                收起回复
                                                                                                                                                -
                                                                                                                                                -
                                                                                                                                                  -
                                                                                                                                                  -
                                                                                                                                                  -
                                                                                                                                                  -
                                                                                                                                                  -
                                                                                                                                                    -
                                                                                                                                                  • -
                                                                                                                                                    小学森有的人记吃不记打,有的则相反 其实想想,谁都是第一次做父母,也都是第一次做儿女,我真觉得家庭关系挺难处的。有时候我能感受到父母的爱,但是说不了几句话,我就会有一股莫名的戾气,很容易发火,但其实我是个脾气很好的人 - -
                                                                                                                                                    -
                                                                                                                                                  • -
                                                                                                                                                  • -
                                                                                                                                                    小学森回复 萌新龍傲天 :最近看了一些文章,就有提到这种情绪,大概是因为小时候父母的情感投射导致的原因,每一次父母的好,都是在加深我的愧疚(比如说,赚钱都是为了你,怎么怎么滴,宣扬牺牲和奉献以及苦难),所以我拒绝父母对我的好,所以我逆反,似乎这样可以减轻我的负面情绪 - -
                                                                                                                                                    -
                                                                                                                                                  • -
                                                                                                                                                  • -
                                                                                                                                                    小学森记得有次电视上在放一个情绪很浓烈的视频,可能是近代史之类的,我爸问我有什么感想,其实我心里波澜壮阔,但面无表情,并且淡淡回了一句,一般,没什么感受。。然后我爸是我是个冷血动物,我笑了,说,没错,我确实是个冷血动物 - -
                                                                                                                                                    -
                                                                                                                                                  • -
                                                                                                                                                  • -
                                                                                                                                                    萌新龍傲天回复 小学森 :看个人跟家庭环境被 - -
                                                                                                                                                    -
                                                                                                                                                  • -
                                                                                                                                                  • -
                                                                                                                                                    小学森所以,如果我在这个故事里面,我大概率也会这样,甚至别说还能“抬头不见低头见了”,可能我早快活去了 - -
                                                                                                                                                    -
                                                                                                                                                  • - -
                                                                                                                                                  • 我也说一句 -

                                                                                                                                                    还有34条回复,点击查看 -

                                                                                                                                                    -
                                                                                                                                                  • -
                                                                                                                                                  - -
                                                                                                                                                  -
                                                                                                                                                  -
                                                                                                                                                  -
                                                                                                                                                  -
                                                                                                                                                  -
                                                                                                                                                  -
                                                                                                                                                  -
                                                                                                                                                  - -
                                                                                                                                                  -
                                                                                                                                                  -
                                                                                                                                                  -
                                                                                                                                                  - - -
                                                                                                                                                  女的独生,八成是结婚嫁了混混日子不如意,想着爆父母金币3套房,后来连母亲都躲着她足以说明一切 -
                                                                                                                                                  -
                                                                                                                                                  -
                                                                                                                                                  - -
                                                                                                                                                  -
                                                                                                                                                  -
                                                                                                                                                  -
                                                                                                                                                  IP属地:广东来自Android客户端23楼2024-08-05 20:35 -
                                                                                                                                                  收起回复
                                                                                                                                                  -
                                                                                                                                                  -
                                                                                                                                                    -
                                                                                                                                                    -
                                                                                                                                                    -
                                                                                                                                                    -
                                                                                                                                                    - - -
                                                                                                                                                    -
                                                                                                                                                    -
                                                                                                                                                    -
                                                                                                                                                    -
                                                                                                                                                    -
                                                                                                                                                    -
                                                                                                                                                    -
                                                                                                                                                    - -
                                                                                                                                                    -
                                                                                                                                                    -
                                                                                                                                                    -
                                                                                                                                                    - - -
                                                                                                                                                    活该,早点死别耽误别人 -
                                                                                                                                                    -
                                                                                                                                                    -
                                                                                                                                                    - -
                                                                                                                                                    -
                                                                                                                                                    -
                                                                                                                                                    -
                                                                                                                                                    IP属地:江西来自Android客户端24楼2024-08-05 20:37 -
                                                                                                                                                    回复 -
                                                                                                                                                    -
                                                                                                                                                    -
                                                                                                                                                      -
                                                                                                                                                      - -
                                                                                                                                                      -
                                                                                                                                                      -
                                                                                                                                                      - -
                                                                                                                                                      -
                                                                                                                                                      - -
                                                                                                                                                      -
                                                                                                                                                      -
                                                                                                                                                      -
                                                                                                                                                      - - -
                                                                                                                                                      对自己闭口不谈,不好评价 -
                                                                                                                                                      -
                                                                                                                                                      -
                                                                                                                                                      - -
                                                                                                                                                      -
                                                                                                                                                      -
                                                                                                                                                      -
                                                                                                                                                      IP属地:安徽来自Android客户端25楼2024-08-05 22:54 -
                                                                                                                                                      回复 -
                                                                                                                                                      -
                                                                                                                                                      -
                                                                                                                                                        -
                                                                                                                                                        - -
                                                                                                                                                        -
                                                                                                                                                        -
                                                                                                                                                        -
                                                                                                                                                        -
                                                                                                                                                        - -
                                                                                                                                                        -
                                                                                                                                                        -
                                                                                                                                                        -
                                                                                                                                                        -
                                                                                                                                                        - - -
                                                                                                                                                        再叛逆也不至于寻死
                                                                                                                                                        硬要死那就满足你当你死了 -
                                                                                                                                                        -
                                                                                                                                                        -
                                                                                                                                                        - -
                                                                                                                                                        -
                                                                                                                                                        -
                                                                                                                                                        -
                                                                                                                                                        IP属地:广西来自Android客户端26楼2024-08-05 22:57 -
                                                                                                                                                        回复 -
                                                                                                                                                        -
                                                                                                                                                        -
                                                                                                                                                          -
                                                                                                                                                          - -
                                                                                                                                                          -
                                                                                                                                                          -
                                                                                                                                                          -
                                                                                                                                                          -
                                                                                                                                                          - -
                                                                                                                                                          -
                                                                                                                                                          -
                                                                                                                                                          -
                                                                                                                                                          - - -
                                                                                                                                                          xxn的话一个标点符号都不能信 -
                                                                                                                                                          -
                                                                                                                                                          -
                                                                                                                                                          - -
                                                                                                                                                          -
                                                                                                                                                          -
                                                                                                                                                          -
                                                                                                                                                          IP属地:广西来自Android客户端27楼2024-08-05 23:03 -
                                                                                                                                                          回复 -
                                                                                                                                                          -
                                                                                                                                                          -
                                                                                                                                                            -
                                                                                                                                                            - -
                                                                                                                                                            -
                                                                                                                                                            -
                                                                                                                                                            -
                                                                                                                                                            -
                                                                                                                                                            - -
                                                                                                                                                            -
                                                                                                                                                            -
                                                                                                                                                            -
                                                                                                                                                            - - -
                                                                                                                                                            故事太过于离谱,是没讲完还是编的 -
                                                                                                                                                            -
                                                                                                                                                            -
                                                                                                                                                            - -
                                                                                                                                                            -
                                                                                                                                                            -
                                                                                                                                                            -
                                                                                                                                                            IP属地:湖北来自Android客户端28楼2024-08-05 23:05 -
                                                                                                                                                            收起回复
                                                                                                                                                            -
                                                                                                                                                            -
                                                                                                                                                              -
                                                                                                                                                              -
                                                                                                                                                              -
                                                                                                                                                              -
                                                                                                                                                              - - -
                                                                                                                                                              -
                                                                                                                                                              -
                                                                                                                                                              -
                                                                                                                                                              -
                                                                                                                                                              -
                                                                                                                                                              -
                                                                                                                                                              -
                                                                                                                                                              - -
                                                                                                                                                              -
                                                                                                                                                              -
                                                                                                                                                              -
                                                                                                                                                              - - -
                                                                                                                                                              她的母亲从前那么希望这个家和好,对女儿也很好,结果突然也躲着她 -
                                                                                                                                                              -
                                                                                                                                                              -
                                                                                                                                                              - -
                                                                                                                                                              -
                                                                                                                                                              -
                                                                                                                                                              -
                                                                                                                                                              IP属地:新疆来自Android客户端30楼2024-08-05 23:32 -
                                                                                                                                                              回复 -
                                                                                                                                                              -
                                                                                                                                                              -
                                                                                                                                                                -
                                                                                                                                                                - -
                                                                                                                                                                -
                                                                                                                                                                -
                                                                                                                                                                -
                                                                                                                                                                -
                                                                                                                                                                - -
                                                                                                                                                                -
                                                                                                                                                                -
                                                                                                                                                                -
                                                                                                                                                                - - -
                                                                                                                                                                一眼就是避重就轻,能说的都是最轻的了 -
                                                                                                                                                                -
                                                                                                                                                                -
                                                                                                                                                                - -
                                                                                                                                                                -
                                                                                                                                                                -
                                                                                                                                                                -
                                                                                                                                                                IP属地:广东来自iPhone客户端31楼2024-08-05 23:45 -
                                                                                                                                                                收起回复
                                                                                                                                                                -
                                                                                                                                                                -
                                                                                                                                                                  -
                                                                                                                                                                  -
                                                                                                                                                                  -
                                                                                                                                                                  -
                                                                                                                                                                  - - -
                                                                                                                                                                  -
                                                                                                                                                                  -
                                                                                                                                                                  -
                                                                                                                                                                  -
                                                                                                                                                                  -
                                                                                                                                                                  - -
                                                                                                                                                                  -
                                                                                                                                                                  - -
                                                                                                                                                                  -
                                                                                                                                                                  -
                                                                                                                                                                  -
                                                                                                                                                                  - - -
                                                                                                                                                                  网传的被隐瞒的另一部分故事,不保真
                                                                                                                                                                  -
                                                                                                                                                                  -
                                                                                                                                                                  -
                                                                                                                                                                  - -
                                                                                                                                                                  - -
                                                                                                                                                                  -
                                                                                                                                                                  -
                                                                                                                                                                  IP属地:湖南来自Android客户端32楼2024-08-06 00:08 -
                                                                                                                                                                  收起回复
                                                                                                                                                                  -
                                                                                                                                                                  -
                                                                                                                                                                    -
                                                                                                                                                                    -
                                                                                                                                                                    -
                                                                                                                                                                    -
                                                                                                                                                                    - - -
                                                                                                                                                                    -
                                                                                                                                                                    -
                                                                                                                                                                    -
                                                                                                                                                                    -
                                                                                                                                                                    -
                                                                                                                                                                    -
                                                                                                                                                                    -
                                                                                                                                                                    - -
                                                                                                                                                                    -
                                                                                                                                                                    -
                                                                                                                                                                    -
                                                                                                                                                                    - - -
                                                                                                                                                                    一般人做不到的绝情,可疑 -
                                                                                                                                                                    -
                                                                                                                                                                    -
                                                                                                                                                                    - -
                                                                                                                                                                    -
                                                                                                                                                                    -
                                                                                                                                                                    -
                                                                                                                                                                    IP属地:陕西来自Android客户端33楼2024-08-06 00:08 -
                                                                                                                                                                    收起回复
                                                                                                                                                                    -
                                                                                                                                                                    -
                                                                                                                                                                      -
                                                                                                                                                                      -
                                                                                                                                                                      -
                                                                                                                                                                      -
                                                                                                                                                                      - - -
                                                                                                                                                                      -
                                                                                                                                                                      -
                                                                                                                                                                      -
                                                                                                                                                                      -
                                                                                                                                                                      -
                                                                                                                                                                      -
                                                                                                                                                                      -
                                                                                                                                                                      - -
                                                                                                                                                                      -
                                                                                                                                                                      -
                                                                                                                                                                      -
                                                                                                                                                                      - - -
                                                                                                                                                                      快马加编 -
                                                                                                                                                                      -
                                                                                                                                                                      -
                                                                                                                                                                      - -
                                                                                                                                                                      -
                                                                                                                                                                      -
                                                                                                                                                                      -
                                                                                                                                                                      IP属地:四川来自Android客户端35楼2024-08-06 00:13 -
                                                                                                                                                                      回复 -
                                                                                                                                                                      -
                                                                                                                                                                      -
                                                                                                                                                                        -
                                                                                                                                                                        - -
                                                                                                                                                                        -
                                                                                                                                                                        -
                                                                                                                                                                        -
                                                                                                                                                                        -
                                                                                                                                                                        - -
                                                                                                                                                                        -
                                                                                                                                                                        -
                                                                                                                                                                        -
                                                                                                                                                                        - - -
                                                                                                                                                                        默认信xxn说的话已经很反映现在的环境了 -
                                                                                                                                                                        -
                                                                                                                                                                        -
                                                                                                                                                                        - -
                                                                                                                                                                        -
                                                                                                                                                                        -
                                                                                                                                                                        -
                                                                                                                                                                        IP属地:上海来自iPhone客户端36楼2024-08-06 00:30 -
                                                                                                                                                                        回复 -
                                                                                                                                                                        -
                                                                                                                                                                        -
                                                                                                                                                                          -
                                                                                                                                                                          - -
                                                                                                                                                                          -
                                                                                                                                                                          -
                                                                                                                                                                          -
                                                                                                                                                                          -
                                                                                                                                                                          - -
                                                                                                                                                                          -
                                                                                                                                                                          -
                                                                                                                                                                          -
                                                                                                                                                                          - - -
                                                                                                                                                                          這是最後一個教訓了
                                                                                                                                                                          父親給的最後一個教訓,停止了你的反叛期,永久有效 -
                                                                                                                                                                          -
                                                                                                                                                                          -
                                                                                                                                                                          - -
                                                                                                                                                                          -
                                                                                                                                                                          -
                                                                                                                                                                          -
                                                                                                                                                                          IP属地:中国香港来自Android客户端37楼2024-08-06 00:39 -
                                                                                                                                                                          回复 -
                                                                                                                                                                          -
                                                                                                                                                                          -
                                                                                                                                                                            -
                                                                                                                                                                            - -
                                                                                                                                                                            -
                                                                                                                                                                            -
                                                                                                                                                                            -
                                                                                                                                                                            -
                                                                                                                                                                            - -
                                                                                                                                                                            -
                                                                                                                                                                            -
                                                                                                                                                                            -
                                                                                                                                                                            - - -
                                                                                                                                                                            编 -
                                                                                                                                                                            -
                                                                                                                                                                            -
                                                                                                                                                                            - -
                                                                                                                                                                            -
                                                                                                                                                                            -
                                                                                                                                                                            -
                                                                                                                                                                            IP属地:河北来自Android客户端38楼2024-08-06 00:39 -
                                                                                                                                                                            回复 -
                                                                                                                                                                            -
                                                                                                                                                                            -
                                                                                                                                                                              -
                                                                                                                                                                              - -
                                                                                                                                                                              -
                                                                                                                                                                              -
                                                                                                                                                                              - -
                                                                                                                                                                              -
                                                                                                                                                                              - -
                                                                                                                                                                              -
                                                                                                                                                                              -
                                                                                                                                                                              -
                                                                                                                                                                              - - -
                                                                                                                                                                              哇,是没头没尾的讲故事,甚至比聊天记录还干净,这下不得不信了 -
                                                                                                                                                                              -
                                                                                                                                                                              -
                                                                                                                                                                              - -
                                                                                                                                                                              -
                                                                                                                                                                              -
                                                                                                                                                                              -
                                                                                                                                                                              IP属地:湖北来自Android客户端39楼2024-08-06 00:40 -
                                                                                                                                                                              收起回复
                                                                                                                                                                              -
                                                                                                                                                                              -
                                                                                                                                                                                -
                                                                                                                                                                                -
                                                                                                                                                                                -
                                                                                                                                                                                -
                                                                                                                                                                                - - -
                                                                                                                                                                                -
                                                                                                                                                                                -
                                                                                                                                                                                -
                                                                                                                                                                                -
                                                                                                                                                                                -
                                                                                                                                                                                -
                                                                                                                                                                                -
                                                                                                                                                                                - -
                                                                                                                                                                                -
                                                                                                                                                                                -
                                                                                                                                                                                -
                                                                                                                                                                                - - -
                                                                                                                                                                                -
                                                                                                                                                                                -
                                                                                                                                                                                - -
                                                                                                                                                                                -
                                                                                                                                                                                -
                                                                                                                                                                                -
                                                                                                                                                                                IP属地:湖北来自iPhone客户端40楼2024-08-06 00:46 -
                                                                                                                                                                                收起回复
                                                                                                                                                                                -
                                                                                                                                                                                -
                                                                                                                                                                                  -
                                                                                                                                                                                  -
                                                                                                                                                                                  -
                                                                                                                                                                                  -
                                                                                                                                                                                  - - -
                                                                                                                                                                                  -
                                                                                                                                                                                  -
                                                                                                                                                                                  -
                                                                                                                                                                                  -
                                                                                                                                                                                  -
                                                                                                                                                                                  -
                                                                                                                                                                                  -
                                                                                                                                                                                  - -
                                                                                                                                                                                  -
                                                                                                                                                                                  -
                                                                                                                                                                                  -
                                                                                                                                                                                  - - -
                                                                                                                                                                                  叛逆期,是我懂的那个吗?
                                                                                                                                                                                  就是咒他爸要死还找烂仔来对付他爸,给人当街一顿打自己跑路了那个吗?
                                                                                                                                                                                  要我说,父母都体现出最大的斯文和忍让了,换作素质低点的可能牙齿都给人干碎了。 -
                                                                                                                                                                                  -
                                                                                                                                                                                  -
                                                                                                                                                                                  - -
                                                                                                                                                                                  -
                                                                                                                                                                                  -
                                                                                                                                                                                  -
                                                                                                                                                                                  IP属地:广西来自Android客户端41楼2024-08-06 00:52 -
                                                                                                                                                                                  收起回复
                                                                                                                                                                                  -
                                                                                                                                                                                  -
                                                                                                                                                                                    -
                                                                                                                                                                                    -
                                                                                                                                                                                    -
                                                                                                                                                                                    -
                                                                                                                                                                                    - - -
                                                                                                                                                                                    -
                                                                                                                                                                                    -
                                                                                                                                                                                    -
                                                                                                                                                                                    -
                                                                                                                                                                                    -
                                                                                                                                                                                    -
                                                                                                                                                                                    -
                                                                                                                                                                                    - -
                                                                                                                                                                                    -
                                                                                                                                                                                    -
                                                                                                                                                                                    -
                                                                                                                                                                                    - - -
                                                                                                                                                                                    自己犯贱能怪谁呢 -
                                                                                                                                                                                    -
                                                                                                                                                                                    -
                                                                                                                                                                                    - -
                                                                                                                                                                                    -
                                                                                                                                                                                    -
                                                                                                                                                                                    -
                                                                                                                                                                                    IP属地:浙江来自Android客户端42楼2024-08-06 00:55 -
                                                                                                                                                                                    回复 -
                                                                                                                                                                                    -
                                                                                                                                                                                    -
                                                                                                                                                                                      -
                                                                                                                                                                                      - -
                                                                                                                                                                                      -
                                                                                                                                                                                      -
                                                                                                                                                                                      -
                                                                                                                                                                                      -
                                                                                                                                                                                      -
                                                                                                                                                                                      -

                                                                                                                                                                                      -
                                                                                                                                                                                      -
                                                                                                                                                                                      - -
                                                                                                                                                                                      -
                                                                                                                                                                                        -
                                                                                                                                                                                      • 发贴红色标题
                                                                                                                                                                                      • -
                                                                                                                                                                                      • 显示红名
                                                                                                                                                                                      • -
                                                                                                                                                                                      • 签到六倍经验
                                                                                                                                                                                      • -
                                                                                                                                                                                      -
                                                                                                                                                                                      -
                                                                                                                                                                                      -
                                                                                                                                                                                      -
                                                                                                                                                                                      - -
                                                                                                                                                                                      -
                                                                                                                                                                                      -

                                                                                                                                                                                      赠送补签卡1张,获得[经验书购买权]

                                                                                                                                                                                      - -
                                                                                                                                                                                      -

                                                                                                                                                                                      我在贴吧

                                                                                                                                                                                      -
                                                                                                                                                                                      - -
                                                                                                                                                                                      -
                                                                                                                                                                                      -
                                                                                                                                                                                      -
                                                                                                                                                                                      - -
                                                                                                                                                                                      -

                                                                                                                                                                                      扫二维码下载贴吧客户端

                                                                                                                                                                                      -
                                                                                                                                                                                      -
                                                                                                                                                                                      -
                                                                                                                                                                                      下载贴吧APP
                                                                                                                                                                                      看高清直播、视频!
                                                                                                                                                                                      -
                                                                                                                                                                                      -
                                                                                                                                                                                      - - -
                                                                                                                                                                                      -
                                                                                                                                                                                      -
                                                                                                                                                                                      - -
                                                                                                                                                                                      -
                                                                                                                                                                                      \ No newline at end of file diff --git a/model/__init__.py b/model/__init__.py new file mode 100644 index 0000000..40a96af --- /dev/null +++ b/model/__init__.py @@ -0,0 +1 @@ +# -*- coding: utf-8 -*- diff --git a/model/m_baidu_tieba.py b/model/m_baidu_tieba.py new file mode 100644 index 0000000..6f420dc --- /dev/null +++ b/model/m_baidu_tieba.py @@ -0,0 +1,19 @@ +# -*- coding: utf-8 -*- +from typing import Optional +from pydantic import BaseModel, Field + + +class TiebaNote(BaseModel): + note_id: str = Field(..., description="帖子ID") + title: str = Field(..., description="帖子标题") + desc: str = Field(default="", description="帖子描述") + note_url: str = Field(..., description="帖子链接") + publish_time: str = Field(default="", description="发布时间") + user_link: str = Field(default="", description="用户主页链接") + user_nickname: str = Field(default="", description="用户昵称") + user_avatar: str = Field(default="", description="用户头像地址") + tieba_name: str = Field(..., description="贴吧名称") + tieba_link: str = Field(..., description="贴吧链接") + total_replay_num: int = Field(default=0, description="回复总数") + total_replay_page: int = Field(default=0, description="回复总页数") + ip_location: Optional[str] = Field(default="", description="IP地理位置") diff --git a/model/m_douyin.py b/model/m_douyin.py new file mode 100644 index 0000000..40a96af --- /dev/null +++ b/model/m_douyin.py @@ -0,0 +1 @@ +# -*- coding: utf-8 -*- diff --git a/model/m_kuaishou.py b/model/m_kuaishou.py new file mode 100644 index 0000000..40a96af --- /dev/null +++ b/model/m_kuaishou.py @@ -0,0 +1 @@ +# -*- coding: utf-8 -*- diff --git a/model/m_weibo.py b/model/m_weibo.py new file mode 100644 index 0000000..40a96af --- /dev/null +++ b/model/m_weibo.py @@ -0,0 +1 @@ +# -*- coding: utf-8 -*- diff --git a/model/m_xiaohongshu.py b/model/m_xiaohongshu.py new file 
mode 100644 index 0000000..40a96af --- /dev/null +++ b/model/m_xiaohongshu.py @@ -0,0 +1 @@ +# -*- coding: utf-8 -*- diff --git a/requirements.txt b/requirements.txt index 151374f..eb405db 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,4 +13,4 @@ python-dotenv==1.0.1 jieba==0.42.1 wordcloud==1.9.3 matplotlib==3.9.0 -requests==2.32.3 \ No newline at end of file +requests==2.32.3 diff --git a/schema/tables.sql b/schema/tables.sql index 88828b7..2aadb38 100644 --- a/schema/tables.sql +++ b/schema/tables.sql @@ -349,29 +349,26 @@ ALTER TABLE `bilibili_video_comment` ALTER TABLE `weibo_note_comment` ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID'; -SET -FOREIGN_KEY_CHECKS = 1; - DROP TABLE IF EXISTS `tieba_note`; -CREATE TABLE `tieba_note` +CREATE TABLE tieba_note ( - `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', - `note_id` varchar(64) NOT NULL COMMENT '帖子ID', - `title` varchar(255) DEFAULT NULL COMMENT '笔记标题', - `desc` longtext COMMENT '笔记描述', - `time` varchar NOT NULL COMMENT '笔记发布时间', - `note_url` varchar(255) DEFAULT NULL COMMENT '笔记详情页的URL', - `nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称', - `nickname_link` varchar(255) DEFAULT NULL COMMENT '用户主页地址', - `tieba_name` varchar(255) DEFAULT NULL COMMENT '贴吧名称', - `tieba_link` varchar(255) DEFAULT NULL COMMENT '贴吧链接地址', - `avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址', - `ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址', - `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', - `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', - `comment_count` varchar(16) DEFAULT NULL COMMENT '笔记评论数', - PRIMARY KEY (`id`), - KEY `idx_tieba_note_id` (`note_id`), - KEY `idx_tieba_note_time` (`time`) -) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='贴吧帖子表'; + id BIGINT AUTO_INCREMENT PRIMARY KEY, + note_id VARCHAR(644) NOT NULL COMMENT '帖子ID', + title VARCHAR(255) NOT NULL COMMENT '帖子标题', + `desc` TEXT COMMENT '帖子描述', + note_url VARCHAR(255) 
NOT NULL COMMENT '帖子链接', + publish_time VARCHAR(255) NOT NULL COMMENT '发布时间', + user_link VARCHAR(255) NOT NULL COMMENT '用户主页链接', + user_nickname VARCHAR(255) NOT NULL COMMENT '用户昵称', + user_avatar VARCHAR(255) NOT NULL COMMENT '用户头像地址', + tieba_name VARCHAR(255) NOT NULL COMMENT '贴吧名称', + tieba_link VARCHAR(255) NOT NULL COMMENT '贴吧链接', + total_replay_num INT DEFAULT 0 COMMENT '帖子回复总数', + total_replay_page INT DEFAULT 0 COMMENT '帖子回复总页数', + ip_location VARCHAR(255) DEFAULT '' COMMENT 'IP地理位置', + add_ts BIGINT NOT NULL COMMENT '添加时间戳', + last_modify_ts BIGINT NOT NULL COMMENT '最后修改时间戳', + KEY `idx_tieba_note_note_id` (`note_id`), + KEY `idx_tieba_note_publish_time` (`publish_time`) +) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='贴吧帖子表'; \ No newline at end of file diff --git a/store/tieba/__init__.py b/store/tieba/__init__.py index 9e47fa4..efaa6cc 100644 --- a/store/tieba/__init__.py +++ b/store/tieba/__init__.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- from typing import List +from model.m_baidu_tieba import TiebaNote from . 
import tieba_store_impl from .tieba_store_impl import * @@ -21,24 +22,20 @@ class TieBaStoreFactory: return store_class() -async def update_tieba_note(note_item: Dict): - tieba_url = "https://tieba.baidu.com" - note_id = note_item.get("note_id") - local_db_item = { - "note_id": note_id, - "title": note_item.get("title") or note_item.get("desc", "")[:255], - "desc": note_item.get("desc", ""), - "note_url": tieba_url + note_item.get("note_url"), - "time": note_item.get("time"), - "tieba_name": note_item.get("tieba_name"), - "tieba_link": tieba_url + note_item.get("tieba_link", ""), - "nickname": note_item.get("nickname"), - "nickname_link": tieba_url + note_item.get("nickname_link", ""), - "ip_location": note_item.get("ip_location", ""), - "last_modify_ts": utils.get_current_timestamp(), - } - utils.logger.info(f"[store.tieba.update_tieba_note] tieba note: {local_db_item}") - await TieBaStoreFactory.create_store().store_content(local_db_item) +async def update_tieba_note(note_item: TiebaNote): + """ + Add or Update tieba note + Args: + note_item: + + Returns: + + """ + save_note_item = note_item.model_dump() + save_note_item.update({"last_modify_ts": utils.get_current_timestamp()}) + utils.logger.info(f"[store.tieba.update_tieba_note] tieba note: {save_note_item}") + + await TieBaStoreFactory.create_store().store_content(save_note_item) async def batch_update_tieba_note_comments(note_id: str, comments: List[Dict]): From 026d81e1317982cc2da047dc37bcc62793cacc01 Mon Sep 17 00:00:00 2001 From: Relakkes Date: Wed, 7 Aug 2024 02:34:56 +0800 Subject: [PATCH 5/8] =?UTF-8?q?feat:=20=E7=99=BE=E5=BA=A6=E8=B4=B4?= =?UTF-8?q?=E5=90=A7=E4=B8=80=E7=BA=A7=E8=AF=84=E8=AE=BAdone?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- config/base_config.py | 2 +- media_platform/tieba/client.py | 53 +- media_platform/tieba/core.py | 23 +- media_platform/tieba/help.py | 94 +- .../tieba/test_data/note_comments.html | 874 ++++++++++++++++++ 
model/m_baidu_tieba.py | 23 + schema/tables.sql | 25 +- store/tieba/__init__.py | 39 +- 8 files changed, 1042 insertions(+), 91 deletions(-) create mode 100644 media_platform/tieba/test_data/note_comments.html diff --git a/config/base_config.py b/config/base_config.py index 2985d40..96d87b1 100644 --- a/config/base_config.py +++ b/config/base_config.py @@ -46,7 +46,7 @@ MAX_CONCURRENCY_NUM = 1 ENABLE_GET_IMAGES = False # 是否开启爬评论模式, 默认不开启爬评论 -ENABLE_GET_COMMENTS = False +ENABLE_GET_COMMENTS = True # 是否开启爬二级评论模式, 默认不开启爬二级评论 # 老版本项目使用了 db, 则需参考 schema/tables.sql line 287 增加表字段 diff --git a/media_platform/tieba/client.py b/media_platform/tieba/client.py index edbbf19..2aa6cde 100644 --- a/media_platform/tieba/client.py +++ b/media_platform/tieba/client.py @@ -9,8 +9,9 @@ from playwright.async_api import BrowserContext from tenacity import (RetryError, retry, stop_after_attempt, wait_fixed) +import config from base.base_crawler import AbstractApiClient -from model.m_baidu_tieba import TiebaNote +from model.m_baidu_tieba import TiebaNote, TiebaComment from proxy.proxy_ip_pool import ProxyIpPool from tools import utils @@ -195,41 +196,38 @@ class BaiduTieBaClient(AbstractApiClient): page_content = await self.get(uri, return_ori_content=True) return self._page_extractor.extract_note_detail(page_content) - async def get_note_all_comments(self, note_id: str, crawl_interval: float = 1.0, - callback: Optional[Callable] = None) -> List[Dict]: + async def get_note_all_comments(self, note_detail: TiebaNote, crawl_interval: float = 1.0, + callback: Optional[Callable] = None) -> List[TiebaComment]: """ 获取指定帖子下的所有一级评论,该方法会一直查找一个帖子下的所有评论信息 Args: - note_id: 帖子ID + note_detail: 帖子详情对象 crawl_interval: 爬取一次笔记的延迟单位(秒) callback: 一次笔记爬取结束后 Returns: """ - uri = f"/p/{note_id}" - result = [] - comments_has_more = True - comments_cursor = 1 - while comments_has_more: - comments_res = await self.get(uri, params={"pn": comments_cursor}) - comments_has_more = comments_res.get("has_more", False) - 
comments_cursor = comments_res.get("cursor", "") - if "comments" not in comments_res: - utils.logger.info( - f"[XiaoHongShuClient.get_note_all_comments] No 'comments' key found in response: {comments_res}") + uri = f"/p/{note_detail.note_id}" + result: List[TiebaComment] = [] + current_page = 1 + while note_detail.total_replay_page >= current_page: + params = { + "pn": current_page + } + page_content = await self.get(uri, params=params, return_ori_content=True) + comments = self._page_extractor.extract_tieba_note_parment_comments(page_content, note_id=note_detail.note_id) + if not comments: break - comments = comments_res["comments"] if callback: - await callback(note_id, comments) - await asyncio.sleep(crawl_interval) + await callback(note_detail.note_id, comments) result.extend(comments) - sub_comments = await self.get_comments_all_sub_comments(comments, crawl_interval, callback) - result.extend(sub_comments) + await asyncio.sleep(crawl_interval) + current_page += 1 return result async def get_comments_all_sub_comments(self, comments: List[Dict], crawl_interval: float = 1.0, - callback: Optional[Callable] = None) -> List[Dict]: + callback: Optional[Callable] = None) -> List[TiebaComment]: """ 获取指定评论下的所有子评论 Args: @@ -240,12 +238,7 @@ class BaiduTieBaClient(AbstractApiClient): Returns: """ - result = [] - for comment in comments: - sub_comments = comment.get("comments") - if sub_comments: - if callback: - await callback(comment.get("id"), sub_comments) - await asyncio.sleep(crawl_interval) - result.extend(sub_comments) - return result + if not config.ENABLE_GET_SUB_COMMENTS: + return [] + + # todo 未完成子评论的爬取 diff --git a/media_platform/tieba/core.py b/media_platform/tieba/core.py index a03f0ad..2d10a0a 100644 --- a/media_platform/tieba/core.py +++ b/media_platform/tieba/core.py @@ -114,10 +114,12 @@ class TieBaCrawler(AbstractCrawler): self.get_note_detail_async_task(note_id=note_id, semaphore=semaphore) for note_id in note_id_list ] note_details = await 
asyncio.gather(*task_list) + note_details_model: List[TiebaNote] = [] for note_detail in note_details: if note_detail is not None: + note_details_model.append(note_detail) await tieba_store.update_tieba_note(note_detail) - await self.batch_get_note_comments(config.TIEBA_SPECIFIED_ID_LIST) + await self.batch_get_note_comments(note_details_model) async def get_note_detail_async_task(self, note_id: str, semaphore: asyncio.Semaphore) -> Optional[TiebaNote]: """ @@ -146,42 +148,39 @@ class TieBaCrawler(AbstractCrawler): f"[BaiduTieBaCrawler.get_note_detail] have not fund note detail note_id:{note_id}, err: {ex}") return None - async def batch_get_note_comments(self, note_id_list: List[str]): + async def batch_get_note_comments(self, note_detail_list: List[TiebaNote]): """ Batch get note comments Args: - note_id_list: + note_detail_list: Returns: """ if not config.ENABLE_GET_COMMENTS: - utils.logger.info(f"[BaiduTieBaCrawler.batch_get_note_comments] Crawling comment mode is not enabled") return - utils.logger.info( - f"[BaiduTieBaCrawler.batch_get_note_comments] Begin batch get note comments, note list: {note_id_list}") semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) task_list: List[Task] = [] - for note_id in note_id_list: - task = asyncio.create_task(self.get_comments_async_task(note_id, semaphore), name=note_id) + for note_detail in note_detail_list: + task = asyncio.create_task(self.get_comments_async_task(note_detail, semaphore), name=note_detail.note_id) task_list.append(task) await asyncio.gather(*task_list) - async def get_comments_async_task(self, note_id: str, semaphore: asyncio.Semaphore): + async def get_comments_async_task(self, note_detail: TiebaNote, semaphore: asyncio.Semaphore): """ Get comments async task Args: - note_id: + note_detail: semaphore: Returns: """ async with semaphore: - utils.logger.info(f"[BaiduTieBaCrawler.get_comments] Begin get note id comments {note_id}") + utils.logger.info(f"[BaiduTieBaCrawler.get_comments] Begin get note 
id comments {note_detail.note_id}") await self.tieba_client.get_note_all_comments( - note_id=note_id, + note_detail=note_detail, crawl_interval=random.random(), callback=tieba_store.batch_update_tieba_note_comments ) diff --git a/media_platform/tieba/help.py b/media_platform/tieba/help.py index efba258..2297855 100644 --- a/media_platform/tieba/help.py +++ b/media_platform/tieba/help.py @@ -1,10 +1,12 @@ # -*- coding: utf-8 -*- import re +import json +import html from typing import List, Dict, Tuple from parsel import Selector -from model.m_baidu_tieba import TiebaNote +from model.m_baidu_tieba import TiebaNote, TiebaComment from constant import baidu_tieba as const @@ -40,7 +42,6 @@ class TieBaExtractor: result.append(tieba_note) return result - def extract_note_detail(self, page_content: str) -> TiebaNote: """ 提取贴吧帖子详情 @@ -66,8 +67,10 @@ class TieBaExtractor: title=content_selector.xpath("//title/text()").get(default='').strip(), desc=content_selector.xpath("//meta[@name='description']/@content").get(default='').strip(), note_url=const.TIEBA_URL + f"/p/{note_id}", - user_link=const.TIEBA_URL + first_floor_selector.xpath(".//a[@class='p_author_face ']/@href").get(default='').strip(), - user_nickname=first_floor_selector.xpath(".//a[@class='p_author_name j_user_card']/text()").get(default='').strip(), + user_link=const.TIEBA_URL + first_floor_selector.xpath(".//a[@class='p_author_face ']/@href").get( + default='').strip(), + user_nickname=first_floor_selector.xpath(".//a[@class='p_author_name j_user_card']/text()").get( + default='').strip(), user_avatar=first_floor_selector.xpath(".//a[@class='p_author_face ']/img/@src").get(default='').strip(), tieba_name=content_selector.xpath("//a[@class='card_title_fname']/text()").get(default='').strip(), tieba_link=const.TIEBA_URL + content_selector.xpath("//a[@class='card_title_fname']/@href").get(default=''), @@ -79,33 +82,44 @@ class TieBaExtractor: note.title = note.title.replace(f"【{note.tieba_name}】_百度贴吧", "") return 
note - @staticmethod - def extract_tieba_note_comments(page_content: str) -> List[Dict]: + def extract_tieba_note_parment_comments(self, page_content: str, note_id: str) -> List[TiebaComment]: """ - 提取贴吧帖子评论 + 提取贴吧帖子一级评论 Args: page_content: + note_id: Returns: """ - xpath_selector = "//div[@id='j_p_postlist']/div[@class='l_post l_post_bright j_l_post clearfix']" + xpath_selector = "//div[@class='l_post l_post_bright j_l_post clearfix ']" comment_list = Selector(text=page_content).xpath(xpath_selector) - result = [] - for comment in comment_list: - comment_id = comment.xpath(".//@data-pid").get(default='').strip() - author = comment.xpath(".//a[@data-field]/text()").get(default='').strip() - author_link = comment.xpath(".//a[@data-field]/@href").get(default='') - content = comment.xpath(".//div[@class='d_post_content j_d_post_content ']/text()").get(default='').strip() - date = comment.xpath(".//span[@class='tail-info']/text()").get(default='').strip() + result: List[TiebaComment] = [] + for comment_selector in comment_list: + comment_field_value: Dict = self.extract_data_field_value(comment_selector) + if not comment_field_value: + continue - result.append({ - "comment_id": comment_id, - "author": author, - "author_link": author_link, - "content": content, - "time": date, - }) + other_info_content = comment_selector.xpath(".//div[@class='post-tail-wrap']").get(default="").strip() + ip_location, publish_time = self.extract_ip_and_pub_time(other_info_content) + tieba_comment = TiebaComment( + comment_id=str(comment_field_value.get("content").get("post_id")), + sub_comment_count=comment_field_value.get("content").get("comment_num"), + content=comment_field_value.get("content").get("content"), + note_url=const.TIEBA_URL + f"/p/{note_id}", + user_link=const.TIEBA_URL + comment_selector.xpath(".//a[@class='p_author_face ']/@href").get(default='').strip(), + user_nickname=comment_selector.xpath(".//a[@class='p_author_name j_user_card']/text()").get( + default='').strip(), 
+ user_avatar=comment_selector.xpath(".//a[@class='p_author_face ']/img/@src").get( + default='').strip(), + tieba_name=comment_selector.xpath("//a[@class='card_title_fname']/text()").get(default='').strip(), + ip_location=ip_location, + publish_time=publish_time, + note_id=note_id, + ) + print(tieba_comment.model_dump()) + result.append(tieba_comment) + return result @staticmethod def extract_ip_and_pub_time(html_content: str) -> Tuple[str, str]: @@ -125,6 +139,31 @@ class TieBaExtractor: pub_time = time_match.group(1) if time_match else "" return ip, pub_time + @staticmethod + def extract_data_field_value(selector: Selector) -> Dict: + """ + 提取data-field的值 + Args: + selector: + + Returns: + + """ + data_field_value = selector.xpath("./@data-field").get(default='').strip() + if not data_field_value or data_field_value == "{}": + return {} + try: + # 先使用 html.unescape 处理转义字符 再json.loads 将 JSON 字符串转换为 Python 字典 + unescaped_json_str = html.unescape(data_field_value) + data_field_dict_value = json.loads(unescaped_json_str) + except Exception as ex: + print(f"extract_data_field_value,错误信息:{ex}, 尝试使用其他方式解析") + data_field_dict_value = {} + return data_field_dict_value + + + + def test_extract_search_note_list(): with open("test_data/search_keyword_notes.html", "r", encoding="utf-8") as f: content = f.read() @@ -140,7 +179,14 @@ def test_extract_note_detail(): result = extractor.extract_note_detail(content) print(result.model_dump()) +def test_extract_tieba_note_parment_comments(): + with open("test_data/note_comments.html", "r", encoding="utf-8") as f: + content = f.read() + extractor = TieBaExtractor() + result = extractor.extract_tieba_note_parment_comments(content, "123456") + print(result) if __name__ == '__main__': - test_extract_search_note_list() - test_extract_note_detail() + # test_extract_search_note_list() + # test_extract_note_detail() + test_extract_tieba_note_parment_comments() diff --git a/media_platform/tieba/test_data/note_comments.html 
b/media_platform/tieba/test_data/note_comments.html new file mode 100644 index 0000000..5d08b07 --- /dev/null +++ b/media_platform/tieba/test_data/note_comments.html @@ -0,0 +1,874 @@ +【强烈恭喜】全红婵陈宇汐包揽跳水女子10米台巴黎奥运金银牌!【网球风云吧】_百度贴吧 + + + + + + + + +
                                                                                                                                                                                      网球风云吧 关注:48,523贴子:5,418,043
                                                                                                                                                                                      +
                                                                                                                                                                                      + + +
                                                                                                                                                                                      + 贴子管理 +
                                                                                                                                                                                        + +
                                                                                                                                                                                      +
                                                                                                                                                                                      +
                                                                                                                                                                                      +
                                                                                                                                                                                      +

                                                                                                                                                                                      【强烈恭喜】全红婵陈宇汐包揽跳水女子10米台巴黎奥运金银牌!

                                                                                                                                                                                      只看楼主收藏回复

                                                                                                                                                                                      +
                                                                                                                                                                                      + +
                                                                                                                                                                                      +
                                                                                                                                                                                      +
                                                                                                                                                                                      +
                                                                                                                                                                                      +
                                                                                                                                                                                      中国队第22金!无悬念!



                                                                                                                                                                                      IP属地:福建来自Android客户端1楼2024-08-06 22:09回复
                                                                                                                                                                                        +
                                                                                                                                                                                        + +
                                                                                                                                                                                        +
                                                                                                                                                                                        全后卫冕成功,还是动作质量高,小炸也赢了


                                                                                                                                                                                        IP属地:福建来自Android客户端2楼2024-08-06 22:10
                                                                                                                                                                                        收起回复
                                                                                                                                                                                          全后卫冕,太好了


                                                                                                                                                                                          IP属地:江苏来自Android客户端3楼2024-08-06 22:10
                                                                                                                                                                                          收起回复
                                                                                                                                                                                            毫无悬念


                                                                                                                                                                                            IP属地:上海来自Android客户端4楼2024-08-06 22:10
                                                                                                                                                                                            收起回复
                                                                                                                                                                                              皇后回宫


                                                                                                                                                                                              IP属地:广西来自Android客户端5楼2024-08-06 22:10
                                                                                                                                                                                              收起回复
                                                                                                                                                                                                可惜了,既生婵何生汐


                                                                                                                                                                                                IP属地:湖北来自Android客户端6楼2024-08-06 22:10
                                                                                                                                                                                                收起回复
                                                                                                                                                                                                  全最后水花那么大 居然不是8分


                                                                                                                                                                                                  IP属地:中国澳门来自Android客户端7楼2024-08-06 22:10
                                                                                                                                                                                                  收起回复
                                                                                                                                                                                                    除了第三跳小炸一下,其余的都很棒了…


                                                                                                                                                                                                    IP属地:四川来自iPhone客户端8楼2024-08-06 22:10
                                                                                                                                                                                                    收起回复
                                                                                                                                                                                                      陈完美发挥了还是打不过,没办法


                                                                                                                                                                                                      IP属地:福建来自iPhone客户端9楼2024-08-06 22:10
                                                                                                                                                                                                      收起回复


                                                                                                                                                                                                        IP属地:广东来自Android客户端10楼2024-08-06 22:11
                                                                                                                                                                                                        回复
                                                                                                                                                                                                          恭喜全,陈也蛮惨的,好在是有女双金


                                                                                                                                                                                                          IP属地:江苏来自iPhone客户端11楼2024-08-06 22:11
                                                                                                                                                                                                          收起回复
                                                                                                                                                                                                            陈芋汐简直就是跳水队版孙颖莎


                                                                                                                                                                                                            IP属地:陕西来自Android客户端12楼2024-08-06 22:11
                                                                                                                                                                                                            收起回复
                                                                                                                                                                                                              恭喜全后卫冕 也恭喜汐贵妃银牌,汐贵妃挺遗憾的,不管怎么样还是恭喜两位


                                                                                                                                                                                                              IP属地:广东来自iPhone客户端13楼2024-08-06 22:11
                                                                                                                                                                                                              收起回复
                                                                                                                                                                                                                强烈恭喜


                                                                                                                                                                                                                IP属地:广西来自Android客户端14楼2024-08-06 22:11
                                                                                                                                                                                                                回复
                                                                                                                                                                                                                  恭喜全后卫冕成功


                                                                                                                                                                                                                  IP属地:江苏来自iPhone客户端15楼2024-08-06 22:11
                                                                                                                                                                                                                  回复
                                                                                                                                                                                                                    恭喜全后卫冕


                                                                                                                                                                                                                    IP属地:上海来自Android客户端16楼2024-08-06 22:11
                                                                                                                                                                                                                    收起回复
                                                                                                                                                                                                                      全后真的是后。。。确实今天有点紧,正常应该在440-450左右。。。


                                                                                                                                                                                                                      IP属地:上海17楼2024-08-06 22:11
                                                                                                                                                                                                                      收起回复
                                                                                                                                                                                                                        这俩看谁能先熬过对方吧,恭喜


                                                                                                                                                                                                                        IP属地:上海18楼2024-08-06 22:11
                                                                                                                                                                                                                        收起回复
                                                                                                                                                                                                                          全身体姿态确实更好看


                                                                                                                                                                                                                          IP属地:广西来自Android客户端19楼2024-08-06 22:12
                                                                                                                                                                                                                          收起回复
                                                                                                                                                                                                                            质量好,分数没啥问题,主要是207不炸基本没悬念


                                                                                                                                                                                                                            IP属地:上海来自Android客户端20楼2024-08-06 22:12
                                                                                                                                                                                                                            收起回复
                                                                                                                                                                                                                              恭喜


                                                                                                                                                                                                                              IP属地:山东来自Android客户端21楼2024-08-06 22:12
                                                                                                                                                                                                                              收起回复
                                                                                                                                                                                                                                陈这个周期是不是压着全,吊打了,结果巴黎还是输了好难过哦


                                                                                                                                                                                                                                IP属地:上海来自Android客户端22楼2024-08-06 22:12
                                                                                                                                                                                                                                收起回复
                                                                                                                                                                                                                                  207没炸炸了6组动作也没想到


                                                                                                                                                                                                                                  IP属地:山东来自iPhone客户端23楼2024-08-06 22:12
                                                                                                                                                                                                                                  收起回复
                                                                                                                                                                                                                                    恭喜两位,都很棒


                                                                                                                                                                                                                                    IP属地:广东来自iPhone客户端24楼2024-08-06 22:12
                                                                                                                                                                                                                                    收起回复
                                                                                                                                                                                                                                      汐贵妃有点惨。。相比预赛半决赛已经特别好了,今天机会很大的。。。


                                                                                                                                                                                                                                      IP属地:上海25楼2024-08-06 22:12
                                                                                                                                                                                                                                      收起回复
                                                                                                                                                                                                                                        稳稳的幸福


                                                                                                                                                                                                                                        IP属地:安徽来自iPhone客户端26楼2024-08-06 22:12
                                                                                                                                                                                                                                        回复
                                                                                                                                                                                                                                          心疼陈宇汐一秒


                                                                                                                                                                                                                                          IP属地:安徽27楼2024-08-06 22:12
                                                                                                                                                                                                                                          收起回复
                                                                                                                                                                                                                                            全后居然因为赢而哭,真的长大不少,汐贵妃好无奈


                                                                                                                                                                                                                                            IP属地:广东来自iPhone客户端28楼2024-08-06 22:12
                                                                                                                                                                                                                                            收起回复
                                                                                                                                                                                                                                              陈真的太遗憾了


                                                                                                                                                                                                                                              IP属地:安徽来自Android客户端29楼2024-08-06 22:12
                                                                                                                                                                                                                                              回复
                                                                                                                                                                                                                                                汐贵妃最后神情有点落寞


                                                                                                                                                                                                                                                IP属地:湖南来自iPhone客户端30楼2024-08-06 22:12
                                                                                                                                                                                                                                                收起回复
                                                                                                                                                                                                                                                  +
                                                                                                                                                                                                                                                  + + + + + + + + + + + + + + + + +
                                                                                                                                                                                                                                                  + 广告 + +
                                                                                                                                                                                                                                                  + + + +
                                                                                                                                                                                                                                                  +
                                                                                                                                                                                                                                                  + +
                                                                                                                                                                                                                                                  \ No newline at end of file diff --git a/model/m_baidu_tieba.py b/model/m_baidu_tieba.py index 6f420dc..95b0175 100644 --- a/model/m_baidu_tieba.py +++ b/model/m_baidu_tieba.py @@ -4,6 +4,9 @@ from pydantic import BaseModel, Field class TiebaNote(BaseModel): + """ + 百度贴吧帖子 + """ note_id: str = Field(..., description="帖子ID") title: str = Field(..., description="帖子标题") desc: str = Field(default="", description="帖子描述") @@ -17,3 +20,23 @@ class TiebaNote(BaseModel): total_replay_num: int = Field(default=0, description="回复总数") total_replay_page: int = Field(default=0, description="回复总页数") ip_location: Optional[str] = Field(default="", description="IP地理位置") + + +class TiebaComment(BaseModel): + """ + 百度贴吧评论 + """ + + comment_id: str = Field(..., description="评论ID") + parment_comment_id: str = Field(default="", description="父评论ID") + content: str = Field(..., description="评论内容") + user_link: str = Field(default="", description="用户主页链接") + user_nickname: str = Field(default="", description="用户昵称") + user_avatar: str = Field(default="", description="用户头像地址") + publish_time: str = Field(default="", description="发布时间") + ip_location: Optional[str] = Field(default="", description="IP地理位置") + sub_comment_count: int = Field(default=0, description="子评论数") + note_id: str = Field(..., description="帖子ID") + note_url: str = Field(..., description="帖子链接") + tieba_name: str = Field(..., description="所属的贴吧名称") + diff --git a/schema/tables.sql b/schema/tables.sql index 2aadb38..c5737f9 100644 --- a/schema/tables.sql +++ b/schema/tables.sql @@ -371,4 +371,27 @@ CREATE TABLE tieba_note last_modify_ts BIGINT NOT NULL COMMENT '最后修改时间戳', KEY `idx_tieba_note_note_id` (`note_id`), KEY `idx_tieba_note_publish_time` (`publish_time`) -) ENGINE=InnoDB 
AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='贴吧帖子表'; \ No newline at end of file +) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='贴吧帖子表'; + +DROP TABLE IF EXISTS `tieba_comment`; +CREATE TABLE tieba_comment +( + id BIGINT AUTO_INCREMENT PRIMARY KEY, + comment_id VARCHAR(255) NOT NULL COMMENT '评论ID', + parment_comment_id VARCHAR(255) DEFAULT '' COMMENT '父评论ID', + content TEXT NOT NULL COMMENT '评论内容', + user_link VARCHAR(255) DEFAULT '' COMMENT '用户主页链接', + user_nickname VARCHAR(255) DEFAULT '' COMMENT '用户昵称', + user_avatar VARCHAR(255) DEFAULT '' COMMENT '用户头像地址', + publish_time VARCHAR(255) DEFAULT '' COMMENT '发布时间', + ip_location VARCHAR(255) DEFAULT '' COMMENT 'IP地理位置', + sub_comment_count INT DEFAULT 0 COMMENT '子评论数', + note_id VARCHAR(255) NOT NULL COMMENT '帖子ID', + note_url VARCHAR(255) NOT NULL COMMENT '帖子链接', + tieba_name VARCHAR(255) NOT NULL COMMENT '所属的贴吧名称', + add_ts BIGINT NOT NULL COMMENT '添加时间戳', + last_modify_ts BIGINT NOT NULL COMMENT '最后修改时间戳', + KEY `idx_tieba_comment_comment_id` (`note_id`), + KEY `idx_tieba_comment_note_id` (`note_id`), + KEY `idx_tieba_comment_publish_time` (`publish_time`) +) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='贴吧评论表'; \ No newline at end of file diff --git a/store/tieba/__init__.py b/store/tieba/__init__.py index efaa6cc..e358bb6 100644 --- a/store/tieba/__init__.py +++ b/store/tieba/__init__.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- from typing import List -from model.m_baidu_tieba import TiebaNote +from model.m_baidu_tieba import TiebaNote, TiebaComment from . 
import tieba_store_impl from .tieba_store_impl import * @@ -38,14 +38,23 @@ async def update_tieba_note(note_item: TiebaNote): await TieBaStoreFactory.create_store().store_content(save_note_item) -async def batch_update_tieba_note_comments(note_id: str, comments: List[Dict]): +async def batch_update_tieba_note_comments(note_id:str, comments: List[TiebaComment]): + """ + Batch update tieba note comments + Args: + note_id: + comments: + + Returns: + + """ if not comments: return for comment_item in comments: await update_tieba_note_comment(note_id, comment_item) -async def update_tieba_note_comment(note_id: str, comment_item: Dict): +async def update_tieba_note_comment(note_id: str, comment_item: TiebaComment): """ Update tieba note comment Args: @@ -55,23 +64,7 @@ async def update_tieba_note_comment(note_id: str, comment_item: Dict): Returns: """ - user_info = comment_item.get("user_info", {}) - comment_id = comment_item.get("id") - comment_pictures = [item.get("url_default", "") for item in comment_item.get("pictures", [])] - target_comment = comment_item.get("target_comment", {}) - local_db_item = { - "comment_id": comment_id, - "create_time": comment_item.get("create_time"), - "ip_location": comment_item.get("ip_location"), - "note_id": note_id, - "content": comment_item.get("content"), - "user_id": user_info.get("user_id"), - "nickname": user_info.get("nickname"), - "avatar": user_info.get("image"), - "sub_comment_count": comment_item.get("sub_comment_count", 0), - "pictures": ",".join(comment_pictures), - "parent_comment_id": target_comment.get("id", 0), - "last_modify_ts": utils.get_current_timestamp(), - } - utils.logger.info(f"[store.tieba.update_tieba_note_comment] tieba note comment:{local_db_item}") - await TieBaStoreFactory.create_store().store_comment(local_db_item) + save_comment_item = comment_item.model_dump() + save_comment_item.update({"last_modify_ts": utils.get_current_timestamp()}) + utils.logger.info(f"[store.tieba.update_tieba_note_comment] 
tieba note id: {note_id} comment:{save_comment_item}") + await TieBaStoreFactory.create_store().store_comment(save_comment_item) From 1208682a9a844b0d22cb67eb79386a2244c25c93 Mon Sep 17 00:00:00 2001 From: Relakkes Date: Wed, 7 Aug 2024 02:39:50 +0800 Subject: [PATCH 6/8] =?UTF-8?q?fix:=20=E8=AF=84=E8=AE=BA=E7=A7=BB=E9=99=A4?= =?UTF-8?q?html=E6=A0=87=E7=AD=BE=E5=86=85=E5=AE=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- media_platform/tieba/help.py | 4 ++-- tools/crawler_util.py | 10 +++++++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/media_platform/tieba/help.py b/media_platform/tieba/help.py index 2297855..1225e7a 100644 --- a/media_platform/tieba/help.py +++ b/media_platform/tieba/help.py @@ -8,6 +8,7 @@ from parsel import Selector from model.m_baidu_tieba import TiebaNote, TiebaComment from constant import baidu_tieba as const +from tools import utils class TieBaExtractor: @@ -105,7 +106,7 @@ class TieBaExtractor: tieba_comment = TiebaComment( comment_id=str(comment_field_value.get("content").get("post_id")), sub_comment_count=comment_field_value.get("content").get("comment_num"), - content=comment_field_value.get("content").get("content"), + content=utils.extract_text_from_html(comment_field_value.get("content").get("content")), note_url=const.TIEBA_URL + f"/p/{note_id}", user_link=const.TIEBA_URL + comment_selector.xpath(".//a[@class='p_author_face ']/@href").get(default='').strip(), user_nickname=comment_selector.xpath(".//a[@class='p_author_name j_user_card']/text()").get( @@ -117,7 +118,6 @@ class TieBaExtractor: publish_time=publish_time, note_id=note_id, ) - print(tieba_comment.model_dump()) result.append(tieba_comment) return result diff --git a/tools/crawler_util.py b/tools/crawler_util.py index 8e37881..9c54f2a 100644 --- a/tools/crawler_util.py +++ b/tools/crawler_util.py @@ -146,4 +146,12 @@ def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optio 
httpx_proxy = { f"{ip_proxy_info.protocol}": f"http://{ip_proxy_info.user}:{ip_proxy_info.password}@{ip_proxy_info.ip}:{ip_proxy_info.port}" } - return playwright_proxy, httpx_proxy \ No newline at end of file + return playwright_proxy, httpx_proxy + +def extract_text_from_html(html: str) -> str: + """Extract text from HTML, removing all tags.""" + # Remove script and style elements + clean_html = re.sub(r'<(script|style)[^>]*>.*?', '', html, flags=re.DOTALL) + # Remove all other tags + clean_text = re.sub(r'<[^>]+>', '', clean_html).strip() + return clean_text From df0f5c1113099daa8aafc63dcbc725270c55a4ca Mon Sep 17 00:00:00 2001 From: Relakkes Date: Wed, 7 Aug 2024 04:13:15 +0800 Subject: [PATCH 7/8] =?UTF-8?q?feat:=20=E7=99=BE=E5=BA=A6=E8=B4=B4?= =?UTF-8?q?=E5=90=A7=E5=AD=90=E8=AF=84=E8=AE=BAdone?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- config/base_config.py | 4 +- media_platform/tieba/client.py | 53 ++++- media_platform/tieba/help.py | 74 ++++++- .../tieba/test_data/note_sub_comments.html | 189 ++++++++++++++++++ model/m_baidu_tieba.py | 4 +- schema/tables.sql | 45 +++-- 6 files changed, 328 insertions(+), 41 deletions(-) create mode 100644 media_platform/tieba/test_data/note_sub_comments.html diff --git a/config/base_config.py b/config/base_config.py index 96d87b1..53dc8bf 100644 --- a/config/base_config.py +++ b/config/base_config.py @@ -1,6 +1,6 @@ # 基础配置 PLATFORM = "xhs" -KEYWORDS = "缅甸边境,缅北边境,缅北边境线,缅甸边境线" +KEYWORDS = "编程副业,编程兼职" LOGIN_TYPE = "qrcode" # qrcode or phone or cookie COOKIES = "" # 具体值参见media_platform.xxx.field下的枚举值,暂时只支持小红书 @@ -50,7 +50,7 @@ ENABLE_GET_COMMENTS = True # 是否开启爬二级评论模式, 默认不开启爬二级评论 # 老版本项目使用了 db, 则需参考 schema/tables.sql line 287 增加表字段 -ENABLE_GET_SUB_COMMENTS = False +ENABLE_GET_SUB_COMMENTS = True # 指定小红书需要爬虫的笔记ID列表 XHS_SPECIFIED_ID_LIST = [ diff --git a/media_platform/tieba/client.py b/media_platform/tieba/client.py index 2aa6cde..2ae4304 100644 --- 
a/media_platform/tieba/client.py +++ b/media_platform/tieba/client.py @@ -28,7 +28,10 @@ class BaiduTieBaClient(AbstractApiClient): ): self.ip_pool: Optional[ProxyIpPool] = ip_pool self.timeout = timeout - self.headers = utils.get_user_agent() + self.headers = { + "User-Agent": utils.get_user_agent(), + "Cookies": "", + } self._host = "https://tieba.baidu.com" self._page_extractor = TieBaExtractor() self.default_ip_proxy = default_ip_proxy @@ -51,7 +54,7 @@ class BaiduTieBaClient(AbstractApiClient): async with httpx.AsyncClient(proxies=actual_proxies) as client: response = await client.request( method, url, timeout=self.timeout, - **kwargs + headers=self.headers, **kwargs ) if response.status_code != 200: @@ -99,7 +102,7 @@ class BaiduTieBaClient(AbstractApiClient): self.default_ip_proxy = proxies return res - utils.logger.error(f"[BaiduTieBaClient.get] 达到了最大重试次数,请尝试更换新的IP代理: {e}") + utils.logger.error(f"[BaiduTieBaClient.get] 达到了最大重试次数,IP已经被Block,请尝试更换新的IP代理: {e}") raise e async def post(self, uri: str, data: dict, **kwargs) -> Dict: @@ -154,7 +157,6 @@ class BaiduTieBaClient(AbstractApiClient): page_size: int = 10, sort: SearchSortType = SearchSortType.TIME_DESC, note_type: SearchNoteType = SearchNoteType.FIXED_THREAD, - random_sleep: bool = True ) -> List[TiebaNote]: """ 根据关键词搜索贴吧帖子 @@ -164,8 +166,6 @@ class BaiduTieBaClient(AbstractApiClient): page_size: 每页大小 sort: 结果排序方式 note_type: 帖子类型(主题贴|主题+回复混合模式) - random_sleep: 是否随机休眠 - Returns: """ @@ -179,8 +179,6 @@ class BaiduTieBaClient(AbstractApiClient): "only_thread": note_type.value } page_content = await self.get(uri, params=params, return_ori_content=True) - if random_sleep: - random.randint(1, 5) return self._page_extractor.extract_search_note_list(page_content) async def get_note_by_id(self, note_id: str) -> TiebaNote: @@ -216,17 +214,20 @@ class BaiduTieBaClient(AbstractApiClient): "pn": current_page } page_content = await self.get(uri, params=params, return_ori_content=True) - comments = 
self._page_extractor.extract_tieba_note_parment_comments(page_content, note_id=note_detail.note_id) + comments = self._page_extractor.extract_tieba_note_parment_comments(page_content, + note_id=note_detail.note_id) if not comments: break if callback: await callback(note_detail.note_id, comments) result.extend(comments) + # 获取所有子评论 + await self.get_comments_all_sub_comments(comments, crawl_interval=crawl_interval, callback=callback) await asyncio.sleep(crawl_interval) current_page += 1 return result - async def get_comments_all_sub_comments(self, comments: List[Dict], crawl_interval: float = 1.0, + async def get_comments_all_sub_comments(self, comments: List[TiebaComment], crawl_interval: float = 1.0, callback: Optional[Callable] = None) -> List[TiebaComment]: """ 获取指定评论下的所有子评论 @@ -238,7 +239,37 @@ class BaiduTieBaClient(AbstractApiClient): Returns: """ + uri = "/p/comment" if not config.ENABLE_GET_SUB_COMMENTS: return [] - # todo 未完成子评论的爬取 + # # 贴吧获取所有子评论需要登录态 + # if self.headers.get("Cookies") == "" or not self.pong(): + # raise Exception(f"[BaiduTieBaClient.pong] Cookies is empty, please login first...") + + all_sub_comments: List[TiebaComment] = [] + for comment in comments: + if comment.sub_comment_count == 0: + continue + + current_page = 1 + max_sub_page_num = comment.sub_comment_count // 10 + 1 + while max_sub_page_num >= current_page: + params = { + "tid": comment.note_id, # 帖子ID + "pid": comment.comment_id, # 父级评论ID + "fid": comment.tieba_id, # 贴吧ID + "pn": current_page # 页码 + } + page_content = await self.get(uri, params=params, return_ori_content=True) + sub_comments = self._page_extractor.extract_tieba_note_sub_comments(page_content, + parent_comment=comment) + + if not sub_comments: + break + if callback: + await callback(comment.note_id, sub_comments) + all_sub_comments.extend(sub_comments) + await asyncio.sleep(crawl_interval) + current_page += 1 + return all_sub_comments diff --git a/media_platform/tieba/help.py b/media_platform/tieba/help.py index 
1225e7a..b46081d 100644 --- a/media_platform/tieba/help.py +++ b/media_platform/tieba/help.py @@ -100,7 +100,7 @@ class TieBaExtractor: comment_field_value: Dict = self.extract_data_field_value(comment_selector) if not comment_field_value: continue - + tieba_name = comment_selector.xpath("//a[@class='card_title_fname']/text()").get(default='').strip() other_info_content = comment_selector.xpath(".//div[@class='post-tail-wrap']").get(default="").strip() ip_location, publish_time = self.extract_ip_and_pub_time(other_info_content) tieba_comment = TiebaComment( @@ -108,12 +108,15 @@ class TieBaExtractor: sub_comment_count=comment_field_value.get("content").get("comment_num"), content=utils.extract_text_from_html(comment_field_value.get("content").get("content")), note_url=const.TIEBA_URL + f"/p/{note_id}", - user_link=const.TIEBA_URL + comment_selector.xpath(".//a[@class='p_author_face ']/@href").get(default='').strip(), + user_link=const.TIEBA_URL + comment_selector.xpath(".//a[@class='p_author_face ']/@href").get( + default='').strip(), user_nickname=comment_selector.xpath(".//a[@class='p_author_name j_user_card']/text()").get( default='').strip(), user_avatar=comment_selector.xpath(".//a[@class='p_author_face ']/img/@src").get( default='').strip(), - tieba_name=comment_selector.xpath("//a[@class='card_title_fname']/text()").get(default='').strip(), + tieba_id=str(comment_field_value.get("content").get("forum_id", "")), + tieba_name=tieba_name, + tieba_link=f"https://tieba.baidu.com/f?kw={tieba_name}", ip_location=ip_location, publish_time=publish_time, note_id=note_id, @@ -121,6 +124,45 @@ class TieBaExtractor: result.append(tieba_comment) return result + + def extract_tieba_note_sub_comments(self,page_content: str, parent_comment: TiebaComment) -> List[TiebaComment]: + """ + 提取贴吧帖子二级评论 + Args: + page_content: + parent_comment: + + Returns: + + """ + selector = Selector(page_content) + comments = [] + comment_ele_list = selector.xpath("//li[@class='lzl_single_post 
j_lzl_s_p first_no_border']") + comment_ele_list.extend(selector.xpath("//li[@class='lzl_single_post j_lzl_s_p ']")) + for comment_ele in comment_ele_list: + comment_value = self.extract_data_field_value(comment_ele) + if not comment_value: + continue + comment_user_a_selector = comment_ele.xpath("./a[@class='j_user_card lzl_p_p']")[0] + content = utils.extract_text_from_html(comment_ele.xpath(".//span[@class='lzl_content_main']").get(default="")) + comment = TiebaComment( + comment_id=str(comment_value.get("spid")), + content=content, + user_link=comment_user_a_selector.xpath("./@href").get(default=""), + user_nickname=comment_value.get("showname"), + user_avatar=comment_user_a_selector.xpath("./img/@src").get(default=""), + publish_time=comment_ele.xpath(".//span[@class='lzl_time']/text()").get(default="").strip(), + parent_comment_id=parent_comment.comment_id, + note_id=parent_comment.note_id, + note_url=parent_comment.note_url, + tieba_id=parent_comment.tieba_id, + tieba_name=parent_comment.tieba_name, + tieba_link=parent_comment.tieba_link + ) + comments.append(comment) + + return comments + @staticmethod def extract_ip_and_pub_time(html_content: str) -> Tuple[str, str]: """ @@ -162,8 +204,6 @@ class TieBaExtractor: return data_field_dict_value - - def test_extract_search_note_list(): with open("test_data/search_keyword_notes.html", "r", encoding="utf-8") as f: content = f.read() @@ -179,6 +219,7 @@ def test_extract_note_detail(): result = extractor.extract_note_detail(content) print(result.model_dump()) + def test_extract_tieba_note_parment_comments(): with open("test_data/note_comments.html", "r", encoding="utf-8") as f: content = f.read() @@ -186,7 +227,28 @@ def test_extract_tieba_note_parment_comments(): result = extractor.extract_tieba_note_parment_comments(content, "123456") print(result) +def test_extract_tieba_note_sub_comments(): + with open("test_data/note_sub_comments.html", "r", encoding="utf-8") as f: + content = f.read() + extractor = 
TieBaExtractor() + fake_parment_comment = TiebaComment( + comment_id="123456", + content="content", + user_link="user_link", + user_nickname="user_nickname", + user_avatar="user_avatar", + publish_time="publish_time", + parent_comment_id="parent_comment_id", + note_id="note_id", + note_url="note_url", + tieba_id="tieba_id", + tieba_name="tieba_name", + ) + result = extractor.extract_tieba_note_sub_comments(content,fake_parment_comment) + print(result) + if __name__ == '__main__': # test_extract_search_note_list() # test_extract_note_detail() - test_extract_tieba_note_parment_comments() + # test_extract_tieba_note_parment_comments() + test_extract_tieba_note_sub_comments() diff --git a/media_platform/tieba/test_data/note_sub_comments.html b/media_platform/tieba/test_data/note_sub_comments.html new file mode 100644 index 0000000..a8fe3eb --- /dev/null +++ b/media_platform/tieba/test_data/note_sub_comments.html @@ -0,0 +1,189 @@ +
                                                                                                                                                                                                                                                • + + + + +
                                                                                                                                                                                                                                                  + heinzfrentzen + : + + + + +
                                                                                                                                                                                                                                                  + + + 2024-8-6 22:11 + 回复 +
                                                                                                                                                                                                                                                  +
                                                                                                                                                                                                                                                  +
                                                                                                                                                                                                                                                • +
                                                                                                                                                                                                                                                • + + + + +
                                                                                                                                                                                                                                                  + 可爱的搬运工94 + :陈芋汐水花也不小 +
                                                                                                                                                                                                                                                  + + + 2024-8-6 22:12 + 回复 +
                                                                                                                                                                                                                                                  +
                                                                                                                                                                                                                                                  +
                                                                                                                                                                                                                                                • +
                                                                                                                                                                                                                                                • + + + + +
                                                                                                                                                                                                                                                  + 国际体坛巨星青椒肉丝 + :你怀孕了吗 老是呕吐 +
                                                                                                                                                                                                                                                  + + + 2024-8-6 22:12 + 回复 +
                                                                                                                                                                                                                                                  +
                                                                                                                                                                                                                                                  +
                                                                                                                                                                                                                                                • +
                                                                                                                                                                                                                                                • + + + + +
                                                                                                                                                                                                                                                  + 茗花少帅 + :你就只看水花,不看空中姿态吗 +
                                                                                                                                                                                                                                                  + + + 2024-8-6 22:12 + 回复 +
                                                                                                                                                                                                                                                  +
                                                                                                                                                                                                                                                  +
                                                                                                                                                                                                                                                • +
                                                                                                                                                                                                                                                • + + + + +
                                                                                                                                                                                                                                                  + 东华武兰 + :经典只看水花 +
                                                                                                                                                                                                                                                  + + + 2024-8-6 22:12 + 回复 +
                                                                                                                                                                                                                                                  +
                                                                                                                                                                                                                                                  +
                                                                                                                                                                                                                                                • +
                                                                                                                                                                                                                                                • + + + + +
                                                                                                                                                                                                                                                  + 上下班要注意 + :额,分数正常吧 +
                                                                                                                                                                                                                                                  + + + 2024-8-6 22:13 + 回复 +
                                                                                                                                                                                                                                                  +
                                                                                                                                                                                                                                                  +
                                                                                                                                                                                                                                                • +
                                                                                                                                                                                                                                                • + + + + +
                                                                                                                                                                                                                                                  + 静看蚂蚁上树 + : + + 回复 国际体坛巨星青椒肉丝 + :吃酸黄瓜吃多了 + + + +
                                                                                                                                                                                                                                                  + + + 2024-8-6 22:14 + 回复 +
                                                                                                                                                                                                                                                  +
                                                                                                                                                                                                                                                  +
                                                                                                                                                                                                                                                • +
                                                                                                                                                                                                                                                • + + + + +
                                                                                                                                                                                                                                                  + 不懂取啥名字😜 + : + + 请你去跟国际泳联投诉 + +
                                                                                                                                                                                                                                                  + + + 2024-8-6 22:15 + 回复 +
                                                                                                                                                                                                                                                  +
                                                                                                                                                                                                                                                  +
                                                                                                                                                                                                                                                • +
                                                                                                                                                                                                                                                • + + + + +
                                                                                                                                                                                                                                                  + 💫泽赫拉💯 + :第五跳陈空中分腿了,空中姿态明显全红婵更好 +
                                                                                                                                                                                                                                                  + + + 2024-8-6 22:17 + 回复 +
                                                                                                                                                                                                                                                  +
                                                                                                                                                                                                                                                  +
                                                                                                                                                                                                                                                • +
                                                                                                                                                                                                                                                • + + + + +
                                                                                                                                                                                                                                                  + 嗯嗯哦哦啊啊🐶 + : + + 回复 美味蟹黄堡💞 + :你不会看起跳高度和空中姿态? + +
                                                                                                                                                                                                                                                  + + + 2024-8-6 22:17 + 回复 +
                                                                                                                                                                                                                                                  +
                                                                                                                                                                                                                                                  +
                                                                                                                                                                                                                                                • +
                                                                                                                                                                                                                                                • + + + 我也说一句 + +

                                                                                                                                                                                                                                                  + 1 + 2 + 下一页 + 尾页 +

                                                                                                                                                                                                                                                  +
                                                                                                                                                                                                                                                • diff --git a/model/m_baidu_tieba.py b/model/m_baidu_tieba.py index 95b0175..8153000 100644 --- a/model/m_baidu_tieba.py +++ b/model/m_baidu_tieba.py @@ -28,7 +28,7 @@ class TiebaComment(BaseModel): """ comment_id: str = Field(..., description="评论ID") - parment_comment_id: str = Field(default="", description="父评论ID") + parent_comment_id: str = Field(default="", description="父评论ID") content: str = Field(..., description="评论内容") user_link: str = Field(default="", description="用户主页链接") user_nickname: str = Field(default="", description="用户昵称") @@ -38,5 +38,7 @@ class TiebaComment(BaseModel): sub_comment_count: int = Field(default=0, description="子评论数") note_id: str = Field(..., description="帖子ID") note_url: str = Field(..., description="帖子链接") + tieba_id: str = Field(..., description="所属的贴吧ID") tieba_name: str = Field(..., description="所属的贴吧名称") + tieba_link: str = Field(..., description="贴吧链接") diff --git a/schema/tables.sql b/schema/tables.sql index c5737f9..3fc72da 100644 --- a/schema/tables.sql +++ b/schema/tables.sql @@ -359,9 +359,10 @@ CREATE TABLE tieba_note `desc` TEXT COMMENT '帖子描述', note_url VARCHAR(255) NOT NULL COMMENT '帖子链接', publish_time VARCHAR(255) NOT NULL COMMENT '发布时间', - user_link VARCHAR(255) NOT NULL COMMENT '用户主页链接', - user_nickname VARCHAR(255) NOT NULL COMMENT '用户昵称', - user_avatar VARCHAR(255) NOT NULL COMMENT '用户头像地址', + user_link VARCHAR(255) DEFAULT '' COMMENT '用户主页链接', + user_nickname VARCHAR(255) DEFAULT '' COMMENT '用户昵称', + user_avatar VARCHAR(255) DEFAULT '' COMMENT '用户头像地址', + tieba_id VARCHAR(255) DEFAULT '' COMMENT '贴吧ID', tieba_name VARCHAR(255) NOT NULL COMMENT '贴吧名称', tieba_link VARCHAR(255) NOT NULL COMMENT '贴吧链接', total_replay_num INT DEFAULT 0 COMMENT '帖子回复总数', @@ -376,22 +377,24 @@ CREATE 
TABLE tieba_note DROP TABLE IF EXISTS `tieba_comment`; CREATE TABLE tieba_comment ( - id BIGINT AUTO_INCREMENT PRIMARY KEY, - comment_id VARCHAR(255) NOT NULL COMMENT '评论ID', - parment_comment_id VARCHAR(255) DEFAULT '' COMMENT '父评论ID', - content TEXT NOT NULL COMMENT '评论内容', - user_link VARCHAR(255) DEFAULT '' COMMENT '用户主页链接', - user_nickname VARCHAR(255) DEFAULT '' COMMENT '用户昵称', - user_avatar VARCHAR(255) DEFAULT '' COMMENT '用户头像地址', - publish_time VARCHAR(255) DEFAULT '' COMMENT '发布时间', - ip_location VARCHAR(255) DEFAULT '' COMMENT 'IP地理位置', - sub_comment_count INT DEFAULT 0 COMMENT '子评论数', - note_id VARCHAR(255) NOT NULL COMMENT '帖子ID', - note_url VARCHAR(255) NOT NULL COMMENT '帖子链接', - tieba_name VARCHAR(255) NOT NULL COMMENT '所属的贴吧名称', - add_ts BIGINT NOT NULL COMMENT '添加时间戳', - last_modify_ts BIGINT NOT NULL COMMENT '最后修改时间戳', - KEY `idx_tieba_comment_comment_id` (`note_id`), - KEY `idx_tieba_comment_note_id` (`note_id`), - KEY `idx_tieba_comment_publish_time` (`publish_time`) + id BIGINT AUTO_INCREMENT PRIMARY KEY, + comment_id VARCHAR(255) NOT NULL COMMENT '评论ID', + parent_comment_id VARCHAR(255) DEFAULT '' COMMENT '父评论ID', + content TEXT NOT NULL COMMENT '评论内容', + user_link VARCHAR(255) DEFAULT '' COMMENT '用户主页链接', + user_nickname VARCHAR(255) DEFAULT '' COMMENT '用户昵称', + user_avatar VARCHAR(255) DEFAULT '' COMMENT '用户头像地址', + tieba_id VARCHAR(255) DEFAULT '' COMMENT '贴吧ID', + tieba_name VARCHAR(255) NOT NULL COMMENT '贴吧名称', + tieba_link VARCHAR(255) NOT NULL COMMENT '贴吧链接', + publish_time VARCHAR(255) DEFAULT '' COMMENT '发布时间', + ip_location VARCHAR(255) DEFAULT '' COMMENT 'IP地理位置', + sub_comment_count INT DEFAULT 0 COMMENT '子评论数', + note_id VARCHAR(255) NOT NULL COMMENT '帖子ID', + note_url VARCHAR(255) NOT NULL COMMENT '帖子链接', + add_ts BIGINT NOT NULL COMMENT '添加时间戳', + last_modify_ts BIGINT NOT NULL COMMENT '最后修改时间戳', + KEY `idx_tieba_comment_comment_id` (`note_id`), + KEY `idx_tieba_comment_note_id` (`note_id`), + KEY 
`idx_tieba_comment_publish_time` (`publish_time`) ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='贴吧评论表'; \ No newline at end of file From 3f42368c021f6e4b7fdeee23c86a608b88618377 Mon Sep 17 00:00:00 2001 From: Relakkes Date: Thu, 8 Aug 2024 14:19:32 +0800 Subject: [PATCH 8/8] =?UTF-8?q?feat:=20=E7=99=BE=E5=BA=A6=E8=B4=B4?= =?UTF-8?q?=E5=90=A7done?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 44 +- config/base_config.py | 25 +- media_platform/tieba/client.py | 40 +- media_platform/tieba/core.py | 31 +- media_platform/tieba/help.py | 65 +- .../tieba/test_data/tieba_note_list.html | 3627 +++++++++++++++++ model/m_baidu_tieba.py | 1 + store/tieba/__init__.py | 3 +- tools/crawler_util.py | 1 + tools/utils.py | 2 +- 10 files changed, 3800 insertions(+), 39 deletions(-) create mode 100644 media_platform/tieba/test_data/tieba_note_list.html diff --git a/README.md b/README.md index 9950b4e..e71f46f 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ > 点击查看更为详细的免责声明。[点击跳转](#disclaimer) # 仓库描述 -**小红书爬虫**,**抖音爬虫**, **快手爬虫**, **B站爬虫**, **微博爬虫**...。 +**小红书爬虫**,**抖音爬虫**, **快手爬虫**, **B站爬虫**, **微博爬虫**,**百度贴吧**...。 目前能抓取小红书、抖音、快手、B站、微博的视频、图片、评论、点赞、转发等信息。 原理:利用[playwright](https://playwright.dev/)搭桥,保留登录成功后的上下文浏览器环境,通过执行JS表达式获取一些加密参数 @@ -22,6 +22,7 @@ | 快手 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | B 站 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | 微博 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | +| 贴吧 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ## 使用方法 @@ -99,14 +100,51 @@ - [ MediaCrawler-基于抽象类设计重构项目缓存](https://articles.zsxq.com/id_4ju73oxewt9j.html) - [ 手把手带你撸一个自己的IP代理池](https://articles.zsxq.com/id_38fza371ladm.html) - - ## 感谢下列Sponsors对本仓库赞助 - 通过注册安装这个款免费的Sider ChatGPT插件帮我获得一定奖励💰,这个插件我用了大半年,作为谷歌上最火的一款插件,体验非常不错。 > 安装并注册该浏览器插件之后保留一天即可,我就可以获得3元的推广奖励,谢谢大家,支持我继续开源项目。 成为赞助者,展示你的产品在这里,联系作者wx:yzglan +## 打赏 + +如果觉得项目不错的话可以打赏哦。您的支持就是我最大的动力! + +打赏时您可以备注名称,我会将您添加至打赏列表中。 +

                                                                                                                                                                                                                                                  + 打赏-微信 + 打赏-支付宝 +

                                                                                                                                                                                                                                                  + +## 捐赠信息 + +PS:如果打赏时请备注捐赠者,如有遗漏请联系我添加(有时候消息多可能会漏掉,十分抱歉) + +| 捐赠者 | 捐赠金额 | 捐赠日期 | +|-------------|-------|------------| +| *皓 | 50 元 | 2024-03-18 | +| *刚 | 50 元 | 2024-03-18 | +| *乐 | 20 元 | 2024-03-17 | +| *木 | 20 元 | 2024-03-17 | +| *诚 | 20 元 | 2024-03-17 | +| Strem Gamer | 20 元 | 2024-03-16 | +| *鑫 | 20 元 | 2024-03-14 | +| Yuzu | 20 元 | 2024-03-07 | +| **宁 | 100 元 | 2024-03-03 | +| **媛 | 20 元 | 2024-03-03 | +| Scarlett | 20 元 | 2024-02-16 | +| Asun | 20 元 | 2024-01-30 | +| 何* | 100 元 | 2024-01-21 | +| allen | 20 元 | 2024-01-10 | +| llllll | 20 元 | 2024-01-07 | +| 邝*元 | 20 元 | 2023-12-29 | +| 50chen | 50 元 | 2023-12-22 | +| xiongot | 20 元 | 2023-12-17 | +| atom.hu | 20 元 | 2023-12-16 | +| 一呆 | 20 元 | 2023-12-01 | +| 坠落 | 50 元 | 2023-11-08 | + + ## MediaCrawler爬虫项目交流群: > 扫描下方我的个人微信,备注:github,拉你进MediaCrawler项目交流群(请一定备注:github,会有wx小助手自动拉群) diff --git a/config/base_config.py b/config/base_config.py index 53dc8bf..cefc711 100644 --- a/config/base_config.py +++ b/config/base_config.py @@ -28,7 +28,7 @@ HEADLESS = False SAVE_LOGIN_STATE = True # 数据保存类型选项配置,支持三种类型:csv、db、json -SAVE_DATA_OPTION = "db" # csv or db or json +SAVE_DATA_OPTION = "csv" # csv or db or json # 用户浏览器缓存的浏览器文件配置 USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name @@ -46,18 +46,18 @@ MAX_CONCURRENCY_NUM = 1 ENABLE_GET_IMAGES = False # 是否开启爬评论模式, 默认不开启爬评论 -ENABLE_GET_COMMENTS = True +ENABLE_GET_COMMENTS = False # 是否开启爬二级评论模式, 默认不开启爬二级评论 # 老版本项目使用了 db, 则需参考 schema/tables.sql line 287 增加表字段 -ENABLE_GET_SUB_COMMENTS = True +ENABLE_GET_SUB_COMMENTS = False # 指定小红书需要爬虫的笔记ID列表 XHS_SPECIFIED_ID_LIST = [ "6422c2750000000027000d88", "64ca1b73000000000b028dd2", "630d5b85000000001203ab41", - "668fe13000000000030241fa", # 图文混合 + 
"668fe13000000000030241fa", # 图文混合 # ........................ ] @@ -93,6 +93,10 @@ TIEBA_SPECIFIED_ID_LIST = [ ] +# 指定贴吧名称列表,爬取该贴吧下的帖子 +TIEBA_NAME_LIST = [ + # "盗墓笔记" +] # 指定小红书创作者ID列表 XHS_CREATOR_ID_LIST = [ @@ -118,19 +122,18 @@ KS_CREATOR_ID_LIST = [ # ........................ ] - -#词云相关 -#是否开启生成评论词云图 +# 词云相关 +# 是否开启生成评论词云图 ENABLE_GET_WORDCLOUD = False # 自定义词语及其分组 -#添加规则:xx:yy 其中xx为自定义添加的词组,yy为将xx该词组分到的组名。 +# 添加规则:xx:yy 其中xx为自定义添加的词组,yy为将xx该词组分到的组名。 CUSTOM_WORDS = { '零几': '年份', # 将“零几”识别为一个整体 '高频词': '专业术语' # 示例自定义词 } -#停用(禁用)词文件路径 +# 停用(禁用)词文件路径 STOP_WORDS_FILE = "./docs/hit_stopwords.txt" -#中文字体文件路径 -FONT_PATH= "./docs/STZHONGS.TTF" +# 中文字体文件路径 +FONT_PATH = "./docs/STZHONGS.TTF" diff --git a/media_platform/tieba/client.py b/media_platform/tieba/client.py index 2ae4304..daa1c4c 100644 --- a/media_platform/tieba/client.py +++ b/media_platform/tieba/client.py @@ -1,17 +1,15 @@ import asyncio import json -import random from typing import Any, Callable, Dict, List, Optional, Union from urllib.parse import urlencode import httpx from playwright.async_api import BrowserContext -from tenacity import (RetryError, retry, stop_after_attempt, - wait_fixed) +from tenacity import RetryError, retry, stop_after_attempt, wait_fixed import config from base.base_crawler import AbstractApiClient -from model.m_baidu_tieba import TiebaNote, TiebaComment +from model.m_baidu_tieba import TiebaComment, TiebaNote from proxy.proxy_ip_pool import ProxyIpPool from tools import utils @@ -103,7 +101,7 @@ class BaiduTieBaClient(AbstractApiClient): return res utils.logger.error(f"[BaiduTieBaClient.get] 达到了最大重试次数,IP已经被Block,请尝试更换新的IP代理: {e}") - raise e + raise Exception(f"[BaiduTieBaClient.get] 达到了最大重试次数,IP已经被Block,请尝试更换新的IP代理: {e}") async def post(self, uri: str, data: dict, **kwargs) -> Dict: """ @@ -248,28 +246,44 @@ class BaiduTieBaClient(AbstractApiClient): # raise Exception(f"[BaiduTieBaClient.pong] Cookies is empty, please login first...") all_sub_comments: List[TiebaComment] = [] - for 
comment in comments: - if comment.sub_comment_count == 0: + for parment_comment in comments: + if parment_comment.sub_comment_count == 0: continue current_page = 1 - max_sub_page_num = comment.sub_comment_count // 10 + 1 + max_sub_page_num = parment_comment.sub_comment_count // 10 + 1 while max_sub_page_num >= current_page: params = { - "tid": comment.note_id, # 帖子ID - "pid": comment.comment_id, # 父级评论ID - "fid": comment.tieba_id, # 贴吧ID + "tid": parment_comment.note_id, # 帖子ID + "pid": parment_comment.comment_id, # 父级评论ID + "fid": parment_comment.tieba_id, # 贴吧ID "pn": current_page # 页码 } page_content = await self.get(uri, params=params, return_ori_content=True) sub_comments = self._page_extractor.extract_tieba_note_sub_comments(page_content, - parent_comment=comment) + parent_comment=parment_comment) if not sub_comments: break if callback: - await callback(comment.note_id, sub_comments) + await callback(parment_comment.note_id, sub_comments) all_sub_comments.extend(sub_comments) await asyncio.sleep(crawl_interval) current_page += 1 return all_sub_comments + + + + async def get_notes_by_tieba_name(self, tieba_name: str, page_num: int) -> List[TiebaNote]: + """ + 根据贴吧名称获取帖子列表 + Args: + tieba_name: 贴吧名称 + page_num: 分页数量 + + Returns: + + """ + uri = f"/f?kw={tieba_name}&pn={page_num}" + page_content = await self.get(uri, return_ori_content=True) + return self._page_extractor.extract_tieba_note_list(page_content) diff --git a/media_platform/tieba/core.py b/media_platform/tieba/core.py index 2d10a0a..c8b8764 100644 --- a/media_platform/tieba/core.py +++ b/media_platform/tieba/core.py @@ -53,6 +53,7 @@ class TieBaCrawler(AbstractCrawler): if config.CRAWLER_TYPE == "search": # Search for notes and retrieve their comment information. 
await self.search() + await self.get_specified_tieba_notes() elif config.CRAWLER_TYPE == "detail": # Get the information and comments of the specified post await self.get_specified_notes() @@ -92,7 +93,7 @@ class TieBaCrawler(AbstractCrawler): if not notes_list: utils.logger.info(f"[BaiduTieBaCrawler.search] Search note list is empty") break - utils.logger.info(f"[BaiduTieBaCrawler.search] Note List: {notes_list}") + utils.logger.info(f"[BaiduTieBaCrawler.search] Note list len: {len(notes_list)}") await self.get_specified_notes(note_id_list=[note_detail.note_id for note_detail in notes_list]) page += 1 except Exception as ex: @@ -100,6 +101,34 @@ class TieBaCrawler(AbstractCrawler): f"[BaiduTieBaCrawler.search] Search keywords error, current page: {page}, current keyword: {keyword}, err: {ex}") break + async def get_specified_tieba_notes(self): + """ + Get the information and comments of the specified post by tieba name + Returns: + + """ + tieba_limit_count = 50 + if config.CRAWLER_MAX_NOTES_COUNT < tieba_limit_count: + config.CRAWLER_MAX_NOTES_COUNT = tieba_limit_count + for tieba_name in config.TIEBA_NAME_LIST: + utils.logger.info( + f"[BaiduTieBaCrawler.get_specified_tieba_notes] Begin get tieba name: {tieba_name}") + page_number = 0 + while page_number <= config.CRAWLER_MAX_NOTES_COUNT: + note_list: List[TiebaNote] = await self.tieba_client.get_notes_by_tieba_name( + tieba_name=tieba_name, + page_num=page_number + ) + if not note_list: + utils.logger.info( + f"[BaiduTieBaCrawler.get_specified_tieba_notes] Get note list is empty") + break + + utils.logger.info( + f"[BaiduTieBaCrawler.get_specified_tieba_notes] tieba name: {tieba_name} note list len: {len(note_list)}") + await self.get_specified_notes([note.note_id for note in note_list]) + page_number += tieba_limit_count + async def get_specified_notes(self, note_id_list: List[str] = config.TIEBA_SPECIFIED_ID_LIST): """ Get the information and comments of the specified post diff --git 
a/media_platform/tieba/help.py b/media_platform/tieba/help.py index b46081d..4f3fe15 100644 --- a/media_platform/tieba/help.py +++ b/media_platform/tieba/help.py @@ -1,13 +1,13 @@ # -*- coding: utf-8 -*- -import re -import json import html -from typing import List, Dict, Tuple +import json +import re +from typing import Dict, List, Tuple from parsel import Selector -from model.m_baidu_tieba import TiebaNote, TiebaComment from constant import baidu_tieba as const +from model.m_baidu_tieba import TiebaComment, TiebaNote from tools import utils @@ -43,6 +43,42 @@ class TieBaExtractor: result.append(tieba_note) return result + def extract_tieba_note_list(self, page_content: str) -> List[TiebaNote]: + """ + 提取贴吧帖子列表 + Args: + page_content: + + Returns: + + """ + page_content = page_content.replace(' + + + + + + + + 盗墓笔记吧-百度贴吧--喜爱盗墓笔记的有爱稻米聚集地--盗墓笔记吧致力于为广大喜爱《盗墓笔记》的吧友服务,传递官方最新资讯,小说相关同人作品,鼓励吧友原创精品,解密分析、图片、文章等。 + + + + + + + + + + + + + + +
                                                                                                                                                                                                                                                  + + + + + + +
                                                                                                                                                                                                                                                  +
                                                                                                                                                                                                                                                  + +
                                                                                                                                                                                                                                                  +
                                                                                                                                                                                                                                                  + + + + + +
                                                                                                                                                                                                                                                  +
                                                                                                                                                                                                                                                  +
                                                                                                                                                                                                                                                  +
                                                                                                                                                                                                                                                  +
                                                                                                                                                                                                                                                  +
                                                                                                                                                                                                                                                  + + +
                                                                                                                                                                                                                                                  + + + + + +
                                                                                                                                                                                                                                                  +
                                                                                                                                                                                                                                                  + +
                                                                                                                                                                                                                                                  +
                                                                                                                                                                                                                                                  +
                                                                                                                                                                                                                                                  + +
                                                                                                                                                                                                                                                  +
                                                                                                                                                                                                                                                  +
                                                                                                                                                                                                                                                  + + + +
                                                                                                                                                                                                                                                  +
                                                                                                                                                                                                                                                  + + + +
                                                                                                                                                                                                                                                  + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/model/m_baidu_tieba.py b/model/m_baidu_tieba.py index 8153000..a32250f 100644 --- a/model/m_baidu_tieba.py +++ b/model/m_baidu_tieba.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- from typing import Optional + from pydantic import BaseModel, Field diff --git a/store/tieba/__init__.py b/store/tieba/__init__.py index e358bb6..788a93d 100644 --- a/store/tieba/__init__.py +++ b/store/tieba/__init__.py @@ -1,7 +1,8 @@ # -*- coding: utf-8 -*- from typing import List -from model.m_baidu_tieba import TiebaNote, TiebaComment +from model.m_baidu_tieba import TiebaComment, TiebaNote + from . import tieba_store_impl from .tieba_store_impl import * diff --git a/tools/crawler_util.py b/tools/crawler_util.py index 9c54f2a..c17fc45 100644 --- a/tools/crawler_util.py +++ b/tools/crawler_util.py @@ -14,6 +14,7 @@ from PIL import Image, ImageDraw from playwright.async_api import Cookie, Page from proxy import IpInfoModel + from . import utils diff --git a/tools/utils.py b/tools/utils.py index 572764c..455302b 100644 --- a/tools/utils.py +++ b/tools/utils.py @@ -10,7 +10,7 @@ def init_loging_config(): level = logging.INFO logging.basicConfig( level=level, - format="%(asctime)s [%(threadName)s] %(name)s %(levelname)s (%(filename)s:%(lineno)d) - %(message)s", + format="%(asctime)s %(name)s %(levelname)s (%(filename)s:%(lineno)d) - %(message)s", datefmt='%Y-%m-%d %H:%M:%S' ) _logger = logging.getLogger("MediaCrawler")