diff --git a/README.md b/README.md index 9950b4e..e71f46f 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ > 点击查看更为详细的免责声明。[点击跳转](#disclaimer) # 仓库描述 -**小红书爬虫**,**抖音爬虫**, **快手爬虫**, **B站爬虫**, **微博爬虫**...。 +**小红书爬虫**,**抖音爬虫**, **快手爬虫**, **B站爬虫**, **微博爬虫**,**百度贴吧**...。 目前能抓取小红书、抖音、快手、B站、微博的视频、图片、评论、点赞、转发等信息。 原理:利用[playwright](https://playwright.dev/)搭桥,保留登录成功后的上下文浏览器环境,通过执行JS表达式获取一些加密参数 @@ -22,6 +22,7 @@ | 快手 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | B 站 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | 微博 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | +| 贴吧 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ## 使用方法 @@ -99,14 +100,51 @@ - [ MediaCrawler-基于抽象类设计重构项目缓存](https://articles.zsxq.com/id_4ju73oxewt9j.html) - [ 手把手带你撸一个自己的IP代理池](https://articles.zsxq.com/id_38fza371ladm.html) - - ## 感谢下列Sponsors对本仓库赞助 - 通过注册安装这个款免费的Sider ChatGPT插件帮我获得一定奖励💰,这个插件我用了大半年,作为谷歌上最火的一款插件,体验非常不错。 > 安装并注册该浏览器插件之后保留一天即可,我就可以获得3元的推广奖励,谢谢大家,支持我继续开源项目。 成为赞助者,展示你的产品在这里,联系作者wx:yzglan +## 打赏 + +如果觉得项目不错的话可以打赏哦。您的支持就是我最大的动力! + +打赏时您可以备注名称,我会将您添加至打赏列表中。 +
+ +## 捐赠信息 + +PS:如果打赏时请备注捐赠者,如有遗漏请联系我添加(有时候消息多可能会漏掉,十分抱歉) + +| 捐赠者 | 捐赠金额 | 捐赠日期 | +|-------------|-------|------------| +| *皓 | 50 元 | 2024-03-18 | +| *刚 | 50 元 | 2024-03-18 | +| *乐 | 20 元 | 2024-03-17 | +| *木 | 20 元 | 2024-03-17 | +| *诚 | 20 元 | 2024-03-17 | +| Strem Gamer | 20 元 | 2024-03-16 | +| *鑫 | 20 元 | 2024-03-14 | +| Yuzu | 20 元 | 2024-03-07 | +| **宁 | 100 元 | 2024-03-03 | +| **媛 | 20 元 | 2024-03-03 | +| Scarlett | 20 元 | 2024-02-16 | +| Asun | 20 元 | 2024-01-30 | +| 何* | 100 元 | 2024-01-21 | +| allen | 20 元 | 2024-01-10 | +| llllll | 20 元 | 2024-01-07 | +| 邝*元 | 20 元 | 2023-12-29 | +| 50chen | 50 元 | 2023-12-22 | +| xiongot | 20 元 | 2023-12-17 | +| atom.hu | 20 元 | 2023-12-16 | +| 一呆 | 20 元 | 2023-12-01 | +| 坠落 | 50 元 | 2023-11-08 | + + ## MediaCrawler爬虫项目交流群: > 扫描下方我的个人微信,备注:github,拉你进MediaCrawler项目交流群(请一定备注:github,会有wx小助手自动拉群) diff --git a/cmd_arg/arg.py b/cmd_arg/arg.py index 27854f7..65819a1 100644 --- a/cmd_arg/arg.py +++ b/cmd_arg/arg.py @@ -7,8 +7,8 @@ from tools.utils import str2bool async def parse_cmd(): # 读取command arg parser = argparse.ArgumentParser(description='Media crawler program.') - parser.add_argument('--platform', type=str, help='Media platform select (xhs | dy | ks | bili | wb)', - choices=["xhs", "dy", "ks", "bili", "wb"], default=config.PLATFORM) + parser.add_argument('--platform', type=str, help='Media platform select (xhs | dy | ks | bili | wb | tieba)', + choices=["xhs", "dy", "ks", "bili", "wb", "tieba"], default=config.PLATFORM) parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)', choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE) parser.add_argument('--type', type=str, help='crawler type (search | detail | creator)', diff --git a/config/base_config.py b/config/base_config.py index 076003a..cefc711 100644 --- a/config/base_config.py +++ b/config/base_config.py @@ -28,7 +28,7 @@ HEADLESS = False SAVE_LOGIN_STATE = True # 数据保存类型选项配置,支持三种类型:csv、db、json -SAVE_DATA_OPTION = "json" # csv or db or json +SAVE_DATA_OPTION = "csv" # csv or db or json # 用户浏览器缓存的浏览器文件配置 USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name @@ -37,7 +37,7 @@ USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name START_PAGE = 1 # 爬取视频/帖子的数量控制 -CRAWLER_MAX_NOTES_COUNT = 20 +CRAWLER_MAX_NOTES_COUNT = 100 # 并发爬虫数量控制 MAX_CONCURRENCY_NUM = 1 @@ -57,7 +57,7 @@ XHS_SPECIFIED_ID_LIST = [ "6422c2750000000027000d88", "64ca1b73000000000b028dd2", "630d5b85000000001203ab41", - "668fe13000000000030241fa", # 图文混合 + "668fe13000000000030241fa", # 图文混合 # ........................ ] @@ -88,6 +88,16 @@ WEIBO_SPECIFIED_ID_LIST = [ # ........................ ] +# 指定贴吧需要爬取的帖子列表 +TIEBA_SPECIFIED_ID_LIST = [ + +] + +# 指定贴吧名称列表,爬取该贴吧下的帖子 +TIEBA_NAME_LIST = [ + # "盗墓笔记" +] + # 指定小红书创作者ID列表 XHS_CREATOR_ID_LIST = [ "63e36c9a000000002703502b", @@ -112,19 +122,18 @@ KS_CREATOR_ID_LIST = [ # ........................ 
] - -#词云相关 -#是否开启生成评论词云图 +# 词云相关 +# 是否开启生成评论词云图 ENABLE_GET_WORDCLOUD = False # 自定义词语及其分组 -#添加规则:xx:yy 其中xx为自定义添加的词组,yy为将xx该词组分到的组名。 +# 添加规则:xx:yy 其中xx为自定义添加的词组,yy为将xx该词组分到的组名。 CUSTOM_WORDS = { '零几': '年份', # 将“零几”识别为一个整体 '高频词': '专业术语' # 示例自定义词 } -#停用(禁用)词文件路径 +# 停用(禁用)词文件路径 STOP_WORDS_FILE = "./docs/hit_stopwords.txt" -#中文字体文件路径 -FONT_PATH= "./docs/STZHONGS.TTF" +# 中文字体文件路径 +FONT_PATH = "./docs/STZHONGS.TTF" diff --git a/constant/__init__.py b/constant/__init__.py new file mode 100644 index 0000000..40a96af --- /dev/null +++ b/constant/__init__.py @@ -0,0 +1 @@ +# -*- coding: utf-8 -*- diff --git a/constant/baidu_tieba.py b/constant/baidu_tieba.py new file mode 100644 index 0000000..cfd15e1 --- /dev/null +++ b/constant/baidu_tieba.py @@ -0,0 +1,3 @@ +# -*- coding: utf-8 -*- + +TIEBA_URL = 'https://tieba.baidu.com' \ No newline at end of file diff --git a/main.py b/main.py index 27d84ad..e051b5e 100644 --- a/main.py +++ b/main.py @@ -8,6 +8,7 @@ from base.base_crawler import AbstractCrawler from media_platform.bilibili import BilibiliCrawler from media_platform.douyin import DouYinCrawler from media_platform.kuaishou import KuaishouCrawler +from media_platform.tieba import TieBaCrawler from media_platform.weibo import WeiboCrawler from media_platform.xhs import XiaoHongShuCrawler @@ -18,7 +19,8 @@ class CrawlerFactory: "dy": DouYinCrawler, "ks": KuaishouCrawler, "bili": BilibiliCrawler, - "wb": WeiboCrawler + "wb": WeiboCrawler, + "tieba": TieBaCrawler } @staticmethod @@ -28,6 +30,7 @@ class CrawlerFactory: raise ValueError("Invalid Media Platform Currently only supported xhs or dy or ks or bili ...") return crawler_class() + async def main(): # parse cmd await cmd_arg.parse_cmd() @@ -38,7 +41,7 @@ async def main(): crawler = CrawlerFactory.create_crawler(platform=config.PLATFORM) await crawler.start() - + if config.SAVE_DATA_OPTION == "db": await db.close() diff --git a/media_platform/tieba/__init__.py b/media_platform/tieba/__init__.py new file mode 100644 index 0000000..e7e2a44 --- /dev/null +++ b/media_platform/tieba/__init__.py @@ -0,0 +1,2 @@ +# -*- coding: utf-8 -*- +from .core import TieBaCrawler \ No newline at end of file diff --git a/media_platform/tieba/client.py b/media_platform/tieba/client.py new file mode 100644 index 0000000..daa1c4c --- /dev/null +++ b/media_platform/tieba/client.py @@ -0,0 +1,289 @@ +import asyncio +import json +from typing import Any, Callable, Dict, List, Optional, Union +from urllib.parse import urlencode + +import httpx +from playwright.async_api import BrowserContext +from tenacity import RetryError, retry, stop_after_attempt, wait_fixed + +import config +from base.base_crawler import AbstractApiClient +from model.m_baidu_tieba import TiebaComment, TiebaNote +from proxy.proxy_ip_pool import ProxyIpPool +from tools import utils + +from .field import SearchNoteType, SearchSortType +from .help import TieBaExtractor + + +class BaiduTieBaClient(AbstractApiClient): + def __init__( + self, + timeout=10, + ip_pool=None, + default_ip_proxy=None, + ): + self.ip_pool: Optional[ProxyIpPool] = ip_pool + self.timeout = timeout + self.headers = { + "User-Agent": utils.get_user_agent(), + "Cookies": "", + } + self._host = "https://tieba.baidu.com" + self._page_extractor = TieBaExtractor() + self.default_ip_proxy = default_ip_proxy + + @retry(stop=stop_after_attempt(3), wait=wait_fixed(1)) + async def request(self, method, url, return_ori_content=False, proxies=None, **kwargs) -> Union[str, Any]: + """ + 封装httpx的公共请求方法,对请求响应做一些处理 + Args: + method: 请求方法 + url: 请求的URL + 
return_ori_content: 是否返回原始内容 + proxies: 代理IP + **kwargs: 其他请求参数,例如请求头、请求体等 + + Returns: + + """ + actual_proxies = proxies if proxies else self.default_ip_proxy + async with httpx.AsyncClient(proxies=actual_proxies) as client: + response = await client.request( + method, url, timeout=self.timeout, + headers=self.headers, **kwargs + ) + + if response.status_code != 200: + utils.logger.error(f"Request failed, method: {method}, url: {url}, status code: {response.status_code}") + utils.logger.error(f"Request failed, response: {response.text}") + raise Exception(f"Request failed, method: {method}, url: {url}, status code: {response.status_code}") + + if response.text == "" or response.text == "blocked": + utils.logger.error(f"request params incrr, response.text: {response.text}") + raise Exception("account blocked") + + if return_ori_content: + return response.text + + return response.json() + + async def get(self, uri: str, params=None, return_ori_content=False, **kwargs) -> Any: + """ + GET请求,对请求头签名 + Args: + uri: 请求路由 + params: 请求参数 + return_ori_content: 是否返回原始内容 + + Returns: + + """ + final_uri = uri + if isinstance(params, dict): + final_uri = (f"{uri}?" + f"{urlencode(params)}") + try: + res = await self.request(method="GET", url=f"{self._host}{final_uri}", + return_ori_content=return_ori_content, + **kwargs) + return res + except RetryError as e: + if self.ip_pool: + proxie_model = await self.ip_pool.get_proxy() + _, proxies = utils.format_proxy_info(proxie_model) + res = await self.request(method="GET", url=f"{self._host}{final_uri}", + return_ori_content=return_ori_content, + proxies=proxies, + **kwargs) + self.default_ip_proxy = proxies + return res + + utils.logger.error(f"[BaiduTieBaClient.get] 达到了最大重试次数,IP已经被Block,请尝试更换新的IP代理: {e}") + raise Exception(f"[BaiduTieBaClient.get] 达到了最大重试次数,IP已经被Block,请尝试更换新的IP代理: {e}") + + async def post(self, uri: str, data: dict, **kwargs) -> Dict: + """ + POST请求,对请求头签名 + Args: + uri: 请求路由 + data: 请求体参数 + + Returns: + + """ + json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False) + return await self.request(method="POST", url=f"{self._host}{uri}", + data=json_str, **kwargs) + + async def pong(self) -> bool: + """ + 用于检查登录态是否失效了 + Returns: + + """ + utils.logger.info("[BaiduTieBaClient.pong] Begin to pong tieba...") + try: + uri = "/mo/q/sync" + res: Dict = await self.get(uri) + utils.logger.info(f"[BaiduTieBaClient.pong] res: {res}") + if res and res.get("no") == 0: + ping_flag = True + else: + utils.logger.info(f"[BaiduTieBaClient.pong] user not login, will try to login again...") + ping_flag = False + except Exception as e: + utils.logger.error(f"[BaiduTieBaClient.pong] Ping tieba failed: {e}, and try to login again...") + ping_flag = False + return ping_flag + + async def update_cookies(self, browser_context: BrowserContext): + """ + API客户端提供的更新cookies方法,一般情况下登录成功后会调用此方法 + Args: + browser_context: 浏览器上下文对象 + + Returns: + + """ + pass + + async def get_notes_by_keyword( + self, keyword: str, + page: int = 1, + page_size: int = 10, + sort: SearchSortType = SearchSortType.TIME_DESC, + note_type: SearchNoteType = SearchNoteType.FIXED_THREAD, + ) -> List[TiebaNote]: + """ + 根据关键词搜索贴吧帖子 + Args: + keyword: 关键词 + page: 分页第几页 + page_size: 每页大小 + sort: 结果排序方式 + note_type: 帖子类型(主题贴|主题+回复混合模式) + Returns: + + """ + uri = "/f/search/res" + params = { + "isnew": 1, + "qw": keyword, + "rn": page_size, + "pn": page, + "sm": sort.value, + "only_thread": note_type.value + } + page_content = await self.get(uri, params=params, return_ori_content=True) + 
return self._page_extractor.extract_search_note_list(page_content) + + async def get_note_by_id(self, note_id: str) -> TiebaNote: + """ + 根据帖子ID获取帖子详情 + Args: + note_id: + + Returns: + + """ + uri = f"/p/{note_id}" + page_content = await self.get(uri, return_ori_content=True) + return self._page_extractor.extract_note_detail(page_content) + + async def get_note_all_comments(self, note_detail: TiebaNote, crawl_interval: float = 1.0, + callback: Optional[Callable] = None) -> List[TiebaComment]: + """ + 获取指定帖子下的所有一级评论,该方法会一直查找一个帖子下的所有评论信息 + Args: + note_detail: 帖子详情对象 + crawl_interval: 爬取一次笔记的延迟单位(秒) + callback: 一次笔记爬取结束后 + + Returns: + + """ + uri = f"/p/{note_detail.note_id}" + result: List[TiebaComment] = [] + current_page = 1 + while note_detail.total_replay_page >= current_page: + params = { + "pn": current_page + } + page_content = await self.get(uri, params=params, return_ori_content=True) + comments = self._page_extractor.extract_tieba_note_parment_comments(page_content, + note_id=note_detail.note_id) + if not comments: + break + if callback: + await callback(note_detail.note_id, comments) + result.extend(comments) + # 获取所有子评论 + await self.get_comments_all_sub_comments(comments, crawl_interval=crawl_interval, callback=callback) + await asyncio.sleep(crawl_interval) + current_page += 1 + return result + + async def get_comments_all_sub_comments(self, comments: List[TiebaComment], crawl_interval: float = 1.0, + callback: Optional[Callable] = None) -> List[TiebaComment]: + """ + 获取指定评论下的所有子评论 + Args: + comments: 评论列表 + crawl_interval: 爬取一次笔记的延迟单位(秒) + callback: 一次笔记爬取结束后 + + Returns: + + """ + uri = "/p/comment" + if not config.ENABLE_GET_SUB_COMMENTS: + return [] + + # # 贴吧获取所有子评论需要登录态 + # if self.headers.get("Cookies") == "" or not self.pong(): + # raise Exception(f"[BaiduTieBaClient.pong] Cookies is empty, please login first...") + + all_sub_comments: List[TiebaComment] = [] + for parment_comment in comments: + if parment_comment.sub_comment_count == 0: + continue + + current_page = 1 + max_sub_page_num = parment_comment.sub_comment_count // 10 + 1 + while max_sub_page_num >= current_page: + params = { + "tid": parment_comment.note_id, # 帖子ID + "pid": parment_comment.comment_id, # 父级评论ID + "fid": parment_comment.tieba_id, # 贴吧ID + "pn": current_page # 页码 + } + page_content = await self.get(uri, params=params, return_ori_content=True) + sub_comments = self._page_extractor.extract_tieba_note_sub_comments(page_content, + parent_comment=parment_comment) + + if not sub_comments: + break + if callback: + await callback(parment_comment.note_id, sub_comments) + all_sub_comments.extend(sub_comments) + await asyncio.sleep(crawl_interval) + current_page += 1 + return all_sub_comments + + + + async def get_notes_by_tieba_name(self, tieba_name: str, page_num: int) -> List[TiebaNote]: + """ + 根据贴吧名称获取帖子列表 + Args: + tieba_name: 贴吧名称 + page_num: 分页数量 + + Returns: + + """ + uri = f"/f?kw={tieba_name}&pn={page_num}" + page_content = await self.get(uri, return_ori_content=True) + return self._page_extractor.extract_tieba_note_list(page_content) diff --git a/media_platform/tieba/core.py b/media_platform/tieba/core.py new file mode 100644 index 0000000..c8b8764 --- /dev/null +++ b/media_platform/tieba/core.py @@ -0,0 +1,265 @@ +import asyncio +import os +import random +from asyncio import Task +from typing import Dict, List, Optional, Tuple + +from playwright.async_api import (BrowserContext, BrowserType, Page, + async_playwright) + +import config +from base.base_crawler import AbstractCrawler +from 
model.m_baidu_tieba import TiebaNote +from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool +from store import tieba as tieba_store +from tools import utils +from tools.crawler_util import format_proxy_info +from var import crawler_type_var + +from .client import BaiduTieBaClient +from .field import SearchNoteType, SearchSortType +from .login import BaiduTieBaLogin + + +class TieBaCrawler(AbstractCrawler): + context_page: Page + tieba_client: BaiduTieBaClient + browser_context: BrowserContext + + def __init__(self) -> None: + self.index_url = "https://tieba.baidu.com" + self.user_agent = utils.get_user_agent() + + async def start(self) -> None: + """ + Start the crawler + Returns: + + """ + ip_proxy_pool, httpx_proxy_format = None, None + if config.ENABLE_IP_PROXY: + utils.logger.info("[BaiduTieBaCrawler.start] Begin create ip proxy pool ...") + ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True) + ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy() + _, httpx_proxy_format = format_proxy_info(ip_proxy_info) + utils.logger.info(f"[BaiduTieBaCrawler.start] Init default ip proxy, value: {httpx_proxy_format}") + + # Create a client to interact with the baidutieba website. + self.tieba_client = BaiduTieBaClient( + ip_pool=ip_proxy_pool, + default_ip_proxy=httpx_proxy_format, + ) + crawler_type_var.set(config.CRAWLER_TYPE) + if config.CRAWLER_TYPE == "search": + # Search for notes and retrieve their comment information. + await self.search() + await self.get_specified_tieba_notes() + elif config.CRAWLER_TYPE == "detail": + # Get the information and comments of the specified post + await self.get_specified_notes() + else: + pass + + utils.logger.info("[BaiduTieBaCrawler.start] Tieba Crawler finished ...") + + async def search(self) -> None: + """ + Search for notes and retrieve their comment information. 
+ Returns: + + """ + utils.logger.info("[BaiduTieBaCrawler.search] Begin search baidu tieba keywords") + tieba_limit_count = 10 # tieba limit page fixed value + if config.CRAWLER_MAX_NOTES_COUNT < tieba_limit_count: + config.CRAWLER_MAX_NOTES_COUNT = tieba_limit_count + start_page = config.START_PAGE + for keyword in config.KEYWORDS.split(","): + utils.logger.info(f"[BaiduTieBaCrawler.search] Current search keyword: {keyword}") + page = 1 + while (page - start_page + 1) * tieba_limit_count <= config.CRAWLER_MAX_NOTES_COUNT: + if page < start_page: + utils.logger.info(f"[BaiduTieBaCrawler.search] Skip page {page}") + page += 1 + continue + try: + utils.logger.info(f"[BaiduTieBaCrawler.search] search tieba keyword: {keyword}, page: {page}") + notes_list: List[TiebaNote] = await self.tieba_client.get_notes_by_keyword( + keyword=keyword, + page=page, + page_size=tieba_limit_count, + sort=SearchSortType.TIME_DESC, + note_type=SearchNoteType.FIXED_THREAD + ) + if not notes_list: + utils.logger.info(f"[BaiduTieBaCrawler.search] Search note list is empty") + break + utils.logger.info(f"[BaiduTieBaCrawler.search] Note list len: {len(notes_list)}") + await self.get_specified_notes(note_id_list=[note_detail.note_id for note_detail in notes_list]) + page += 1 + except Exception as ex: + utils.logger.error( + f"[BaiduTieBaCrawler.search] Search keywords error, current page: {page}, current keyword: {keyword}, err: {ex}") + break + + async def get_specified_tieba_notes(self): + """ + Get the information and comments of the specified post by tieba name + Returns: + + """ + tieba_limit_count = 50 + if config.CRAWLER_MAX_NOTES_COUNT < tieba_limit_count: + config.CRAWLER_MAX_NOTES_COUNT = tieba_limit_count + for tieba_name in config.TIEBA_NAME_LIST: + utils.logger.info( + f"[BaiduTieBaCrawler.get_specified_tieba_notes] Begin get tieba name: {tieba_name}") + page_number = 0 + while page_number <= config.CRAWLER_MAX_NOTES_COUNT: + note_list: List[TiebaNote] = await self.tieba_client.get_notes_by_tieba_name( + tieba_name=tieba_name, + page_num=page_number + ) + if not note_list: + utils.logger.info( + f"[BaiduTieBaCrawler.get_specified_tieba_notes] Get note list is empty") + break + + utils.logger.info( + f"[BaiduTieBaCrawler.get_specified_tieba_notes] tieba name: {tieba_name} note list len: {len(note_list)}") + await self.get_specified_notes([note.note_id for note in note_list]) + page_number += tieba_limit_count + + async def get_specified_notes(self, note_id_list: List[str] = config.TIEBA_SPECIFIED_ID_LIST): + """ + Get the information and comments of the specified post + Args: + note_id_list: + + Returns: + + """ + semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) + task_list = [ + self.get_note_detail_async_task(note_id=note_id, semaphore=semaphore) for note_id in note_id_list + ] + note_details = await asyncio.gather(*task_list) + note_details_model: List[TiebaNote] = [] + for note_detail in note_details: + if note_detail is not None: + note_details_model.append(note_detail) + await tieba_store.update_tieba_note(note_detail) + await self.batch_get_note_comments(note_details_model) + + async def get_note_detail_async_task(self, note_id: str, semaphore: asyncio.Semaphore) -> Optional[TiebaNote]: + """ + Get note detail + Args: + note_id: baidu tieba note id + semaphore: asyncio semaphore + + Returns: + + """ + async with semaphore: + try: + utils.logger.info(f"[BaiduTieBaCrawler.get_note_detail] Begin get note detail, note_id: {note_id}") + note_detail: TiebaNote = await 
self.tieba_client.get_note_by_id(note_id) + if not note_detail: + utils.logger.error( + f"[BaiduTieBaCrawler.get_note_detail] Get note detail error, note_id: {note_id}") + return None + return note_detail + except Exception as ex: + utils.logger.error(f"[BaiduTieBaCrawler.get_note_detail] Get note detail error: {ex}") + return None + except KeyError as ex: + utils.logger.error( + f"[BaiduTieBaCrawler.get_note_detail] have not fund note detail note_id:{note_id}, err: {ex}") + return None + + async def batch_get_note_comments(self, note_detail_list: List[TiebaNote]): + """ + Batch get note comments + Args: + note_detail_list: + + Returns: + + """ + if not config.ENABLE_GET_COMMENTS: + return + + semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) + task_list: List[Task] = [] + for note_detail in note_detail_list: + task = asyncio.create_task(self.get_comments_async_task(note_detail, semaphore), name=note_detail.note_id) + task_list.append(task) + await asyncio.gather(*task_list) + + async def get_comments_async_task(self, note_detail: TiebaNote, semaphore: asyncio.Semaphore): + """ + Get comments async task + Args: + note_detail: + semaphore: + + Returns: + + """ + async with semaphore: + utils.logger.info(f"[BaiduTieBaCrawler.get_comments] Begin get note id comments {note_detail.note_id}") + await self.tieba_client.get_note_all_comments( + note_detail=note_detail, + crawl_interval=random.random(), + callback=tieba_store.batch_update_tieba_note_comments + ) + + async def launch_browser( + self, + chromium: BrowserType, + playwright_proxy: Optional[Dict], + user_agent: Optional[str], + headless: bool = True + ) -> BrowserContext: + """ + Launch browser and create browser + Args: + chromium: + playwright_proxy: + user_agent: + headless: + + Returns: + + """ + utils.logger.info("[BaiduTieBaCrawler.launch_browser] Begin create browser context ...") + if config.SAVE_LOGIN_STATE: + # feat issue #14 + # we will save login state to avoid login every time + user_data_dir = os.path.join(os.getcwd(), "browser_data", + config.USER_DATA_DIR % config.PLATFORM) # type: ignore + browser_context = await chromium.launch_persistent_context( + user_data_dir=user_data_dir, + accept_downloads=True, + headless=headless, + proxy=playwright_proxy, # type: ignore + viewport={"width": 1920, "height": 1080}, + user_agent=user_agent + ) + return browser_context + else: + browser = await chromium.launch(headless=headless, proxy=playwright_proxy) # type: ignore + browser_context = await browser.new_context( + viewport={"width": 1920, "height": 1080}, + user_agent=user_agent + ) + return browser_context + + async def close(self): + """ + Close browser context + Returns: + + """ + await self.browser_context.close() + utils.logger.info("[BaiduTieBaCrawler.close] Browser context closed ...") diff --git a/media_platform/tieba/field.py b/media_platform/tieba/field.py new file mode 100644 index 0000000..824fe88 --- /dev/null +++ b/media_platform/tieba/field.py @@ -0,0 +1,18 @@ +from enum import Enum + + +class SearchSortType(Enum): + """search sort type""" + # 按时间倒序 + TIME_DESC = "1" + # 按时间顺序 + TIME_ASC = "0" + # 按相关性顺序 + RELEVANCE_ORDER = "2" + + +class SearchNoteType(Enum): + # 只看主题贴 + MAIN_THREAD = "1" + # 混合模式(帖子+回复) + FIXED_THREAD = "0" diff --git a/media_platform/tieba/help.py b/media_platform/tieba/help.py new file mode 100644 index 0000000..4f3fe15 --- /dev/null +++ b/media_platform/tieba/help.py @@ -0,0 +1,301 @@ +# -*- coding: utf-8 -*- +import html +import json +import re +from typing import Dict, List, Tuple + 
+from parsel import Selector
+
+from constant import baidu_tieba as const
+from model.m_baidu_tieba import TiebaComment, TiebaNote
+from tools import utils
+
+
+class TieBaExtractor:
+    def __init__(self):
+        pass
+
+    @staticmethod
+    def extract_search_note_list(page_content: str) -> List[TiebaNote]:
+        """
+        提取贴吧帖子列表,这里提取的关键词搜索结果页的数据,还缺少帖子的回复数和回复页等数据
+        Args:
+            page_content: 页面内容的HTML字符串
+
+        Returns:
+            包含帖子信息的字典列表
+        """
+        xpath_selector = "//div[@class='s_post']"
+        post_list = Selector(text=page_content).xpath(xpath_selector)
+        result: List[TiebaNote] = []
+        for post in post_list:
+            tieba_note = TiebaNote(
+                note_id=post.xpath(".//span[@class='p_title']/a/@data-tid").get(default='').strip(),
+                title=post.xpath(".//span[@class='p_title']/a/text()").get(default='').strip(),
+                desc=post.xpath(".//div[@class='p_content']/text()").get(default='').strip(),
+                note_url=const.TIEBA_URL + post.xpath(".//span[@class='p_title']/a/@href").get(default=''),
+                user_nickname=post.xpath(".//a[starts-with(@href, '/home/main')]/font/text()").get(default='').strip(),
+                user_link=const.TIEBA_URL + post.xpath(".//a[starts-with(@href, '/home/main')]/@href").get(default=''),
+                tieba_name=post.xpath(".//a[@class='p_forum']/font/text()").get(default='').strip(),
+                tieba_link=const.TIEBA_URL + post.xpath(".//a[@class='p_forum']/@href").get(default=''),
+                publish_time=post.xpath(".//font[@class='p_green p_date']/text()").get(default='').strip(),
+            )
+            result.append(tieba_note)
+        return result
+
+    def extract_tieba_note_list(self, page_content: str) -> List[TiebaNote]:
+        """
+        提取贴吧帖子列表
+        Args:
+            page_content:
+
+        Returns:
+
+        """
+        page_content = page_content.replace('
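
The new `TieBaExtractor` works on raw HTML via parsel XPath selectors rather than a JSON API. Below is a minimal, self-contained sketch (not part of the diff above) that runs the same `//div[@class='s_post']` selectors used in `extract_search_note_list` against a made-up search-result fragment; the sample HTML, IDs, and values are invented purely for illustration and real tieba markup may differ.

```python
# Standalone sketch: exercise the XPath used by TieBaExtractor.extract_search_note_list
# against an invented search-result fragment (illustrative only, not real tieba HTML).
from parsel import Selector

SAMPLE_HTML = """
<div class="s_post">
  <span class="p_title"><a data-tid="9000000001" href="/p/9000000001">示例帖子标题</a></span>
  <div class="p_content">这是帖子摘要内容</div>
  <a href="/home/main?un=demo_user"><font>demo_user</font></a>
  <a class="p_forum" href="/f?kw=%E7%9B%97%E5%A2%93%E7%AC%94%E8%AE%B0"><font>盗墓笔记</font></a>
  <font class="p_green p_date">2024-07-20 12:00</font>
</div>
"""

# Same selector pattern as the extractor: first locate each s_post block,
# then pull individual fields out of it with relative XPath expressions.
post = Selector(text=SAMPLE_HTML).xpath("//div[@class='s_post']")[0]
print(post.xpath(".//span[@class='p_title']/a/@data-tid").get(default="").strip())  # 9000000001
print(post.xpath(".//span[@class='p_title']/a/text()").get(default="").strip())     # 示例帖子标题
print(post.xpath(".//a[@class='p_forum']/font/text()").get(default="").strip())     # 盗墓笔记
```

Keeping the HTML parsing isolated in `help.py` means that, if tieba changes its page markup, only this extractor layer needs updating. Together with the `tieba` entries added to `cmd_arg/arg.py` and `CrawlerFactory`, the new platform should be reachable via something like `python main.py --platform tieba --lt qrcode --type search` (flag names taken from the diff above).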