2024-10-19 16:43:25 +00:00
|
|
|
|
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
|
|
|
|
# 1. 不得用于任何商业用途。
|
|
|
|
|
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
|
|
|
|
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
|
|
|
|
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
|
|
|
|
# 5. 不得用于任何非法或不当的用途。
|
|
|
|
|
#
|
|
|
|
|
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
|
|
|
|
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
|
|
|
|
|
|
|
|
|
|
2023-12-24 09:57:48 +00:00
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
# @Author : relakkes@gmail.com
|
|
|
|
|
# @Time : 2023/12/23 15:40
|
|
|
|
|
# @Desc : 微博爬虫 API 请求 client
|
|
|
|
|
|
|
|
|
|
import asyncio
|
2023-12-24 16:02:11 +00:00
|
|
|
|
import copy
|
2023-12-24 09:57:48 +00:00
|
|
|
|
import json
|
2023-12-24 16:02:11 +00:00
|
|
|
|
import re
|
2024-08-23 21:52:11 +00:00
|
|
|
|
from typing import Callable, Dict, List, Optional, Union
|
|
|
|
|
from urllib.parse import parse_qs, unquote, urlencode
|
2023-12-24 09:57:48 +00:00
|
|
|
|
|
|
|
|
|
import httpx
|
2024-08-23 21:52:11 +00:00
|
|
|
|
from httpx import Response
|
2023-12-24 09:57:48 +00:00
|
|
|
|
from playwright.async_api import BrowserContext, Page
|
2024-04-17 15:13:40 +00:00
|
|
|
|
|
2024-08-04 16:48:42 +00:00
|
|
|
|
import config
|
2023-12-24 09:57:48 +00:00
|
|
|
|
from tools import utils
|
|
|
|
|
|
|
|
|
|
from .exception import DataFetchError
|
|
|
|
|
from .field import SearchType
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class WeiboClient:
|
|
|
|
|
def __init__(
|
|
|
|
|
self,
|
|
|
|
|
timeout=10,
|
|
|
|
|
proxies=None,
|
|
|
|
|
*,
|
|
|
|
|
headers: Dict[str, str],
|
|
|
|
|
playwright_page: Page,
|
|
|
|
|
cookie_dict: Dict[str, str],
|
|
|
|
|
):
|
|
|
|
|
self.proxies = proxies
|
|
|
|
|
self.timeout = timeout
|
|
|
|
|
self.headers = headers
|
|
|
|
|
self._host = "https://m.weibo.cn"
|
|
|
|
|
self.playwright_page = playwright_page
|
|
|
|
|
self.cookie_dict = cookie_dict
|
2024-04-09 09:21:52 +00:00
|
|
|
|
self._image_agent_host = "https://i1.wp.com/"
|
2023-12-24 09:57:48 +00:00
|
|
|
|
|
2024-08-23 21:52:11 +00:00
|
|
|
|
async def request(self, method, url, **kwargs) -> Union[Response, Dict]:
|
|
|
|
|
enable_return_response = kwargs.pop("return_response", False)
|
2023-12-24 09:57:48 +00:00
|
|
|
|
async with httpx.AsyncClient(proxies=self.proxies) as client:
|
|
|
|
|
response = await client.request(
|
|
|
|
|
method, url, timeout=self.timeout,
|
|
|
|
|
**kwargs
|
|
|
|
|
)
|
2024-08-23 21:52:11 +00:00
|
|
|
|
|
|
|
|
|
if enable_return_response:
|
|
|
|
|
return response
|
|
|
|
|
|
2023-12-24 09:57:48 +00:00
|
|
|
|
data: Dict = response.json()
|
2024-08-23 21:52:11 +00:00
|
|
|
|
ok_code = data.get("ok")
|
|
|
|
|
if ok_code not in [0, 1]:
|
2023-12-24 09:57:48 +00:00
|
|
|
|
utils.logger.error(f"[WeiboClient.request] request {method}:{url} err, res:{data}")
|
|
|
|
|
raise DataFetchError(data.get("msg", "unkonw error"))
|
|
|
|
|
else:
|
|
|
|
|
return data.get("data", {})
|
|
|
|
|
|
2024-08-23 21:52:11 +00:00
|
|
|
|
async def get(self, uri: str, params=None, headers=None, **kwargs) -> Union[Response, Dict]:
|
2023-12-24 09:57:48 +00:00
|
|
|
|
final_uri = uri
|
|
|
|
|
if isinstance(params, dict):
|
|
|
|
|
final_uri = (f"{uri}?"
|
|
|
|
|
f"{urlencode(params)}")
|
2023-12-24 16:02:11 +00:00
|
|
|
|
|
|
|
|
|
if headers is None:
|
|
|
|
|
headers = self.headers
|
2024-08-23 21:52:11 +00:00
|
|
|
|
return await self.request(method="GET", url=f"{self._host}{final_uri}", headers=headers, **kwargs)
|
2023-12-24 09:57:48 +00:00
|
|
|
|
|
|
|
|
|
async def post(self, uri: str, data: dict) -> Dict:
|
|
|
|
|
json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False)
|
|
|
|
|
return await self.request(method="POST", url=f"{self._host}{uri}",
|
|
|
|
|
data=json_str, headers=self.headers)
|
|
|
|
|
|
|
|
|
|
async def pong(self) -> bool:
|
|
|
|
|
"""get a note to check if login state is ok"""
|
|
|
|
|
utils.logger.info("[WeiboClient.pong] Begin pong weibo...")
|
|
|
|
|
ping_flag = False
|
|
|
|
|
try:
|
2024-08-04 16:48:42 +00:00
|
|
|
|
uri = "/api/config"
|
2023-12-30 10:54:21 +00:00
|
|
|
|
resp_data: Dict = await self.request(method="GET", url=f"{self._host}{uri}", headers=self.headers)
|
|
|
|
|
if resp_data.get("login"):
|
|
|
|
|
ping_flag = True
|
2024-01-06 11:18:07 +00:00
|
|
|
|
else:
|
|
|
|
|
utils.logger.error(f"[WeiboClient.pong] cookie may be invalid and again login...")
|
2023-12-24 09:57:48 +00:00
|
|
|
|
except Exception as e:
|
2023-12-30 10:54:21 +00:00
|
|
|
|
utils.logger.error(f"[WeiboClient.pong] Pong weibo failed: {e}, and try to login again...")
|
2023-12-24 09:57:48 +00:00
|
|
|
|
ping_flag = False
|
|
|
|
|
return ping_flag
|
|
|
|
|
|
|
|
|
|
async def update_cookies(self, browser_context: BrowserContext):
|
|
|
|
|
cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies())
|
|
|
|
|
self.headers["Cookie"] = cookie_str
|
|
|
|
|
self.cookie_dict = cookie_dict
|
|
|
|
|
|
|
|
|
|
async def get_note_by_keyword(
|
|
|
|
|
self,
|
|
|
|
|
keyword: str,
|
|
|
|
|
page: int = 1,
|
|
|
|
|
search_type: SearchType = SearchType.DEFAULT
|
|
|
|
|
) -> Dict:
|
|
|
|
|
"""
|
|
|
|
|
search note by keyword
|
|
|
|
|
:param keyword: 微博搜搜的关键词
|
|
|
|
|
:param page: 分页参数 -当前页码
|
|
|
|
|
:param search_type: 搜索的类型,见 weibo/filed.py 中的枚举SearchType
|
|
|
|
|
:return:
|
|
|
|
|
"""
|
|
|
|
|
uri = "/api/container/getIndex"
|
|
|
|
|
containerid = f"100103type={search_type.value}&q={keyword}"
|
|
|
|
|
params = {
|
|
|
|
|
"containerid": containerid,
|
|
|
|
|
"page_type": "searchall",
|
|
|
|
|
"page": page,
|
|
|
|
|
}
|
|
|
|
|
return await self.get(uri, params)
|
2023-12-24 16:02:11 +00:00
|
|
|
|
|
|
|
|
|
async def get_note_comments(self, mid_id: str, max_id: int) -> Dict:
|
|
|
|
|
"""get notes comments
|
|
|
|
|
:param mid_id: 微博ID
|
|
|
|
|
:param max_id: 分页参数ID
|
|
|
|
|
:return:
|
|
|
|
|
"""
|
|
|
|
|
uri = "/comments/hotflow"
|
|
|
|
|
params = {
|
|
|
|
|
"id": mid_id,
|
|
|
|
|
"mid": mid_id,
|
|
|
|
|
"max_id_type": 0,
|
|
|
|
|
}
|
|
|
|
|
if max_id > 0:
|
|
|
|
|
params.update({"max_id": max_id})
|
|
|
|
|
|
|
|
|
|
referer_url = f"https://m.weibo.cn/detail/{mid_id}"
|
|
|
|
|
headers = copy.copy(self.headers)
|
|
|
|
|
headers["Referer"] = referer_url
|
|
|
|
|
|
|
|
|
|
return await self.get(uri, params, headers=headers)
|
|
|
|
|
|
2024-08-04 16:48:42 +00:00
|
|
|
|
async def get_note_all_comments(self, note_id: str, crawl_interval: float = 1.0,
|
2024-10-23 08:32:02 +00:00
|
|
|
|
callback: Optional[Callable] = None,
|
|
|
|
|
max_count: int = 10,
|
|
|
|
|
):
|
2023-12-24 16:02:11 +00:00
|
|
|
|
"""
|
|
|
|
|
get note all comments include sub comments
|
|
|
|
|
:param note_id:
|
|
|
|
|
:param crawl_interval:
|
|
|
|
|
:param callback:
|
2024-10-23 08:32:02 +00:00
|
|
|
|
:param max_count:
|
2023-12-24 16:02:11 +00:00
|
|
|
|
:return:
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
result = []
|
|
|
|
|
is_end = False
|
|
|
|
|
max_id = -1
|
2024-10-23 08:32:02 +00:00
|
|
|
|
while not is_end and len(result) < max_count:
|
2023-12-24 16:02:11 +00:00
|
|
|
|
comments_res = await self.get_note_comments(note_id, max_id)
|
|
|
|
|
max_id: int = comments_res.get("max_id")
|
|
|
|
|
comment_list: List[Dict] = comments_res.get("data", [])
|
|
|
|
|
is_end = max_id == 0
|
2024-10-23 08:32:02 +00:00
|
|
|
|
if len(result) + len(comment_list) > max_count:
|
|
|
|
|
comment_list = comment_list[:max_count - len(result)]
|
2023-12-24 16:02:11 +00:00
|
|
|
|
if callback: # 如果有回调函数,就执行回调函数
|
|
|
|
|
await callback(note_id, comment_list)
|
|
|
|
|
await asyncio.sleep(crawl_interval)
|
2024-08-04 16:48:42 +00:00
|
|
|
|
result.extend(comment_list)
|
|
|
|
|
sub_comment_result = await self.get_comments_all_sub_comments(note_id, comment_list, callback)
|
|
|
|
|
result.extend(sub_comment_result)
|
2023-12-24 16:02:11 +00:00
|
|
|
|
return result
|
|
|
|
|
|
2024-08-04 16:48:42 +00:00
|
|
|
|
@staticmethod
|
|
|
|
|
async def get_comments_all_sub_comments(note_id: str, comment_list: List[Dict],
|
|
|
|
|
callback: Optional[Callable] = None) -> List[Dict]:
|
|
|
|
|
"""
|
|
|
|
|
获取评论的所有子评论
|
|
|
|
|
Args:
|
|
|
|
|
note_id:
|
|
|
|
|
comment_list:
|
|
|
|
|
callback:
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
if not config.ENABLE_GET_SUB_COMMENTS:
|
|
|
|
|
utils.logger.info(
|
|
|
|
|
f"[WeiboClient.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled")
|
|
|
|
|
return []
|
|
|
|
|
|
|
|
|
|
res_sub_comments = []
|
|
|
|
|
for comment in comment_list:
|
|
|
|
|
sub_comments = comment.get("comments")
|
|
|
|
|
if sub_comments and isinstance(sub_comments, list):
|
|
|
|
|
await callback(note_id, sub_comments)
|
|
|
|
|
res_sub_comments.extend(sub_comments)
|
|
|
|
|
return res_sub_comments
|
|
|
|
|
|
2023-12-24 16:02:11 +00:00
|
|
|
|
async def get_note_info_by_id(self, note_id: str) -> Dict:
|
|
|
|
|
"""
|
|
|
|
|
根据帖子ID获取详情
|
|
|
|
|
:param note_id:
|
|
|
|
|
:return:
|
|
|
|
|
"""
|
|
|
|
|
url = f"{self._host}/detail/{note_id}"
|
|
|
|
|
async with httpx.AsyncClient(proxies=self.proxies) as client:
|
|
|
|
|
response = await client.request(
|
|
|
|
|
"GET", url, timeout=self.timeout, headers=self.headers
|
|
|
|
|
)
|
|
|
|
|
if response.status_code != 200:
|
|
|
|
|
raise DataFetchError(f"get weibo detail err: {response.text}")
|
|
|
|
|
match = re.search(r'var \$render_data = (\[.*?\])\[0\]', response.text, re.DOTALL)
|
|
|
|
|
if match:
|
|
|
|
|
render_data_json = match.group(1)
|
|
|
|
|
render_data_dict = json.loads(render_data_json)
|
|
|
|
|
note_detail = render_data_dict[0].get("status")
|
|
|
|
|
note_item = {
|
|
|
|
|
"mblog": note_detail
|
|
|
|
|
}
|
|
|
|
|
return note_item
|
|
|
|
|
else:
|
|
|
|
|
utils.logger.info(f"[WeiboClient.get_note_info_by_id] 未找到$render_data的值")
|
|
|
|
|
return dict()
|
2024-04-09 09:21:52 +00:00
|
|
|
|
|
|
|
|
|
async def get_note_image(self, image_url: str) -> bytes:
|
2024-08-04 16:48:42 +00:00
|
|
|
|
image_url = image_url[8:] # 去掉 https://
|
2024-04-09 09:21:52 +00:00
|
|
|
|
sub_url = image_url.split("/")
|
|
|
|
|
image_url = ""
|
|
|
|
|
for i in range(len(sub_url)):
|
|
|
|
|
if i == 1:
|
2024-08-04 16:48:42 +00:00
|
|
|
|
image_url += "large/" # 都获取高清大图
|
2024-04-09 09:21:52 +00:00
|
|
|
|
elif i == len(sub_url) - 1:
|
|
|
|
|
image_url += sub_url[i]
|
|
|
|
|
else:
|
|
|
|
|
image_url += sub_url[i] + "/"
|
|
|
|
|
# 微博图床对外存在防盗链,所以需要代理访问
|
|
|
|
|
# 由于微博图片是通过 i1.wp.com 来访问的,所以需要拼接一下
|
|
|
|
|
final_uri = (f"{self._image_agent_host}" f"{image_url}")
|
|
|
|
|
async with httpx.AsyncClient(proxies=self.proxies) as client:
|
|
|
|
|
response = await client.request("GET", final_uri, timeout=self.timeout)
|
|
|
|
|
if not response.reason_phrase == "OK":
|
|
|
|
|
utils.logger.error(f"[WeiboClient.get_note_image] request {final_uri} err, res:{response.text}")
|
|
|
|
|
return None
|
|
|
|
|
else:
|
2024-08-04 16:48:42 +00:00
|
|
|
|
return response.content
|
2024-08-23 21:52:11 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
async def get_creator_container_info(self, creator_id: str) -> Dict:
|
|
|
|
|
"""
|
|
|
|
|
获取用户的容器ID, 容器信息代表着真实请求的API路径
|
|
|
|
|
fid_container_id:用户的微博详情API的容器ID
|
|
|
|
|
lfid_container_id:用户的微博列表API的容器ID
|
|
|
|
|
Args:
|
|
|
|
|
creator_id:
|
|
|
|
|
|
|
|
|
|
Returns: {
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
response = await self.get(f"/u/{creator_id}", return_response=True)
|
|
|
|
|
m_weibocn_params = response.cookies.get("M_WEIBOCN_PARAMS")
|
|
|
|
|
if not m_weibocn_params:
|
|
|
|
|
raise DataFetchError("get containerid failed")
|
|
|
|
|
m_weibocn_params_dict = parse_qs(unquote(m_weibocn_params))
|
|
|
|
|
return {
|
|
|
|
|
"fid_container_id": m_weibocn_params_dict.get("fid", [""])[0],
|
|
|
|
|
"lfid_container_id": m_weibocn_params_dict.get("lfid", [""])[0]
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async def get_creator_info_by_id(self, creator_id: str) -> Dict:
|
|
|
|
|
"""
|
|
|
|
|
根据用户ID获取用户详情
|
|
|
|
|
Args:
|
|
|
|
|
creator_id:
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
uri = "/api/container/getIndex"
|
|
|
|
|
container_info = await self.get_creator_container_info(creator_id)
|
|
|
|
|
if container_info.get("fid_container_id") == "" or container_info.get("lfid_container_id") == "":
|
|
|
|
|
utils.logger.error(f"[WeiboClient.get_creator_info_by_id] get containerid failed")
|
|
|
|
|
raise DataFetchError("get containerid failed")
|
|
|
|
|
params = {
|
|
|
|
|
"jumpfrom": "weibocom",
|
|
|
|
|
"type": "uid",
|
|
|
|
|
"value": creator_id,
|
|
|
|
|
"containerid": container_info["fid_container_id"],
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
user_res = await self.get(uri, params)
|
|
|
|
|
|
|
|
|
|
if user_res.get("tabsInfo"):
|
|
|
|
|
tabs: List[Dict] = user_res.get("tabsInfo", {}).get("tabs", [])
|
|
|
|
|
for tab in tabs:
|
|
|
|
|
if tab.get("tabKey") == "weibo":
|
|
|
|
|
container_info["lfid_container_id"] = tab.get("containerid")
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
user_res.update(container_info)
|
|
|
|
|
return user_res
|
|
|
|
|
|
|
|
|
|
async def get_notes_by_creator(self, creator: str, container_id: str, since_id: str = "0", ) -> Dict:
|
|
|
|
|
"""
|
|
|
|
|
获取博主的笔记
|
|
|
|
|
Args:
|
|
|
|
|
creator: 博主ID
|
|
|
|
|
container_id: 容器ID
|
|
|
|
|
since_id: 上一页最后一条笔记的ID
|
|
|
|
|
Returns:
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
uri = "/api/container/getIndex"
|
|
|
|
|
params = {
|
|
|
|
|
"jumpfrom": "weibocom",
|
|
|
|
|
"type": "uid",
|
|
|
|
|
"value": creator,
|
|
|
|
|
"containerid": container_id,
|
|
|
|
|
"since_id": since_id,
|
|
|
|
|
}
|
|
|
|
|
return await self.get(uri, params)
|
|
|
|
|
|
|
|
|
|
async def get_all_notes_by_creator_id(self, creator_id: str, container_id: str, crawl_interval: float = 1.0,
|
|
|
|
|
callback: Optional[Callable] = None) -> List[Dict]:
|
|
|
|
|
"""
|
|
|
|
|
获取指定用户下的所有发过的帖子,该方法会一直查找一个用户下的所有帖子信息
|
|
|
|
|
Args:
|
|
|
|
|
creator_id:
|
|
|
|
|
container_id:
|
|
|
|
|
crawl_interval:
|
|
|
|
|
callback:
|
|
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
result = []
|
|
|
|
|
notes_has_more = True
|
|
|
|
|
since_id = ""
|
|
|
|
|
crawler_total_count = 0
|
|
|
|
|
while notes_has_more:
|
|
|
|
|
notes_res = await self.get_notes_by_creator(creator_id, container_id, since_id)
|
|
|
|
|
if not notes_res:
|
|
|
|
|
utils.logger.error(
|
|
|
|
|
f"[WeiboClient.get_notes_by_creator] The current creator may have been banned by xhs, so they cannot access the data.")
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
notes_has_more = notes_res.get("cardlistInfo", {}).get("total", 0) > crawler_total_count
|
|
|
|
|
since_id = notes_res.get("cardlistInfo", {}).get("since_id", "0")
|
|
|
|
|
notes_has_more += 10
|
|
|
|
|
if "cards" not in notes_res:
|
|
|
|
|
utils.logger.info(
|
|
|
|
|
f"[WeiboClient.get_all_notes_by_creator] No 'notes' key found in response: {notes_res}")
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
notes = notes_res["cards"]
|
|
|
|
|
utils.logger.info(
|
|
|
|
|
f"[WeiboClient.get_all_notes_by_creator] got user_id:{creator_id} notes len : {len(notes)}")
|
|
|
|
|
notes = [note for note in notes if note.get("card_type") == 9]
|
|
|
|
|
if callback:
|
|
|
|
|
await callback(notes)
|
|
|
|
|
await asyncio.sleep(crawl_interval)
|
|
|
|
|
result.extend(notes)
|
|
|
|
|
return result
|
|
|
|
|
|