feat: weibo支持指定创作者主页

This commit is contained in:
Relakkes 2024-08-24 05:52:11 +08:00
parent 61f023edac
commit ab7d8142af
9 changed files with 368 additions and 16 deletions

View File

@ -27,7 +27,7 @@
| 抖音 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| 快手 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| B 站 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| 微博 | ✅ | ✅ | ✅ | | ✅ | ✅ | ✅ |
| 微博 | ✅ | ✅ | ✅ | | ✅ | ✅ | ✅ |
| 贴吧 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ |

View File

@ -85,6 +85,12 @@ WEIBO_SPECIFIED_ID_LIST = [
# ........................
]
# 指定weibo创作者ID列表
WEIBO_CREATOR_ID_LIST = [
"5533390220",
# ........................
]
# 指定贴吧需要爬取的帖子列表
TIEBA_SPECIFIED_ID_LIST = [

View File

@ -7,10 +7,11 @@ import asyncio
import copy
import json
import re
from typing import Any, Callable, Dict, List, Optional
from urllib.parse import urlencode
from typing import Callable, Dict, List, Optional, Union
from urllib.parse import parse_qs, unquote, urlencode
import httpx
from httpx import Response
from playwright.async_api import BrowserContext, Page
import config
@ -38,20 +39,26 @@ class WeiboClient:
self.cookie_dict = cookie_dict
self._image_agent_host = "https://i1.wp.com/"
async def request(self, method, url, **kwargs) -> Any:
async def request(self, method, url, **kwargs) -> Union[Response, Dict]:
enable_return_response = kwargs.pop("return_response", False)
async with httpx.AsyncClient(proxies=self.proxies) as client:
response = await client.request(
method, url, timeout=self.timeout,
**kwargs
)
if enable_return_response:
return response
data: Dict = response.json()
if data.get("ok") != 1:
ok_code = data.get("ok")
if ok_code not in [0, 1]:
utils.logger.error(f"[WeiboClient.request] request {method}:{url} err, res:{data}")
raise DataFetchError(data.get("msg", "unkonw error"))
else:
return data.get("data", {})
async def get(self, uri: str, params=None, headers=None) -> Dict:
async def get(self, uri: str, params=None, headers=None, **kwargs) -> Union[Response, Dict]:
final_uri = uri
if isinstance(params, dict):
final_uri = (f"{uri}?"
@ -59,7 +66,7 @@ class WeiboClient:
if headers is None:
headers = self.headers
return await self.request(method="GET", url=f"{self._host}{final_uri}", headers=headers)
return await self.request(method="GET", url=f"{self._host}{final_uri}", headers=headers, **kwargs)
async def post(self, uri: str, data: dict) -> Dict:
json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False)
@ -229,3 +236,123 @@ class WeiboClient:
return None
else:
return response.content
async def get_creator_container_info(self, creator_id: str) -> Dict:
"""
获取用户的容器ID, 容器信息代表着真实请求的API路径
fid_container_id用户的微博详情API的容器ID
lfid_container_id用户的微博列表API的容器ID
Args:
creator_id:
Returns: {
"""
response = await self.get(f"/u/{creator_id}", return_response=True)
m_weibocn_params = response.cookies.get("M_WEIBOCN_PARAMS")
if not m_weibocn_params:
raise DataFetchError("get containerid failed")
m_weibocn_params_dict = parse_qs(unquote(m_weibocn_params))
return {
"fid_container_id": m_weibocn_params_dict.get("fid", [""])[0],
"lfid_container_id": m_weibocn_params_dict.get("lfid", [""])[0]
}
async def get_creator_info_by_id(self, creator_id: str) -> Dict:
"""
根据用户ID获取用户详情
Args:
creator_id:
Returns:
"""
uri = "/api/container/getIndex"
container_info = await self.get_creator_container_info(creator_id)
if container_info.get("fid_container_id") == "" or container_info.get("lfid_container_id") == "":
utils.logger.error(f"[WeiboClient.get_creator_info_by_id] get containerid failed")
raise DataFetchError("get containerid failed")
params = {
"jumpfrom": "weibocom",
"type": "uid",
"value": creator_id,
"containerid": container_info["fid_container_id"],
}
user_res = await self.get(uri, params)
if user_res.get("tabsInfo"):
tabs: List[Dict] = user_res.get("tabsInfo", {}).get("tabs", [])
for tab in tabs:
if tab.get("tabKey") == "weibo":
container_info["lfid_container_id"] = tab.get("containerid")
break
user_res.update(container_info)
return user_res
async def get_notes_by_creator(self, creator: str, container_id: str, since_id: str = "0", ) -> Dict:
"""
获取博主的笔记
Args:
creator: 博主ID
container_id: 容器ID
since_id: 上一页最后一条笔记的ID
Returns:
"""
uri = "/api/container/getIndex"
params = {
"jumpfrom": "weibocom",
"type": "uid",
"value": creator,
"containerid": container_id,
"since_id": since_id,
}
return await self.get(uri, params)
async def get_all_notes_by_creator_id(self, creator_id: str, container_id: str, crawl_interval: float = 1.0,
callback: Optional[Callable] = None) -> List[Dict]:
"""
获取指定用户下的所有发过的帖子该方法会一直查找一个用户下的所有帖子信息
Args:
creator_id:
container_id:
crawl_interval:
callback:
Returns:
"""
result = []
notes_has_more = True
since_id = ""
crawler_total_count = 0
while notes_has_more:
notes_res = await self.get_notes_by_creator(creator_id, container_id, since_id)
if not notes_res:
utils.logger.error(
f"[WeiboClient.get_notes_by_creator] The current creator may have been banned by xhs, so they cannot access the data.")
break
notes_has_more = notes_res.get("cardlistInfo", {}).get("total", 0) > crawler_total_count
since_id = notes_res.get("cardlistInfo", {}).get("since_id", "0")
notes_has_more += 10
if "cards" not in notes_res:
utils.logger.info(
f"[WeiboClient.get_all_notes_by_creator] No 'notes' key found in response: {notes_res}")
break
notes = notes_res["cards"]
utils.logger.info(
f"[WeiboClient.get_all_notes_by_creator] got user_id:{creator_id} notes len : {len(notes)}")
notes = [note for note in notes if note.get("card_type") == 9]
if callback:
await callback(notes)
await asyncio.sleep(crawl_interval)
result.extend(notes)
return result

View File

@ -84,6 +84,9 @@ class WeiboCrawler(AbstractCrawler):
elif config.CRAWLER_TYPE == "detail":
# Get the information and comments of the specified post
await self.get_specified_notes()
elif config.CRAWLER_TYPE == "creator":
# Get creator's information and their notes and comments
await self.get_creators_and_notes()
else:
pass
utils.logger.info("[WeiboCrawler.start] Weibo Crawler finished ...")
@ -221,6 +224,41 @@ class WeiboCrawler(AbstractCrawler):
extension_file_name = url.split(".")[-1]
await weibo_store.update_weibo_note_image(pic["pid"], content, extension_file_name)
async def get_creators_and_notes(self) -> None:
"""
Get creator's information and their notes and comments
Returns:
"""
utils.logger.info("[WeiboCrawler.get_creators_and_notes] Begin get weibo creators")
for user_id in config.WEIBO_CREATOR_ID_LIST:
createor_info_res: Dict = await self.wb_client.get_creator_info_by_id(creator_id=user_id)
if createor_info_res:
createor_info: Dict = createor_info_res.get("userInfo", {})
utils.logger.info(f"[WeiboCrawler.get_creators_and_notes] creator info: {createor_info}")
if not createor_info:
raise DataFetchError("Get creator info error")
await weibo_store.save_creator(user_id, user_info=createor_info)
# Get all note information of the creator
all_notes_list = await self.wb_client.get_all_notes_by_creator_id(
creator_id=user_id,
container_id=createor_info_res.get("lfid_container_id"),
crawl_interval=0,
callback=weibo_store.batch_update_weibo_notes
)
note_ids = [note_item.get("mlog", {}).get("id") for note_item in all_notes_list if
note_item.get("mlog", {}).get("id")]
await self.batch_get_notes_comments(note_ids)
else:
utils.logger.error(
f"[WeiboCrawler.get_creators_and_notes] get creator info error, creator_id:{user_id}")
async def create_weibo_client(self, httpx_proxy: Optional[str]) -> WeiboClient:
"""Create xhs client"""
utils.logger.info("[WeiboCrawler.create_weibo_client] Begin create weibo API client ...")

View File

@ -185,9 +185,6 @@ class XiaoHongShuCrawler(AbstractCrawler):
async with semaphore:
try:
_note_detail: Dict = await self.xhs_client.get_note_by_id_from_html(note_id)
print("------------------------")
print(_note_detail)
print("------------------------")
if not _note_detail:
utils.logger.error(
f"[XiaoHongShuCrawler.get_note_detail_from_html] Get note detail error, note_id: {note_id}")

View File

@ -406,3 +406,22 @@ alter table kuaishou_video add column `source_keyword` varchar(255) default '' c
alter table weibo_note add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
alter table xhs_note add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
alter table tieba_note add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
DROP TABLE IF EXISTS `weibo_creator`;
CREATE TABLE `weibo_creator`
(
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID',
`user_id` varchar(64) NOT NULL COMMENT '用户ID',
`nickname` varchar(64) DEFAULT NULL COMMENT '用户昵称',
`avatar` varchar(255) DEFAULT NULL COMMENT '用户头像地址',
`ip_location` varchar(255) DEFAULT NULL COMMENT '评论时的IP地址',
`add_ts` bigint NOT NULL COMMENT '记录添加时间戳',
`last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳',
`desc` longtext COMMENT '用户描述',
`gender` varchar(1) DEFAULT NULL COMMENT '性别',
`follows` varchar(16) DEFAULT NULL COMMENT '关注数',
`fans` varchar(16) DEFAULT NULL COMMENT '粉丝数',
`tag_list` longtext COMMENT '标签列表',
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='微博博主';

View File

@ -7,6 +7,7 @@ import re
from typing import List
from var import source_keyword_var
from .weibo_store_image import *
from .weibo_store_impl import *
@ -27,7 +28,33 @@ class WeibostoreFactory:
return store_class()
async def batch_update_weibo_notes(note_list: List[Dict]):
"""
Batch update weibo notes
Args:
note_list:
Returns:
"""
if not note_list:
return
for note_item in note_list:
await update_weibo_note(note_item)
async def update_weibo_note(note_item: Dict):
"""
Update weibo note
Args:
note_item:
Returns:
"""
if not note_item:
return
mblog: Dict = note_item.get("mblog")
user_info: Dict = mblog.get("user")
note_id = mblog.get("id")
@ -61,6 +88,15 @@ async def update_weibo_note(note_item: Dict):
async def batch_update_weibo_note_comments(note_id: str, comments: List[Dict]):
"""
Batch update weibo note comments
Args:
note_id:
comments:
Returns:
"""
if not comments:
return
for comment_item in comments:
@ -68,6 +104,17 @@ async def batch_update_weibo_note_comments(note_id: str, comments: List[Dict]):
async def update_weibo_note_comment(note_id: str, comment_item: Dict):
"""
Update weibo note comment
Args:
note_id: weibo note id
comment_item: weibo comment item
Returns:
"""
if not comment_item or not note_id:
return
comment_id = str(comment_item.get("id"))
user_info: Dict = comment_item.get("user")
content_text = comment_item.get("text")
@ -95,5 +142,43 @@ async def update_weibo_note_comment(note_id: str, comment_item: Dict):
f"[store.weibo.update_weibo_note_comment] Weibo note comment: {comment_id}, content: {save_comment_item.get('content', '')[:24]} ...")
await WeibostoreFactory.create_store().store_comment(comment_item=save_comment_item)
async def update_weibo_note_image(picid: str, pic_content, extension_file_name):
await WeiboStoreImage().store_image({"pic_id": picid, "pic_content": pic_content, "extension_file_name": extension_file_name})
"""
Save weibo note image to local
Args:
picid:
pic_content:
extension_file_name:
Returns:
"""
await WeiboStoreImage().store_image(
{"pic_id": picid, "pic_content": pic_content, "extension_file_name": extension_file_name})
async def save_creator(user_id: str, user_info: Dict):
"""
Save creator information to local
Args:
user_id:
user_info:
Returns:
"""
local_db_item = {
'user_id': user_id,
'nickname': user_info.get('screen_name'),
'gender': '' if user_info.get('gender') == "f" else '',
'avatar': user_info.get('avatar_hd'),
'desc': user_info.get('description'),
'ip_location': user_info.get("source", "").replace("来自", ""),
'follows': user_info.get('follow_count', ''),
'fans': user_info.get('followers_count', ''),
'tag_list': '',
"last_modify_ts": utils.get_current_timestamp(),
}
utils.logger.info(f"[store.weibo.save_creator] creator:{local_db_item}")
await WeibostoreFactory.create_store().store_creator(local_db_item)

View File

@ -33,9 +33,6 @@ def calculate_number_of_files(file_store_path: str) -> int:
class WeiboCsvStoreImplement(AbstractStore):
async def store_creator(self, creator: Dict):
pass
csv_store_path: str = "data/weibo"
file_count: int = calculate_number_of_files(csv_store_path)
@ -91,6 +88,17 @@ class WeiboCsvStoreImplement(AbstractStore):
"""
await self.save_data_to_csv(save_item=comment_item, store_type="comments")
async def store_creator(self, creator: Dict):
"""
Weibo creator CSV storage implementation
Args:
creator:
Returns:
"""
await self.save_data_to_csv(save_item=creator, store_type="creators")
class WeiboDbStoreImplement(AbstractStore):
@ -136,7 +144,25 @@ class WeiboDbStoreImplement(AbstractStore):
await update_comment_by_comment_id(comment_id, comment_item=comment_item)
async def store_creator(self, creator: Dict):
pass
"""
Weibo creator DB storage implementation
Args:
creator:
Returns:
"""
from .weibo_store_sql import (add_new_creator,
query_creator_by_user_id,
update_creator_by_user_id)
user_id = creator.get("user_id")
user_detail: Dict = await query_creator_by_user_id(user_id)
if not user_detail:
creator["add_ts"] = utils.get_current_timestamp()
await add_new_creator(creator)
else:
await update_creator_by_user_id(user_id, creator)
class WeiboJsonStoreImplement(AbstractStore):
@ -214,4 +240,12 @@ class WeiboJsonStoreImplement(AbstractStore):
await self.save_data_to_json(comment_item, "comments")
async def store_creator(self, creator: Dict):
pass
"""
creator JSON storage implementation
Args:
creator:
Returns:
"""
await self.save_data_to_json(creator, "creators")

View File

@ -100,3 +100,49 @@ async def update_comment_by_comment_id(comment_id: str, comment_item: Dict) -> i
async_db_conn: AsyncMysqlDB = media_crawler_db_var.get()
effect_row: int = await async_db_conn.update_table("weibo_note_comment", comment_item, "comment_id", comment_id)
return effect_row
async def query_creator_by_user_id(user_id: str) -> Dict:
"""
查询一条创作者记录
Args:
user_id:
Returns:
"""
async_db_conn: AsyncMysqlDB = media_crawler_db_var.get()
sql: str = f"select * from weibo_creator where user_id = '{user_id}'"
rows: List[Dict] = await async_db_conn.query(sql)
if len(rows) > 0:
return rows[0]
return dict()
async def add_new_creator(creator_item: Dict) -> int:
"""
新增一条创作者信息
Args:
creator_item:
Returns:
"""
async_db_conn: AsyncMysqlDB = media_crawler_db_var.get()
last_row_id: int = await async_db_conn.item_to_table("weibo_creator", creator_item)
return last_row_id
async def update_creator_by_user_id(user_id: str, creator_item: Dict) -> int:
"""
更新一条创作者信息
Args:
user_id:
creator_item:
Returns:
"""
async_db_conn: AsyncMysqlDB = media_crawler_db_var.get()
effect_row: int = await async_db_conn.update_table("weibo_creator", creator_item, "user_id", user_id)
return effect_row