fix: update xhs post detail fetching
commit 03e393949a
parent 9fe3e47b0f
@@ -57,18 +57,26 @@ MAX_CONCURRENCY_NUM = 1
 ENABLE_GET_IMAGES = False
 
 # Whether to enable comment crawling; enabled by default
-ENABLE_GET_COMMENTS = False
+ENABLE_GET_COMMENTS = True
 
 # Whether to enable second-level (reply) comment crawling; disabled by default
 # If an older deployment uses the db, add the table fields per schema/tables.sql line 287
 ENABLE_GET_SUB_COMMENTS = False
 
-# List of XiaoHongShu note IDs to crawl
-XHS_SPECIFIED_ID_LIST = [
-    "6422c2750000000027000d88",
-    # ........................
-]
+# Deprecated ⚠️⚠️⚠️ list of XiaoHongShu note IDs to crawl
+# Deprecated ⚠️⚠️⚠️ crawling by note ID alone fails because the xsec_token and xsec_source parameters are missing
+# XHS_SPECIFIED_ID_LIST = [
+#     "66fad51c000000001b0224b8",
+#     # ........................
+# ]
+
+# List of XiaoHongShu note URLs to crawl; for now each URL must carry the xsec_token and xsec_source parameters
+XHS_SPECIFIED_NOTE_URL_LIST = [
+    "https://www.xiaohongshu.com/explore/66fad51c000000001b0224b8?xsec_token=AB3rO-QopW5sgrJ41GwN01WCXh6yWPxjSoFI9D5JIMgKw=&xsec_source=pc_search"
+    # ........................
+]
+
 
 # List of Douyin IDs to crawl
 DY_SPECIFIED_ID_LIST = [
     "7280854932641664319",
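
Note: a URL in XHS_SPECIFIED_NOTE_URL_LIST has to be split back into the note ID plus the xsec_token/xsec_source query parameters before it can be used. A minimal standard-library sketch, using the example URL from the config above (the project's own helpers for this, parse_note_info_from_note_url and extract_url_params_to_dict, are added further down in this commit):

    from urllib.parse import parse_qsl, urlparse

    url = ("https://www.xiaohongshu.com/explore/66fad51c000000001b0224b8"
           "?xsec_token=AB3rO-QopW5sgrJ41GwN01WCXh6yWPxjSoFI9D5JIMgKw=&xsec_source=pc_search")
    parsed = urlparse(url)
    note_id = parsed.path.rstrip("/").split("/")[-1]   # "66fad51c000000001b0224b8"
    params = dict(parse_qsl(parsed.query))             # {"xsec_token": "...", "xsec_source": "pc_search"}
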
@@ -99,6 +99,13 @@ class XiaoHongShuClient(AbstractApiClient):
                 **kwargs
             )
 
+        if response.status_code == 471 or response.status_code == 461:
+            # someday someone maybe will bypass captcha
+            verify_type = response.headers['Verifytype']
+            verify_uuid = response.headers['Verifyuuid']
+            raise Exception(
+                f"Captcha triggered, request failed, Verifytype: {verify_type}, Verifyuuid: {verify_uuid}, Response: {response}")
+
         if return_response:
             return response.text
         data: Dict = response.json()
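
Note: the new branch turns XiaoHongShu's anti-bot challenge (HTTP 461/471) into an explicit exception instead of letting the caller parse an error page. A rough sketch of how a caller could back off and retry around it; the header names follow the hunk above, while the retry policy is purely illustrative:

    import asyncio

    async def request_with_backoff(client, method, url, attempts=3, **kwargs):
        # Retry a few times when the captcha/verification exception fires,
        # sleeping progressively longer between attempts, then re-raise.
        for attempt in range(attempts):
            try:
                return await client.request(method, url, **kwargs)
            except Exception as exc:  # the hunk above raises a plain Exception
                if "Verifytype" not in str(exc) or attempt == attempts - 1:
                    raise
                await asyncio.sleep(2 ** attempt)
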
@@ -228,8 +235,8 @@ class XiaoHongShuClient(AbstractApiClient):
             "source_note_id": note_id,
             "image_formats": ["jpg", "webp", "avif"],
             "extra": {"need_body_topic": 1},
-            # "xsec_source": xsec_source,
-            # "xsec_token": xsec_token
+            "xsec_source": xsec_source,
+            "xsec_token": xsec_token
         }
         uri = "/api/sns/web/v1/feed"
         res = await self.post(uri, data)
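
Note: with the two fields no longer commented out, the /api/sns/web/v1/feed request body looks roughly like this; the values are placeholders taken from the example URL in the config hunk above:

    data = {
        "source_note_id": "66fad51c000000001b0224b8",
        "image_formats": ["jpg", "webp", "avif"],
        "extra": {"need_body_topic": 1},
        "xsec_source": "pc_search",                                      # parsed from the note URL
        "xsec_token": "AB3rO-QopW5sgrJ41GwN01WCXh6yWPxjSoFI9D5JIMgKw=",  # parsed from the note URL
    }
    # res = await self.post("/api/sns/web/v1/feed", data)
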
@ -454,13 +461,15 @@ class XiaoHongShuClient(AbstractApiClient):
|
||||||
return await self.post(uri, data=data, return_response=True)
|
return await self.post(uri, data=data, return_response=True)
|
||||||
|
|
||||||
@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
|
@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
|
||||||
async def get_note_by_id_from_html(self, note_id: str):
|
async def get_note_by_id_from_html(self, note_id: str, xsec_source: str, xsec_token: str) -> Dict:
|
||||||
"""
|
"""
|
||||||
通过解析网页版的笔记详情页HTML,获取笔记详情, 该接口可能会出现失败的情况,这里尝试重试3次
|
通过解析网页版的笔记详情页HTML,获取笔记详情, 该接口可能会出现失败的情况,这里尝试重试3次
|
||||||
copy from https://github.com/ReaJason/xhs/blob/eb1c5a0213f6fbb592f0a2897ee552847c69ea2d/xhs/core.py#L217-L259
|
copy from https://github.com/ReaJason/xhs/blob/eb1c5a0213f6fbb592f0a2897ee552847c69ea2d/xhs/core.py#L217-L259
|
||||||
thanks for ReaJason
|
thanks for ReaJason
|
||||||
Args:
|
Args:
|
||||||
note_id:
|
note_id:
|
||||||
|
xsec_source:
|
||||||
|
xsec_token:
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
|
|
||||||
|
@ -488,7 +497,7 @@ class XiaoHongShuClient(AbstractApiClient):
|
||||||
dict_new[new_key] = value
|
dict_new[new_key] = value
|
||||||
return dict_new
|
return dict_new
|
||||||
|
|
||||||
url = "https://www.xiaohongshu.com/explore/" + note_id
|
url = "https://www.xiaohongshu.com/explore/" + note_id + f"?xsec_token={xsec_token}&xsec_source={xsec_source}"
|
||||||
html = await self.request(method="GET", url=url, return_response=True, headers=self.headers)
|
html = await self.request(method="GET", url=url, return_response=True, headers=self.headers)
|
||||||
state = re.findall(r"window.__INITIAL_STATE__=({.*})</script>", html)[0].replace("undefined", '""')
|
state = re.findall(r"window.__INITIAL_STATE__=({.*})</script>", html)[0].replace("undefined", '""')
|
||||||
if state != "{}":
|
if state != "{}":
|
||||||
|
|
|
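
Note: the HTML fallback now builds the explore URL with the xsec parameters and then pulls the note data out of the embedded window.__INITIAL_STATE__ blob. A self-contained sketch of that extraction step (the regex mirrors the hunk above; the real code additionally converts camelCase keys, and json.loads here assumes the blob is valid JSON):

    import json
    import re

    def extract_initial_state(html: str) -> dict:
        # Grab the window.__INITIAL_STATE__ = {...} payload embedded in the note page.
        matches = re.findall(r"window.__INITIAL_STATE__=({.*})</script>", html)
        if not matches:
            return {}
        raw = matches[0].replace("undefined", '""')
        return json.loads(raw) if raw != "{}" else {}
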
@@ -21,6 +21,7 @@ from tenacity import RetryError
 
 import config
 from base.base_crawler import AbstractCrawler
+from model.m_xiaohongshu import NoteUrlInfo
 from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
 from store import xhs as xhs_store
 from tools import utils
@@ -29,6 +30,7 @@ from var import crawler_type_var, source_keyword_var
 from .client import XiaoHongShuClient
 from .exception import DataFetchError
 from .field import SearchSortType
+from .help import parse_note_info_from_note_url
 from .login import XiaoHongShuLogin
 
 
@@ -191,48 +193,40 @@ class XiaoHongShuCrawler(AbstractCrawler):
                     await xhs_store.update_xhs_note(note_detail)
 
     async def get_specified_notes(self):
-        """Get the information and comments of the specified post"""
-
-        async def get_note_detail_from_html_task(note_id: str, semaphore: asyncio.Semaphore) -> Dict:
-            async with semaphore:
-                try:
-                    _note_detail: Dict = await self.xhs_client.get_note_by_id_from_html(note_id)
-                    if not _note_detail:
-                        utils.logger.error(
-                            f"[XiaoHongShuCrawler.get_note_detail_from_html] Get note detail error, note_id: {note_id}")
-                        return {}
-                    return _note_detail
-                except DataFetchError as ex:
-                    utils.logger.error(f"[XiaoHongShuCrawler.get_note_detail_from_html] Get note detail error: {ex}")
-                    return {}
-                except KeyError as ex:
-                    utils.logger.error(
-                        f"[XiaoHongShuCrawler.get_note_detail_from_html] have not found note detail note_id:{note_id}, err: {ex}")
-                    return {}
-                except RetryError as ex:
-                    utils.logger.error(
-                        f"[XiaoHongShuCrawler.get_note_detail_from_html] Retry error, note_id:{note_id}, err: {ex}")
-
-        get_note_detail_task_list = [
-            get_note_detail_from_html_task(note_id=note_id, semaphore=asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)) for
-            note_id in config.XHS_SPECIFIED_ID_LIST
-        ]
+        """
+        Get the information and comments of the specified post
+        note_id, xsec_source and xsec_token must all be specified ⚠️⚠️⚠️
+        Returns:
+
+        """
+        get_note_detail_task_list = []
+        for full_note_url in config.XHS_SPECIFIED_NOTE_URL_LIST:
+            note_url_info: NoteUrlInfo = parse_note_info_from_note_url(full_note_url)
+            utils.logger.info(f"[XiaoHongShuCrawler.get_specified_notes] Parse note url info: {note_url_info}")
+            crawler_task = self.get_note_detail_async_task(
+                note_id=note_url_info.note_id,
+                xsec_source=note_url_info.xsec_source,
+                xsec_token=note_url_info.xsec_token,
+                semaphore=asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
+            )
+            get_note_detail_task_list.append(crawler_task)
 
         need_get_comment_note_ids = []
         note_details = await asyncio.gather(*get_note_detail_task_list)
         for note_detail in note_details:
             if note_detail:
-                need_get_comment_note_ids.append(note_detail.get("note_id"))
+                need_get_comment_note_ids.append(note_detail.get("note_id", ""))
                 await xhs_store.update_xhs_note(note_detail)
         await self.batch_get_note_comments(need_get_comment_note_ids)
 
     async def get_note_detail_async_task(self, note_id: str, xsec_source: str, xsec_token: str, semaphore: asyncio.Semaphore) -> \
             Optional[Dict]:
         """Get note detail"""
         async with semaphore:
             try:
-                # note_detail: Dict = await self.xhs_client.get_note_by_id_from_html(note_id)
-                note_detail: Dict = await self.xhs_client.get_note_by_id(note_id, xsec_source, xsec_token)
+                note_detail: Dict = await self.xhs_client.get_note_by_id_from_html(note_id, xsec_source, xsec_token)
+                # note_detail: Dict = await self.xhs_client.get_note_by_id(note_id, xsec_source, xsec_token)
                 if not note_detail:
                     utils.logger.error(
                         f"[XiaoHongShuCrawler.get_note_detail_async_task] Get note detail error, note_id: {note_id}")
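
Note: the rewritten get_specified_notes builds one detail task per configured URL and gathers them. Keep in mind that an asyncio.Semaphore only throttles tasks that share the same instance; a generic sketch of the gather-with-shared-semaphore pattern, independent of this project's classes:

    import asyncio

    async def gather_bounded(fetch, items, limit=1):
        # Run fetch(item) for every item, with at most `limit` of them in flight.
        semaphore = asyncio.Semaphore(limit)

        async def one(item):
            async with semaphore:
                return await fetch(item)

        return await asyncio.gather(*(one(item) for item in items))
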
@@ -15,6 +15,9 @@ import random
 import time
 import urllib.parse
 
+from model.m_xiaohongshu import NoteUrlInfo
+from tools.crawler_util import extract_url_params_to_dict
+
 
 def sign(a1="", b1="", x_s="", x_t=""):
     """
@@ -288,6 +291,21 @@ def get_trace_id(img_url: str):
     return f"spectrum/{img_url.split('/')[-1]}" if img_url.find("spectrum") != -1 else img_url.split("/")[-1]
 
 
+def parse_note_info_from_note_url(url: str) -> NoteUrlInfo:
+    """
+    Parse the note info out of a XiaoHongShu note url
+    Args:
+        url: "https://www.xiaohongshu.com/explore/66fad51c000000001b0224b8?xsec_token=AB3rO-QopW5sgrJ41GwN01WCXh6yWPxjSoFI9D5JIMgKw=&xsec_source=pc_search"
+    Returns:
+
+    """
+    note_id = url.split("/")[-1].split("?")[0]
+    params = extract_url_params_to_dict(url)
+    xsec_token = params.get("xsec_token", "")
+    xsec_source = params.get("xsec_source", "")
+    return NoteUrlInfo(note_id=note_id, xsec_token=xsec_token, xsec_source=xsec_source)
+
+
 if __name__ == '__main__':
     _img_url = "https://sns-img-bd.xhscdn.com/7a3abfaf-90c1-a828-5de7-022c80b92aa3"
     # Get the URLs of one image address across multiple CDNs
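
Note: a quick usage sketch of the new helper. The example URL comes from its docstring; the absolute package path in the import is an assumption — inside the package it is the `from .help import parse_note_info_from_note_url` added to the crawler above:

    from media_platform.xhs.help import parse_note_info_from_note_url  # path assumed from the package layout

    info = parse_note_info_from_note_url(
        "https://www.xiaohongshu.com/explore/66fad51c000000001b0224b8"
        "?xsec_token=AB3rO-QopW5sgrJ41GwN01WCXh6yWPxjSoFI9D5JIMgKw=&xsec_source=pc_search"
    )
    print(info.note_id)      # 66fad51c000000001b0224b8
    print(info.xsec_token)   # AB3rO-QopW5sgrJ41GwN01WCXh6yWPxjSoFI9D5JIMgKw=
    print(info.xsec_source)  # pc_search
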
@@ -10,3 +10,12 @@
 
 
 # -*- coding: utf-8 -*-
+
+
+from pydantic import BaseModel, Field
+
+
+class NoteUrlInfo(BaseModel):
+    note_id: str = Field(title="note id")
+    xsec_token: str = Field(title="xsec token")
+    xsec_source: str = Field(title="xsec source")
@@ -18,6 +18,8 @@ import base64
 import json
 import random
 import re
+import urllib
+import urllib.parse
 from io import BytesIO
 from typing import Dict, List, Optional, Tuple
 
@@ -192,3 +194,12 @@ def extract_text_from_html(html: str) -> str:
     # Remove all other tags
     clean_text = re.sub(r'<[^>]+>', '', clean_html).strip()
     return clean_text
+
+
+def extract_url_params_to_dict(url: str) -> Dict:
+    """Extract URL parameters to dict"""
+    url_params_dict = dict()
+    if not url:
+        return url_params_dict
+    parsed_url = urllib.parse.urlparse(url)
+    url_params_dict = dict(urllib.parse.parse_qsl(parsed_url.query))
+    return url_params_dict
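
Note: behaviour sketch of the new helper under a few inputs (the import path matches the `from tools.crawler_util import extract_url_params_to_dict` added to help.py above; the example URLs are placeholders):

    from tools.crawler_util import extract_url_params_to_dict

    extract_url_params_to_dict("https://example.com/x?a=1&b=2")  # {'a': '1', 'b': '2'}
    extract_url_params_to_dict("https://example.com/x")          # {}
    extract_url_params_to_dict("")                               # {}
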