fix: 评论移除html标签内容
This commit is contained in:
parent
026d81e131
commit
1208682a9a
|
@ -8,6 +8,7 @@ from parsel import Selector
|
||||||
|
|
||||||
from model.m_baidu_tieba import TiebaNote, TiebaComment
|
from model.m_baidu_tieba import TiebaNote, TiebaComment
|
||||||
from constant import baidu_tieba as const
|
from constant import baidu_tieba as const
|
||||||
|
from tools import utils
|
||||||
|
|
||||||
|
|
||||||
class TieBaExtractor:
|
class TieBaExtractor:
|
||||||
|
@ -105,7 +106,7 @@ class TieBaExtractor:
|
||||||
tieba_comment = TiebaComment(
|
tieba_comment = TiebaComment(
|
||||||
comment_id=str(comment_field_value.get("content").get("post_id")),
|
comment_id=str(comment_field_value.get("content").get("post_id")),
|
||||||
sub_comment_count=comment_field_value.get("content").get("comment_num"),
|
sub_comment_count=comment_field_value.get("content").get("comment_num"),
|
||||||
content=comment_field_value.get("content").get("content"),
|
content=utils.extract_text_from_html(comment_field_value.get("content").get("content")),
|
||||||
note_url=const.TIEBA_URL + f"/p/{note_id}",
|
note_url=const.TIEBA_URL + f"/p/{note_id}",
|
||||||
user_link=const.TIEBA_URL + comment_selector.xpath(".//a[@class='p_author_face ']/@href").get(default='').strip(),
|
user_link=const.TIEBA_URL + comment_selector.xpath(".//a[@class='p_author_face ']/@href").get(default='').strip(),
|
||||||
user_nickname=comment_selector.xpath(".//a[@class='p_author_name j_user_card']/text()").get(
|
user_nickname=comment_selector.xpath(".//a[@class='p_author_name j_user_card']/text()").get(
|
||||||
|
@ -117,7 +118,6 @@ class TieBaExtractor:
|
||||||
publish_time=publish_time,
|
publish_time=publish_time,
|
||||||
note_id=note_id,
|
note_id=note_id,
|
||||||
)
|
)
|
||||||
print(tieba_comment.model_dump())
|
|
||||||
result.append(tieba_comment)
|
result.append(tieba_comment)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
|
@ -147,3 +147,11 @@ def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optio
|
||||||
f"{ip_proxy_info.protocol}": f"http://{ip_proxy_info.user}:{ip_proxy_info.password}@{ip_proxy_info.ip}:{ip_proxy_info.port}"
|
f"{ip_proxy_info.protocol}": f"http://{ip_proxy_info.user}:{ip_proxy_info.password}@{ip_proxy_info.ip}:{ip_proxy_info.port}"
|
||||||
}
|
}
|
||||||
return playwright_proxy, httpx_proxy
|
return playwright_proxy, httpx_proxy
|
||||||
|
|
||||||
|
def extract_text_from_html(html: str) -> str:
    """Return the plain text of an HTML fragment with all markup removed.

    ``<script>`` and ``<style>`` elements are dropped together with their
    contents; every other tag is removed while the text it wraps is kept.

    Args:
        html: HTML fragment to clean. ``None`` or an empty string is
            accepted and yields ``""``.

    Returns:
        The remaining text with leading and trailing whitespace stripped.
    """
    # Callers may hand over the result of a dict ``.get()``, which is None
    # when the key is missing; re.sub() would raise TypeError on None, so a
    # missing/empty value is treated as empty text.
    if not html:
        return ""
    # Remove script and style elements including their bodies. Tag names are
    # case-insensitive in HTML and a closing tag may carry trailing
    # whitespace, so match <SCRIPT>...</script > as well.
    clean_html = re.sub(
        r'<(script|style)[^>]*>.*?</\1\s*>',
        '',
        html,
        flags=re.DOTALL | re.IGNORECASE,
    )
    # Remove all other tags, keeping the text between them.
    return re.sub(r'<[^>]+>', '', clean_html).strip()
|
||||||
|
|
Loading…
Reference in New Issue