fix: 评论移除html标签内容

This commit is contained in:
Relakkes 2024-08-07 02:39:50 +08:00
parent 026d81e131
commit 1208682a9a
2 changed files with 11 additions and 3 deletions

View File

@ -8,6 +8,7 @@ from parsel import Selector
from model.m_baidu_tieba import TiebaNote, TiebaComment from model.m_baidu_tieba import TiebaNote, TiebaComment
from constant import baidu_tieba as const from constant import baidu_tieba as const
from tools import utils
class TieBaExtractor: class TieBaExtractor:
@ -105,7 +106,7 @@ class TieBaExtractor:
tieba_comment = TiebaComment( tieba_comment = TiebaComment(
comment_id=str(comment_field_value.get("content").get("post_id")), comment_id=str(comment_field_value.get("content").get("post_id")),
sub_comment_count=comment_field_value.get("content").get("comment_num"), sub_comment_count=comment_field_value.get("content").get("comment_num"),
content=comment_field_value.get("content").get("content"), content=utils.extract_text_from_html(comment_field_value.get("content").get("content")),
note_url=const.TIEBA_URL + f"/p/{note_id}", note_url=const.TIEBA_URL + f"/p/{note_id}",
user_link=const.TIEBA_URL + comment_selector.xpath(".//a[@class='p_author_face ']/@href").get(default='').strip(), user_link=const.TIEBA_URL + comment_selector.xpath(".//a[@class='p_author_face ']/@href").get(default='').strip(),
user_nickname=comment_selector.xpath(".//a[@class='p_author_name j_user_card']/text()").get( user_nickname=comment_selector.xpath(".//a[@class='p_author_name j_user_card']/text()").get(
@ -117,7 +118,6 @@ class TieBaExtractor:
publish_time=publish_time, publish_time=publish_time,
note_id=note_id, note_id=note_id,
) )
print(tieba_comment.model_dump())
result.append(tieba_comment) result.append(tieba_comment)
return result return result

View File

@ -146,4 +146,12 @@ def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optio
httpx_proxy = { httpx_proxy = {
f"{ip_proxy_info.protocol}": f"http://{ip_proxy_info.user}:{ip_proxy_info.password}@{ip_proxy_info.ip}:{ip_proxy_info.port}" f"{ip_proxy_info.protocol}": f"http://{ip_proxy_info.user}:{ip_proxy_info.password}@{ip_proxy_info.ip}:{ip_proxy_info.port}"
} }
return playwright_proxy, httpx_proxy return playwright_proxy, httpx_proxy
def extract_text_from_html(html: str) -> str:
    """Return the plain text of an HTML fragment with all markup removed.

    ``<script>`` and ``<style>`` elements are dropped together with their
    contents, every remaining tag is stripped, HTML character references
    (``&amp;``, ``&nbsp;``, ``&#39;`` ...) are decoded, and leading/trailing
    whitespace is trimmed.

    Args:
        html: The HTML fragment to clean. ``None`` or an empty string is
            tolerated and yields an empty string instead of raising.

    Returns:
        The text content of ``html``.
    """
    # The parameter name shadows the stdlib ``html`` module, so import the
    # one helper we need under its own name.
    from html import unescape

    # Upstream payloads may carry no content at all; treat that as "no text".
    if not html:
        return ""

    # Drop script/style elements including their bodies. Tag names are
    # case-insensitive in HTML, ``\b`` keeps tags such as <styled> from being
    # mistaken for <style>, and ``\s*`` allows a closing tag like </script >.
    without_blocks = re.sub(
        r'<(script|style)\b[^>]*>.*?</\1\s*>',
        '',
        html,
        flags=re.DOTALL | re.IGNORECASE,
    )
    # Strip every remaining tag, keeping only the text between them.
    without_tags = re.sub(r'<[^>]+>', '', without_blocks)
    # Decode entities only after tag removal so that escaped markup such as
    # "&lt;b&gt;" survives as literal text rather than being stripped as a tag.
    return unescape(without_tags).strip()