From 1208682a9a844b0d22cb67eb79386a2244c25c93 Mon Sep 17 00:00:00 2001
From: Relakkes <relakkes@gmail.com>
Date: Wed, 7 Aug 2024 02:39:50 +0800
Subject: [PATCH] =?UTF-8?q?fix:=20=E8=AF=84=E8=AE=BA=E7=A7=BB=E9=99=A4html?=
 =?UTF-8?q?=E6=A0=87=E7=AD=BE=E5=86=85=E5=AE=B9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 media_platform/tieba/help.py |  4 ++--
 tools/crawler_util.py        | 10 +++++++++-
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/media_platform/tieba/help.py b/media_platform/tieba/help.py
index 2297855..1225e7a 100644
--- a/media_platform/tieba/help.py
+++ b/media_platform/tieba/help.py
@@ -8,6 +8,7 @@ from parsel import Selector
 
 from model.m_baidu_tieba import TiebaNote, TiebaComment
 from constant import baidu_tieba as const
+from tools import utils
 
 
 class TieBaExtractor:
@@ -105,7 +106,7 @@ class TieBaExtractor:
             tieba_comment = TiebaComment(
                 comment_id=str(comment_field_value.get("content").get("post_id")),
                 sub_comment_count=comment_field_value.get("content").get("comment_num"),
-                content=comment_field_value.get("content").get("content"),
+                content=utils.extract_text_from_html(comment_field_value.get("content").get("content")),
                 note_url=const.TIEBA_URL + f"/p/{note_id}",
                 user_link=const.TIEBA_URL + comment_selector.xpath(".//a[@class='p_author_face ']/@href").get(default='').strip(),
                 user_nickname=comment_selector.xpath(".//a[@class='p_author_name j_user_card']/text()").get(
@@ -117,7 +118,6 @@ class TieBaExtractor:
                 publish_time=publish_time,
                 note_id=note_id,
             )
-            print(tieba_comment.model_dump())
             result.append(tieba_comment)
         return result
 
diff --git a/tools/crawler_util.py b/tools/crawler_util.py
index 8e37881..9c54f2a 100644
--- a/tools/crawler_util.py
+++ b/tools/crawler_util.py
@@ -146,4 +146,12 @@ def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optio
     httpx_proxy = {
         f"{ip_proxy_info.protocol}": f"http://{ip_proxy_info.user}:{ip_proxy_info.password}@{ip_proxy_info.ip}:{ip_proxy_info.port}"
     }
-    return playwright_proxy, httpx_proxy
\ No newline at end of file
+    return playwright_proxy, httpx_proxy
+
+def extract_text_from_html(html: str) -> str:
+    """Return *html* as plain text: script/style elements and all tags removed; empty or None input gives ''."""
+    if not html:
+        return ""
+    # Drop script/style elements with their contents; IGNORECASE because HTML tag names are case-insensitive.
+    clean_html = re.sub(r'<(script|style)[^>]*>.*?</\1>', '', html, flags=re.DOTALL | re.IGNORECASE)
+    return re.sub(r'<[^>]+>', '', clean_html).strip()