From 1208682a9a844b0d22cb67eb79386a2244c25c93 Mon Sep 17 00:00:00 2001 From: Relakkes Date: Wed, 7 Aug 2024 02:39:50 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20=E8=AF=84=E8=AE=BA=E7=A7=BB=E9=99=A4html?= =?UTF-8?q?=E6=A0=87=E7=AD=BE=E5=86=85=E5=AE=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- media_platform/tieba/help.py | 4 ++-- tools/crawler_util.py | 10 +++++++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/media_platform/tieba/help.py b/media_platform/tieba/help.py index 2297855..1225e7a 100644 --- a/media_platform/tieba/help.py +++ b/media_platform/tieba/help.py @@ -8,6 +8,7 @@ from parsel import Selector from model.m_baidu_tieba import TiebaNote, TiebaComment from constant import baidu_tieba as const +from tools import utils class TieBaExtractor: @@ -105,7 +106,7 @@ class TieBaExtractor: tieba_comment = TiebaComment( comment_id=str(comment_field_value.get("content").get("post_id")), sub_comment_count=comment_field_value.get("content").get("comment_num"), - content=comment_field_value.get("content").get("content"), + content=utils.extract_text_from_html(comment_field_value.get("content").get("content")), note_url=const.TIEBA_URL + f"/p/{note_id}", user_link=const.TIEBA_URL + comment_selector.xpath(".//a[@class='p_author_face ']/@href").get(default='').strip(), user_nickname=comment_selector.xpath(".//a[@class='p_author_name j_user_card']/text()").get( @@ -117,7 +118,6 @@ class TieBaExtractor: publish_time=publish_time, note_id=note_id, ) - print(tieba_comment.model_dump()) result.append(tieba_comment) return result diff --git a/tools/crawler_util.py b/tools/crawler_util.py index 8e37881..9c54f2a 100644 --- a/tools/crawler_util.py +++ b/tools/crawler_util.py @@ -146,4 +146,12 @@ def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optio httpx_proxy = { f"{ip_proxy_info.protocol}": f"http://{ip_proxy_info.user}:{ip_proxy_info.password}@{ip_proxy_info.ip}:{ip_proxy_info.port}" } - return playwright_proxy, httpx_proxy \ No newline at end of file + return playwright_proxy, httpx_proxy + +def extract_text_from_html(html: str) -> str: + """Extract text from HTML, removing all tags.""" + # Remove script and style elements + clean_html = re.sub(r'<(script|style)[^>]*>.*?', '', html, flags=re.DOTALL) + # Remove all other tags + clean_text = re.sub(r'<[^>]+>', '', clean_html).strip() + return clean_text