# -*- coding: utf-8 -*-
import html
import json
import re
from typing import Dict, List, Tuple
from parsel import Selector
from constant import baidu_tieba as const
from model.m_baidu_tieba import TiebaComment, TiebaNote
from tools import utils
class TieBaExtractor:
def __init__(self):
pass
@staticmethod
def extract_search_note_list(page_content: str) -> List[TiebaNote]:
    """
    Parse a keyword-search result page into a list of notes.

    Every ``div`` whose class is ``s_post`` produces one ``TiebaNote``.
    Only the fields read below are filled in; the reply count and
    reply-page data are not populated by this method.

    Args:
        page_content: HTML text of the search result page.

    Returns:
        One ``TiebaNote`` per matched post, in the order the posts
        were matched; an empty list when no post matches.
    """

    def text_of(node, query: str) -> str:
        # First match for the query with surrounding whitespace removed;
        # an empty string when the query matches nothing.
        return node.xpath(query).get(default='').strip()

    def url_of(node, query: str) -> str:
        # First match for the query appended to const.TIEBA_URL. The value
        # is not stripped, and a missing match yields const.TIEBA_URL alone.
        return const.TIEBA_URL + node.xpath(query).get(default='')

    posts = Selector(text=page_content).xpath("//div[@class='s_post']")
    return [
        TiebaNote(
            note_id=text_of(post, ".//span[@class='p_title']/a/@data-tid"),
            title=text_of(post, ".//span[@class='p_title']/a/text()"),
            desc=text_of(post, ".//div[@class='p_content']/text()"),
            note_url=url_of(post, ".//span[@class='p_title']/a/@href"),
            user_nickname=text_of(post, ".//a[starts-with(@href, '/home/main')]/font/text()"),
            user_link=url_of(post, ".//a[starts-with(@href, '/home/main')]/@href"),
            tieba_name=text_of(post, ".//a[@class='p_forum']/font/text()"),
            tieba_link=url_of(post, ".//a[@class='p_forum']/@href"),
            publish_time=text_of(post, ".//font[@class='p_green p_date']/text()"),
        )
        for post in posts
    ]
def extract_tieba_note_list(self, page_content: str) -> List[TiebaNote]:
"""
提取贴吧帖子列表
Args:
page_content:
Returns:
"""
page_content = page_content.replace('