# -*- coding: utf-8 -*-
import html
import json
import re
from typing import Dict, List, Tuple
from urllib.parse import unquote, parse_qs

from parsel import Selector

from constant import baidu_tieba as const
from model.m_baidu_tieba import TiebaComment, TiebaNote, TiebaCreator
from tools import utils

# Gender marker strings. They are not referenced in the visible part of this
# file; presumably they are matched against profile-page markup further down
# (TODO confirm). The "FMALE" spelling is kept because other code may use it.
GENDER_MALE = "sex_male"
GENDER_FMALE = "sex_fmale"


class TieBaExtractor:
    """Extract structured Tieba model objects from raw page HTML."""

    def __init__(self):
        pass

    @staticmethod
    def extract_search_note_list(page_content: str) -> List[TiebaNote]:
        """
        Extract the post list from a keyword-search result page.

        Only the fields read below are filled in; the search result markup is
        not used here to populate reply counts or reply-page information.

        Args:
            page_content: HTML text of the search result page.

        Returns:
            One ``TiebaNote`` per ``<div class="s_post">`` element, in document
            order. Text fields fall back to ``''`` when the element is missing;
            link fields are always prefixed with ``const.TIEBA_URL``, so a
            missing href yields the bare base URL.
        """

        def first(node, query: str) -> str:
            """Return the first match of `query` under `node`, or '' if none."""
            return node.xpath(query).get(default='')

        posts = Selector(text=page_content).xpath("//div[@class='s_post']")

        result: List[TiebaNote] = []
        for post in posts:
            result.append(
                TiebaNote(
                    note_id=first(post, ".//span[@class='p_title']/a/@data-tid").strip(),
                    title=first(post, ".//span[@class='p_title']/a/text()").strip(),
                    desc=first(post, ".//div[@class='p_content']/text()").strip(),
                    # hrefs are concatenated as-is (not stripped) onto the base URL.
                    note_url=const.TIEBA_URL + first(post, ".//span[@class='p_title']/a/@href"),
                    user_nickname=first(post, ".//a[starts-with(@href, '/home/main')]/font/text()").strip(),
                    user_link=const.TIEBA_URL + first(post, ".//a[starts-with(@href, '/home/main')]/@href"),
                    tieba_name=first(post, ".//a[@class='p_forum']/font/text()").strip(),
                    tieba_link=const.TIEBA_URL + first(post, ".//a[@class='p_forum']/@href"),
                    publish_time=first(post, ".//font[@class='p_green p_date']/text()").strip(),
                )
            )
        return result

    # NOTE(review): the visible source ends in the middle of the next method,
    # inside an unterminated string literal. The fragment is preserved below as
    # a comment; the remainder of its body is not visible here and must be
    # restored from the original file rather than guessed.
    #
    # def extract_tieba_note_list(self, page_content: str) -> List[TiebaNote]:
    #     """
    #     Extract the Tieba post list.
    #     Args:
    #         page_content:
    #     Returns:
    #     """
    #     page_content = page_content.replace('