From df0f5c1113099daa8aafc63dcbc725270c55a4ca Mon Sep 17 00:00:00 2001 From: Relakkes Date: Wed, 7 Aug 2024 04:13:15 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E7=99=BE=E5=BA=A6=E8=B4=B4=E5=90=A7?= =?UTF-8?q?=E5=AD=90=E8=AF=84=E8=AE=BAdone?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- config/base_config.py | 4 +- media_platform/tieba/client.py | 53 ++++- media_platform/tieba/help.py | 74 ++++++- .../tieba/test_data/note_sub_comments.html | 189 ++++++++++++++++++ model/m_baidu_tieba.py | 4 +- schema/tables.sql | 45 +++-- 6 files changed, 328 insertions(+), 41 deletions(-) create mode 100644 media_platform/tieba/test_data/note_sub_comments.html diff --git a/config/base_config.py b/config/base_config.py index 96d87b1..53dc8bf 100644 --- a/config/base_config.py +++ b/config/base_config.py @@ -1,6 +1,6 @@ # 基础配置 PLATFORM = "xhs" -KEYWORDS = "缅甸边境,缅北边境,缅北边境线,缅甸边境线" +KEYWORDS = "编程副业,编程兼职" LOGIN_TYPE = "qrcode" # qrcode or phone or cookie COOKIES = "" # 具体值参见media_platform.xxx.field下的枚举值,暂时只支持小红书 @@ -50,7 +50,7 @@ ENABLE_GET_COMMENTS = True # 是否开启爬二级评论模式, 默认不开启爬二级评论 # 老版本项目使用了 db, 则需参考 schema/tables.sql line 287 增加表字段 -ENABLE_GET_SUB_COMMENTS = False +ENABLE_GET_SUB_COMMENTS = True # 指定小红书需要爬虫的笔记ID列表 XHS_SPECIFIED_ID_LIST = [ diff --git a/media_platform/tieba/client.py b/media_platform/tieba/client.py index 2aa6cde..2ae4304 100644 --- a/media_platform/tieba/client.py +++ b/media_platform/tieba/client.py @@ -28,7 +28,10 @@ class BaiduTieBaClient(AbstractApiClient): ): self.ip_pool: Optional[ProxyIpPool] = ip_pool self.timeout = timeout - self.headers = utils.get_user_agent() + self.headers = { + "User-Agent": utils.get_user_agent(), + "Cookies": "", + } self._host = "https://tieba.baidu.com" self._page_extractor = TieBaExtractor() self.default_ip_proxy = default_ip_proxy @@ -51,7 +54,7 @@ class BaiduTieBaClient(AbstractApiClient): async with httpx.AsyncClient(proxies=actual_proxies) as client: response = await client.request( method, url, timeout=self.timeout, - **kwargs + headers=self.headers, **kwargs ) if response.status_code != 200: @@ -99,7 +102,7 @@ class BaiduTieBaClient(AbstractApiClient): self.default_ip_proxy = proxies return res - utils.logger.error(f"[BaiduTieBaClient.get] 达到了最大重试次数,请尝试更换新的IP代理: {e}") + utils.logger.error(f"[BaiduTieBaClient.get] 达到了最大重试次数,IP已经被Block,请尝试更换新的IP代理: {e}") raise e async def post(self, uri: str, data: dict, **kwargs) -> Dict: @@ -154,7 +157,6 @@ class BaiduTieBaClient(AbstractApiClient): page_size: int = 10, sort: SearchSortType = SearchSortType.TIME_DESC, note_type: SearchNoteType = SearchNoteType.FIXED_THREAD, - random_sleep: bool = True ) -> List[TiebaNote]: """ 根据关键词搜索贴吧帖子 @@ -164,8 +166,6 @@ class BaiduTieBaClient(AbstractApiClient): page_size: 每页大小 sort: 结果排序方式 note_type: 帖子类型(主题贴|主题+回复混合模式) - random_sleep: 是否随机休眠 - Returns: """ @@ -179,8 +179,6 @@ class BaiduTieBaClient(AbstractApiClient): "only_thread": note_type.value } page_content = await self.get(uri, params=params, return_ori_content=True) - if random_sleep: - random.randint(1, 5) return self._page_extractor.extract_search_note_list(page_content) async def get_note_by_id(self, note_id: str) -> TiebaNote: @@ -216,17 +214,20 @@ class BaiduTieBaClient(AbstractApiClient): "pn": current_page } page_content = await self.get(uri, params=params, return_ori_content=True) - comments = self._page_extractor.extract_tieba_note_parment_comments(page_content, note_id=note_detail.note_id) + comments = self._page_extractor.extract_tieba_note_parment_comments(page_content, + note_id=note_detail.note_id) if not comments: break if callback: await callback(note_detail.note_id, comments) result.extend(comments) + # 获取所有子评论 + await self.get_comments_all_sub_comments(comments, crawl_interval=crawl_interval, callback=callback) await asyncio.sleep(crawl_interval) current_page += 1 return result - async def get_comments_all_sub_comments(self, comments: List[Dict], crawl_interval: float = 1.0, + async def get_comments_all_sub_comments(self, comments: List[TiebaComment], crawl_interval: float = 1.0, callback: Optional[Callable] = None) -> List[TiebaComment]: """ 获取指定评论下的所有子评论 @@ -238,7 +239,37 @@ class BaiduTieBaClient(AbstractApiClient): Returns: """ + uri = "/p/comment" if not config.ENABLE_GET_SUB_COMMENTS: return [] - # todo 未完成子评论的爬取 + # # 贴吧获取所有子评论需要登录态 + # if self.headers.get("Cookies") == "" or not self.pong(): + # raise Exception(f"[BaiduTieBaClient.pong] Cookies is empty, please login first...") + + all_sub_comments: List[TiebaComment] = [] + for comment in comments: + if comment.sub_comment_count == 0: + continue + + current_page = 1 + max_sub_page_num = comment.sub_comment_count // 10 + 1 + while max_sub_page_num >= current_page: + params = { + "tid": comment.note_id, # 帖子ID + "pid": comment.comment_id, # 父级评论ID + "fid": comment.tieba_id, # 贴吧ID + "pn": current_page # 页码 + } + page_content = await self.get(uri, params=params, return_ori_content=True) + sub_comments = self._page_extractor.extract_tieba_note_sub_comments(page_content, + parent_comment=comment) + + if not sub_comments: + break + if callback: + await callback(comment.note_id, sub_comments) + all_sub_comments.extend(sub_comments) + await asyncio.sleep(crawl_interval) + current_page += 1 + return all_sub_comments diff --git a/media_platform/tieba/help.py b/media_platform/tieba/help.py index 1225e7a..b46081d 100644 --- a/media_platform/tieba/help.py +++ b/media_platform/tieba/help.py @@ -100,7 +100,7 @@ class TieBaExtractor: comment_field_value: Dict = self.extract_data_field_value(comment_selector) if not comment_field_value: continue - + tieba_name = comment_selector.xpath("//a[@class='card_title_fname']/text()").get(default='').strip() other_info_content = comment_selector.xpath(".//div[@class='post-tail-wrap']").get(default="").strip() ip_location, publish_time = self.extract_ip_and_pub_time(other_info_content) tieba_comment = TiebaComment( @@ -108,12 +108,15 @@ class TieBaExtractor: sub_comment_count=comment_field_value.get("content").get("comment_num"), content=utils.extract_text_from_html(comment_field_value.get("content").get("content")), note_url=const.TIEBA_URL + f"/p/{note_id}", - user_link=const.TIEBA_URL + comment_selector.xpath(".//a[@class='p_author_face ']/@href").get(default='').strip(), + user_link=const.TIEBA_URL + comment_selector.xpath(".//a[@class='p_author_face ']/@href").get( + default='').strip(), user_nickname=comment_selector.xpath(".//a[@class='p_author_name j_user_card']/text()").get( default='').strip(), user_avatar=comment_selector.xpath(".//a[@class='p_author_face ']/img/@src").get( default='').strip(), - tieba_name=comment_selector.xpath("//a[@class='card_title_fname']/text()").get(default='').strip(), + tieba_id=str(comment_field_value.get("content").get("forum_id", "")), + tieba_name=tieba_name, + tieba_link=f"https://tieba.baidu.com/f?kw={tieba_name}", ip_location=ip_location, publish_time=publish_time, note_id=note_id, @@ -121,6 +124,45 @@ class TieBaExtractor: result.append(tieba_comment) return result + + def extract_tieba_note_sub_comments(self,page_content: str, parent_comment: TiebaComment) -> List[TiebaComment]: + """ + 提取贴吧帖子二级评论 + Args: + page_content: + parent_comment: + + Returns: + + """ + selector = Selector(page_content) + comments = [] + comment_ele_list = selector.xpath("//li[@class='lzl_single_post j_lzl_s_p first_no_border']") + comment_ele_list.extend(selector.xpath("//li[@class='lzl_single_post j_lzl_s_p ']")) + for comment_ele in comment_ele_list: + comment_value = self.extract_data_field_value(comment_ele) + if not comment_value: + continue + comment_user_a_selector = comment_ele.xpath("./a[@class='j_user_card lzl_p_p']")[0] + content = utils.extract_text_from_html(comment_ele.xpath(".//span[@class='lzl_content_main']").get(default="")) + comment = TiebaComment( + comment_id=str(comment_value.get("spid")), + content=content, + user_link=comment_user_a_selector.xpath("./@href").get(default=""), + user_nickname=comment_value.get("showname"), + user_avatar=comment_user_a_selector.xpath("./img/@src").get(default=""), + publish_time=comment_ele.xpath(".//span[@class='lzl_time']/text()").get(default="").strip(), + parent_comment_id=parent_comment.comment_id, + note_id=parent_comment.note_id, + note_url=parent_comment.note_url, + tieba_id=parent_comment.tieba_id, + tieba_name=parent_comment.tieba_name, + tieba_link=parent_comment.tieba_link + ) + comments.append(comment) + + return comments + @staticmethod def extract_ip_and_pub_time(html_content: str) -> Tuple[str, str]: """ @@ -162,8 +204,6 @@ class TieBaExtractor: return data_field_dict_value - - def test_extract_search_note_list(): with open("test_data/search_keyword_notes.html", "r", encoding="utf-8") as f: content = f.read() @@ -179,6 +219,7 @@ def test_extract_note_detail(): result = extractor.extract_note_detail(content) print(result.model_dump()) + def test_extract_tieba_note_parment_comments(): with open("test_data/note_comments.html", "r", encoding="utf-8") as f: content = f.read() @@ -186,7 +227,28 @@ def test_extract_tieba_note_parment_comments(): result = extractor.extract_tieba_note_parment_comments(content, "123456") print(result) +def test_extract_tieba_note_sub_comments(): + with open("test_data/note_sub_comments.html", "r", encoding="utf-8") as f: + content = f.read() + extractor = TieBaExtractor() + fake_parment_comment = TiebaComment( + comment_id="123456", + content="content", + user_link="user_link", + user_nickname="user_nickname", + user_avatar="user_avatar", + publish_time="publish_time", + parent_comment_id="parent_comment_id", + note_id="note_id", + note_url="note_url", + tieba_id="tieba_id", + tieba_name="tieba_name", + ) + result = extractor.extract_tieba_note_sub_comments(content,fake_parment_comment) + print(result) + if __name__ == '__main__': # test_extract_search_note_list() # test_extract_note_detail() - test_extract_tieba_note_parment_comments() + # test_extract_tieba_note_parment_comments() + test_extract_tieba_note_sub_comments() diff --git a/media_platform/tieba/test_data/note_sub_comments.html b/media_platform/tieba/test_data/note_sub_comments.html new file mode 100644 index 0000000..a8fe3eb --- /dev/null +++ b/media_platform/tieba/test_data/note_sub_comments.html @@ -0,0 +1,189 @@ +
  • + + + + +
    + heinzfrentzen + : + + + + +
    + + + 2024-8-6 22:11 + 回复 +
    +
    +
  • +
  • + + + + +
    + 可爱的搬运工94 + :陈芋汐水花也不小 +
    + + + 2024-8-6 22:12 + 回复 +
    +
    +
  • +
  • + + + + +
    + 国际体坛巨星青椒肉丝 + :你怀孕了吗 老是呕吐 +
    + + + 2024-8-6 22:12 + 回复 +
    +
    +
  • +
  • + + + + +
    + 茗花少帅 + :你就只看水花,不看空中姿态吗 +
    + + + 2024-8-6 22:12 + 回复 +
    +
    +
  • +
  • + + + + +
    + 东华武兰 + :经典只看水花 +
    + + + 2024-8-6 22:12 + 回复 +
    +
    +
  • +
  • + + + + +
    + 上下班要注意 + :额,分数正常吧 +
    + + + 2024-8-6 22:13 + 回复 +
    +
    +
  • +
  • + + + + +
    + 静看蚂蚁上树 + : + + 回复 国际体坛巨星青椒肉丝 + :吃酸黄瓜吃多了 + + + +
    + + + 2024-8-6 22:14 + 回复 +
    +
    +
  • +
  • + + + + +
    + 不懂取啥名字😜 + : + + 请你去跟国际泳联投诉 + +
    + + + 2024-8-6 22:15 + 回复 +
    +
    +
  • +
  • + + + + +
    + 💫泽赫拉💯 + :第五跳陈空中分腿了,空中姿态明显全红婵更好 +
    + + + 2024-8-6 22:17 + 回复 +
    +
    +
  • +
  • + + + + +
    + 嗯嗯哦哦啊啊🐶 + : + + 回复 美味蟹黄堡💞 + :你不会看起跳高度和空中姿态? + +
    + + + 2024-8-6 22:17 + 回复 +
    +
    +
  • +
  • + + + 我也说一句 + +

    + 1 + 2 + 下一页 + 尾页 +

    +
  • diff --git a/model/m_baidu_tieba.py b/model/m_baidu_tieba.py index 95b0175..8153000 100644 --- a/model/m_baidu_tieba.py +++ b/model/m_baidu_tieba.py @@ -28,7 +28,7 @@ class TiebaComment(BaseModel): """ comment_id: str = Field(..., description="评论ID") - parment_comment_id: str = Field(default="", description="父评论ID") + parent_comment_id: str = Field(default="", description="父评论ID") content: str = Field(..., description="评论内容") user_link: str = Field(default="", description="用户主页链接") user_nickname: str = Field(default="", description="用户昵称") @@ -38,5 +38,7 @@ class TiebaComment(BaseModel): sub_comment_count: int = Field(default=0, description="子评论数") note_id: str = Field(..., description="帖子ID") note_url: str = Field(..., description="帖子链接") + tieba_id: str = Field(..., description="所属的贴吧ID") tieba_name: str = Field(..., description="所属的贴吧名称") + tieba_link: str = Field(..., description="贴吧链接") diff --git a/schema/tables.sql b/schema/tables.sql index c5737f9..3fc72da 100644 --- a/schema/tables.sql +++ b/schema/tables.sql @@ -359,9 +359,10 @@ CREATE TABLE tieba_note `desc` TEXT COMMENT '帖子描述', note_url VARCHAR(255) NOT NULL COMMENT '帖子链接', publish_time VARCHAR(255) NOT NULL COMMENT '发布时间', - user_link VARCHAR(255) NOT NULL COMMENT '用户主页链接', - user_nickname VARCHAR(255) NOT NULL COMMENT '用户昵称', - user_avatar VARCHAR(255) NOT NULL COMMENT '用户头像地址', + user_link VARCHAR(255) DEFAULT '' COMMENT '用户主页链接', + user_nickname VARCHAR(255) DEFAULT '' COMMENT '用户昵称', + user_avatar VARCHAR(255) DEFAULT '' COMMENT '用户头像地址', + tieba_id VARCHAR(255) DEFAULT '' COMMENT '贴吧ID', tieba_name VARCHAR(255) NOT NULL COMMENT '贴吧名称', tieba_link VARCHAR(255) NOT NULL COMMENT '贴吧链接', total_replay_num INT DEFAULT 0 COMMENT '帖子回复总数', @@ -376,22 +377,24 @@ CREATE TABLE tieba_note DROP TABLE IF EXISTS `tieba_comment`; CREATE TABLE tieba_comment ( - id BIGINT AUTO_INCREMENT PRIMARY KEY, - comment_id VARCHAR(255) NOT NULL COMMENT '评论ID', - parment_comment_id VARCHAR(255) DEFAULT '' COMMENT '父评论ID', - content TEXT NOT NULL COMMENT '评论内容', - user_link VARCHAR(255) DEFAULT '' COMMENT '用户主页链接', - user_nickname VARCHAR(255) DEFAULT '' COMMENT '用户昵称', - user_avatar VARCHAR(255) DEFAULT '' COMMENT '用户头像地址', - publish_time VARCHAR(255) DEFAULT '' COMMENT '发布时间', - ip_location VARCHAR(255) DEFAULT '' COMMENT 'IP地理位置', - sub_comment_count INT DEFAULT 0 COMMENT '子评论数', - note_id VARCHAR(255) NOT NULL COMMENT '帖子ID', - note_url VARCHAR(255) NOT NULL COMMENT '帖子链接', - tieba_name VARCHAR(255) NOT NULL COMMENT '所属的贴吧名称', - add_ts BIGINT NOT NULL COMMENT '添加时间戳', - last_modify_ts BIGINT NOT NULL COMMENT '最后修改时间戳', - KEY `idx_tieba_comment_comment_id` (`note_id`), - KEY `idx_tieba_comment_note_id` (`note_id`), - KEY `idx_tieba_comment_publish_time` (`publish_time`) + id BIGINT AUTO_INCREMENT PRIMARY KEY, + comment_id VARCHAR(255) NOT NULL COMMENT '评论ID', + parent_comment_id VARCHAR(255) DEFAULT '' COMMENT '父评论ID', + content TEXT NOT NULL COMMENT '评论内容', + user_link VARCHAR(255) DEFAULT '' COMMENT '用户主页链接', + user_nickname VARCHAR(255) DEFAULT '' COMMENT '用户昵称', + user_avatar VARCHAR(255) DEFAULT '' COMMENT '用户头像地址', + tieba_id VARCHAR(255) DEFAULT '' COMMENT '贴吧ID', + tieba_name VARCHAR(255) NOT NULL COMMENT '贴吧名称', + tieba_link VARCHAR(255) NOT NULL COMMENT '贴吧链接', + publish_time VARCHAR(255) DEFAULT '' COMMENT '发布时间', + ip_location VARCHAR(255) DEFAULT '' COMMENT 'IP地理位置', + sub_comment_count INT DEFAULT 0 COMMENT '子评论数', + note_id VARCHAR(255) NOT NULL COMMENT '帖子ID', + note_url VARCHAR(255) NOT NULL COMMENT '帖子链接', + add_ts BIGINT NOT NULL COMMENT '添加时间戳', + last_modify_ts BIGINT NOT NULL COMMENT '最后修改时间戳', + KEY `idx_tieba_comment_comment_id` (`note_id`), + KEY `idx_tieba_comment_note_id` (`note_id`), + KEY `idx_tieba_comment_publish_time` (`publish_time`) ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='贴吧评论表'; \ No newline at end of file