diff --git a/config/base_config.py b/config/base_config.py
index 08dd421..2985d40 100644
--- a/config/base_config.py
+++ b/config/base_config.py
@@ -1,6 +1,6 @@
 # Base configuration
 PLATFORM = "xhs"
-KEYWORDS = "编程副业,编程兼职"
+KEYWORDS = "缅甸边境,缅北边境,缅北边境线,缅甸边境线"
 LOGIN_TYPE = "qrcode"  # qrcode or phone or cookie
 COOKIES = ""
 # For concrete values, see the enums under media_platform.xxx.field; currently only XiaoHongShu is supported
@@ -28,7 +28,7 @@ HEADLESS = False
 SAVE_LOGIN_STATE = True
 
 # Data storage option, three types are supported: csv, db, json
-SAVE_DATA_OPTION = "json"  # csv or db or json
+SAVE_DATA_OPTION = "db"  # csv or db or json
 
 # Browser user-data (cache) directory configuration
 USER_DATA_DIR = "%s_user_data_dir"  # %s will be replaced by platform name
@@ -37,7 +37,7 @@ USER_DATA_DIR = "%s_user_data_dir"  # %s will be replaced by platform name
 START_PAGE = 1
 
 # Maximum number of videos/posts to crawl
-CRAWLER_MAX_NOTES_COUNT = 20
+CRAWLER_MAX_NOTES_COUNT = 100
 
 # Number of concurrent crawler tasks
 MAX_CONCURRENCY_NUM = 1
diff --git a/media_platform/tieba/client.py b/media_platform/tieba/client.py
index a02e243..f9e0375 100644
--- a/media_platform/tieba/client.py
+++ b/media_platform/tieba/client.py
@@ -1,3 +1,4 @@
+import asyncio
 import json
 import random
 from typing import Any, Callable, Dict, List, Optional, Union
@@ -188,6 +189,9 @@ class BaiduTieBaClient(AbstractApiClient):
 
         Returns:
 
         """
+        uri = f"/p/{note_id}"
+        page_content = await self.get(uri, return_ori_content=True)
+        return self._page_extractor.extract_note_detail(page_content)
         # todo impl it
         return {}
@@ -203,5 +207,45 @@ class BaiduTieBaClient(AbstractApiClient):
         Returns:
 
         """
-        # todo impl it
-        return []
+        uri = f"/p/{note_id}"
+        result = []
+        comments_has_more = True
+        comments_cursor = 1
+        while comments_has_more:
+            comments_res = await self.get(uri, params={"pn": comments_cursor})
+            comments_has_more = comments_res.get("has_more", False)
+            comments_cursor = comments_res.get("cursor", "")
+            if "comments" not in comments_res:
+                utils.logger.info(
+                    f"[BaiduTieBaClient.get_note_comments] No 'comments' key found in response: {comments_res}")
+                break
+            comments = comments_res["comments"]
+            if callback:
+                await callback(note_id, comments)
+            await asyncio.sleep(crawl_interval)
+            result.extend(comments)
+            sub_comments = await self.get_comments_all_sub_comments(comments, crawl_interval, callback)
+            result.extend(sub_comments)
+        return result
+
+    async def get_comments_all_sub_comments(self, comments: List[Dict], crawl_interval: float = 1.0,
+                                            callback: Optional[Callable] = None) -> List[Dict]:
+        """
+        Get all sub-comments under the given comments
+        Args:
+            comments: list of comments
+            crawl_interval: delay between crawls, in seconds
+            callback: invoked after a note's comments have been crawled
+
+        Returns:
+
+        """
+        result = []
+        for comment in comments:
+            sub_comments = comment.get("comments")
+            if sub_comments:
+                if callback:
+                    await callback(comment.get("id"), sub_comments)
+                await asyncio.sleep(crawl_interval)
+                result.extend(sub_comments)
+        return result
diff --git a/media_platform/tieba/help.py b/media_platform/tieba/help.py
index 59eabdb..2c1144d 100644
--- a/media_platform/tieba/help.py
+++ b/media_platform/tieba/help.py
@@ -32,7 +32,6 @@ class TieBaExtractor:
             author = post.xpath(".//a[starts-with(@href, '/home/main')]/font/text()").get(default='').strip()
             author_link = post.xpath(".//a[starts-with(@href, '/home/main')]/@href").get(default='')
             date = post.xpath(".//font[@class='p_green p_date']/text()").get(default='').strip()
-
             result.append({
                 "note_id": post_id,
                 "title": title,
@@ -47,6 +46,25 @@ class TieBaExtractor:
 
         return result
 
+    @staticmethod
+    def extract_note_detail(page_content: str) -> Dict:
+        """
+        Extract the detail of a Tieba post
+        Args:
+            page_content:
+
+        Returns:
+
+        """
+        content_selector = Selector(text=page_content)
+        # "View thread starter only" link, e.g. only_view_author_link: /p/9117905169?see_lz=1
+        only_view_author_link = content_selector.xpath("//*[@id='lzonly_cntn']/@href").get(default='').strip()
+        note_id = only_view_author_link.split("?")[0].split("/")[-1]
+        title = content_selector.xpath("//*[@id='j_core_title_wrap']//h3/text()").get(default='').strip()
+        desc = content_selector.xpath("//meta[@name='description']/@content").get(default='').strip()
+        note_url = f"/p/{note_id}"
+        return {"note_id": note_id, "title": title, "desc": desc, "note_url": note_url}
+
     @staticmethod
     def extract_tieba_note_comments(page_content: str) -> List[Dict]:
         """
@@ -57,7 +75,24 @@ class TieBaExtractor:
 
         Returns:
 
         """
-        pass
+        xpath_selector = "//div[@id='j_p_postlist']/div[@class='l_post l_post_bright j_l_post clearfix']"
+        comment_list = Selector(text=page_content).xpath(xpath_selector)
+        result = []
+        for comment in comment_list:
+            comment_id = comment.xpath(".//@data-pid").get(default='').strip()
+            author = comment.xpath(".//a[@data-field]/text()").get(default='').strip()
+            author_link = comment.xpath(".//a[@data-field]/@href").get(default='')
+            content = comment.xpath(".//div[@class='d_post_content j_d_post_content ']/text()").get(default='').strip()
+            date = comment.xpath(".//span[@class='tail-info']/text()").get(default='').strip()
+
+            result.append({
+                "comment_id": comment_id,
+                "author": author,
+                "author_link": author_link,
+                "content": content,
+                "time": date,
+            })
+        return result
 
 
 if __name__ == '__main__':
diff --git a/media_platform/tieba/test_data/note_detail_and_comments.html b/media_platform/tieba/test_data/note_detail_and_comments.html
new file mode 100644
index 0000000..132068a
--- /dev/null
+++ b/media_platform/tieba/test_data/note_detail_and_comments.html
@@ -0,0 +1,7558 @@
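A quick way to exercise the new extractor against the fixture added in this patch (a minimal sketch, not part of the patch; it assumes the repository root is on PYTHONPATH and that extract_note_detail returns the dict and extract_tieba_note_comments the list built in help.py above):

# Sketch only: load the new test fixture and run the new TieBaExtractor methods.
from media_platform.tieba.help import TieBaExtractor

with open("media_platform/tieba/test_data/note_detail_and_comments.html", encoding="utf-8") as f:
    page_content = f.read()

extractor = TieBaExtractor()
note = extractor.extract_note_detail(page_content)               # dict with note_id/title/desc/note_url
comments = extractor.extract_tieba_note_comments(page_content)   # list of comment dicts
print(note.get("note_id"), note.get("title"))
print(f"{len(comments)} top-level comments extracted")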