temp commit
parent d347cf5a2c
commit 1b585cb215

@@ -1,6 +1,6 @@
 # Basic configuration
 PLATFORM = "xhs"
-KEYWORDS = "编程副业,编程兼职"
+KEYWORDS = "缅甸边境,缅北边境,缅北边境线,缅甸边境线"
 LOGIN_TYPE = "qrcode"  # qrcode or phone or cookie
 COOKIES = ""
 # For specific values, see the enums under media_platform.xxx.field; only Xiaohongshu is supported for now
@@ -28,7 +28,7 @@ HEADLESS = False
 SAVE_LOGIN_STATE = True

 # Data save type option; three types are supported: csv, db, json
-SAVE_DATA_OPTION = "json"  # csv or db or json
+SAVE_DATA_OPTION = "db"  # csv or db or json

 # Browser user-data (cache) directory configuration
 USER_DATA_DIR = "%s_user_data_dir"  # %s will be replaced by platform name
@@ -37,7 +37,7 @@ USER_DATA_DIR = "%s_user_data_dir"  # %s will be replaced by platform name
 START_PAGE = 1

 # Limit on the number of videos/posts to crawl
-CRAWLER_MAX_NOTES_COUNT = 20
+CRAWLER_MAX_NOTES_COUNT = 100

 # Limit on the number of concurrent crawlers
 MAX_CONCURRENCY_NUM = 1
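As a side note, these are plain module-level constants, so a crawl run can sanity-check them up front. Below is a minimal, self-contained sketch of such a check; the validate_config helper is a hypothetical illustration, not part of the project:

    # Hypothetical pre-flight check for the constants changed in this diff.
    PLATFORM = "xhs"
    KEYWORDS = "缅甸边境,缅北边境,缅北边境线,缅甸边境线"
    SAVE_DATA_OPTION = "db"  # csv or db or json
    CRAWLER_MAX_NOTES_COUNT = 100
    MAX_CONCURRENCY_NUM = 1

    def validate_config() -> None:
        if SAVE_DATA_OPTION not in ("csv", "db", "json"):
            raise ValueError(f"unsupported SAVE_DATA_OPTION: {SAVE_DATA_OPTION}")
        if CRAWLER_MAX_NOTES_COUNT <= 0 or MAX_CONCURRENCY_NUM <= 0:
            raise ValueError("count/concurrency settings must be positive")
        if not [kw for kw in KEYWORDS.split(",") if kw.strip()]:
            raise ValueError("KEYWORDS must contain at least one keyword")

    validate_config()  # passes silently for the values above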
@@ -1,3 +1,4 @@
+import asyncio
 import json
 import random
 from typing import Any, Callable, Dict, List, Optional, Union
@@ -188,6 +189,9 @@ class BaiduTieBaClient(AbstractApiClient):
         Returns:

         """
+        uri = f"/p/{note_id}"
+        page_content = await self.get(uri, return_ori_content=True)
+        return self._page_extractor.extract_note_detail(page_content)
         # todo impl it
         return {}

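These three added lines implement a fetch-then-delegate split: the client downloads the raw HTML (return_ori_content=True) and hands parsing to a dedicated page extractor. A toy sketch of that division of labor follows; the Client and PageExtractor classes here are illustrative stand-ins, not the project's actual types:

    import asyncio
    from typing import Dict

    class PageExtractor:
        # Parsing lives in its own object so HTTP and HTML concerns stay separate.
        def extract_note_detail(self, page_content: str) -> Dict:
            return {"length": len(page_content)}  # placeholder parse

    class Client:
        def __init__(self) -> None:
            self._page_extractor = PageExtractor()

        async def get(self, uri: str, return_ori_content: bool = False) -> str:
            return f"<html>stub page for {uri}</html>"  # stand-in for an HTTP GET

        async def get_note_detail(self, note_id: str) -> Dict:
            page_content = await self.get(f"/p/{note_id}", return_ori_content=True)
            return self._page_extractor.extract_note_detail(page_content)

    print(asyncio.run(Client().get_note_detail("9117905169")))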
@@ -203,5 +207,45 @@ class BaiduTieBaClient(AbstractApiClient):
         Returns:

         """
-        # todo impl it
-        return []
+        uri = f"/p/{note_id}"
+        result = []
+        comments_has_more = True
+        comments_cursor = 1
+        while comments_has_more:
+            comments_res = await self.get(uri, params={"pn": comments_cursor})
+            comments_has_more = comments_res.get("has_more", False)
+            comments_cursor = comments_res.get("cursor", "")
+            if "comments" not in comments_res:
+                utils.logger.info(
+                    f"[BaiduTieBaClient.get_note_all_comments] No 'comments' key found in response: {comments_res}")
+                break
+            comments = comments_res["comments"]
+            if callback:
+                await callback(note_id, comments)
+            await asyncio.sleep(crawl_interval)
+            result.extend(comments)
+            sub_comments = await self.get_comments_all_sub_comments(comments, crawl_interval, callback)
+            result.extend(sub_comments)
+        return result
+
+    async def get_comments_all_sub_comments(self, comments: List[Dict], crawl_interval: float = 1.0,
+                                            callback: Optional[Callable] = None) -> List[Dict]:
+        """
+        Fetch all sub-comments under the given comments
+        Args:
+            comments: list of comments
+            crawl_interval: delay between note crawls, in seconds
+            callback: invoked after one note crawl finishes
+
+        Returns:
+
+        """
+        result = []
+        for comment in comments:
+            sub_comments = comment.get("comments")
+            if sub_comments:
+                if callback:
+                    await callback(comment.get("id"), sub_comments)
+                await asyncio.sleep(crawl_interval)
+                result.extend(sub_comments)
+        return result
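The loop added in get_note_all_comments is a standard paginate-until-exhausted pattern: fetch a page, hand the batch to an optional callback, throttle, then follow the cursor, recursing once into sub-comments. Here is a self-contained sketch of the core pattern, with a stub fetch_page standing in for the HTTP call (the stub and its data are invented for illustration):

    import asyncio
    from typing import Callable, Dict, List, Optional

    async def fetch_page(page: int) -> Dict:
        # Stub in place of a real HTTP request; serves two fake pages.
        pages = {1: {"comments": [{"id": "c1"}], "has_more": True, "cursor": 2},
                 2: {"comments": [{"id": "c2"}], "has_more": False, "cursor": None}}
        return pages[page]

    async def get_all_comments(crawl_interval: float = 0.1,
                               callback: Optional[Callable] = None) -> List[Dict]:
        result: List[Dict] = []
        has_more, cursor = True, 1
        while has_more:
            res = await fetch_page(cursor)
            has_more = res.get("has_more", False)
            comments = res.get("comments", [])
            if not comments:
                break
            if callback:
                await callback(comments)
            await asyncio.sleep(crawl_interval)  # be polite between pages
            result.extend(comments)
            cursor = res.get("cursor")
        return result

    print(asyncio.run(get_all_comments()))  # [{'id': 'c1'}, {'id': 'c2'}]

The diff's version mirrors this shape but re-requests the same uri with a pn parameter and trusts has_more/cursor from the parsed response.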
@@ -32,7 +32,6 @@ class TieBaExtractor:
             author = post.xpath(".//a[starts-with(@href, '/home/main')]/font/text()").get(default='').strip()
             author_link = post.xpath(".//a[starts-with(@href, '/home/main')]/@href").get(default='')
             date = post.xpath(".//font[@class='p_green p_date']/text()").get(default='').strip()
-
             result.append({
                 "note_id": post_id,
                 "title": title,
@@ -47,6 +46,25 @@ class TieBaExtractor:

         return result

+    @staticmethod
+    def extract_note_detail(page_content: str) -> Dict:
+        """
+        Extract Tieba post detail
+        Args:
+            page_content: raw HTML of the post page
+
+        Returns:
+
+        """
+        content_selector = Selector(text=page_content)
+        # Link to view only the thread starter's posts, e.g. only_view_author_link: /p/9117905169?see_lz=1
+        only_view_author_link = content_selector.xpath("//*[@id='lzonly_cntn']/@href").get(default='').strip()
+        note_id = only_view_author_link.split("?")[0].split("/")[-1]
+        title = content_selector.xpath("//*[@id='j_core_title_wrap']/h3/text()").get(default='').strip()
+        desc = content_selector.xpath("//meta[@name='description']/@content").get(default='').strip()
+        note_url = f"/p/{note_id}"
+        return {"note_id": note_id, "title": title, "desc": desc, "note_url": note_url}
+
     @staticmethod
     def extract_tieba_note_comments(page_content: str) -> List[Dict]:
         """
@@ -57,7 +75,24 @@ class TieBaExtractor:
         Returns:

         """
-        pass
+        xpath_selector = "//div[@id='j_p_postlist']/div[@class='l_post l_post_bright j_l_post clearfix']"
+        comment_list = Selector(text=page_content).xpath(xpath_selector)
+        result = []
+        for comment in comment_list:
+            comment_id = comment.xpath(".//@data-pid").get(default='').strip()
+            author = comment.xpath(".//a[@data-field]/text()").get(default='').strip()
+            author_link = comment.xpath(".//a[@data-field]/@href").get(default='')
+            content = comment.xpath(".//div[@class='d_post_content j_d_post_content ']/text()").get(default='').strip()
+            date = comment.xpath(".//span[@class='tail-info']/text()").get(default='').strip()
+
+            result.append({
+                "comment_id": comment_id,
+                "author": author,
+                "author_link": author_link,
+                "content": content,
+                "time": date,
+            })
+        return result

 if __name__ == '__main__':
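The extractor methods above use parsel-style Selector chains (xpath(...).get(default='')). A minimal, runnable illustration of the same pattern follows; the HTML below is invented for the example, not real Tieba markup:

    from parsel import Selector  # pip install parsel

    # Invented markup standing in for a Tieba comment list page.
    html = """
    <div id='j_p_postlist'>
      <div class='l_post' data-pid='123'>
        <a data-field='{}' href='/home/main?un=alice'>alice</a>
        <div class='d_post_content'>hello world</div>
        <span class='tail-info'>2024-07-01 12:00</span>
      </div>
    </div>
    """

    for comment in Selector(text=html).xpath("//div[@id='j_p_postlist']/div[@class='l_post']"):
        print({
            "comment_id": comment.xpath(".//@data-pid").get(default='').strip(),
            "author": comment.xpath(".//a[@data-field]/text()").get(default='').strip(),
            "content": comment.xpath(".//div[@class='d_post_content']/text()").get(default='').strip(),
            "time": comment.xpath(".//span[@class='tail-info']/text()").get(default='').strip(),
        })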
File diff suppressed because one or more lines are too long