feat: 百度贴吧子评论done

This commit is contained in:
Relakkes 2024-08-07 04:13:15 +08:00
parent 1208682a9a
commit df0f5c1113
6 changed files with 328 additions and 41 deletions

View File

@ -1,6 +1,6 @@
# 基础配置
PLATFORM = "xhs"
KEYWORDS = "缅甸边境,缅北边境,缅北边境线,缅甸边境线"
KEYWORDS = "编程副业,编程兼职"
LOGIN_TYPE = "qrcode" # qrcode or phone or cookie
COOKIES = ""
# 具体值参见media_platform.xxx.field下的枚举值暂时只支持小红书
@ -50,7 +50,7 @@ ENABLE_GET_COMMENTS = True
# 是否开启爬二级评论模式, 默认不开启爬二级评论
# 老版本项目使用了 db, 则需参考 schema/tables.sql line 287 增加表字段
ENABLE_GET_SUB_COMMENTS = False
ENABLE_GET_SUB_COMMENTS = True
# 指定小红书需要爬虫的笔记ID列表
XHS_SPECIFIED_ID_LIST = [

View File

@ -28,7 +28,10 @@ class BaiduTieBaClient(AbstractApiClient):
):
self.ip_pool: Optional[ProxyIpPool] = ip_pool
self.timeout = timeout
self.headers = utils.get_user_agent()
self.headers = {
"User-Agent": utils.get_user_agent(),
"Cookies": "",
}
self._host = "https://tieba.baidu.com"
self._page_extractor = TieBaExtractor()
self.default_ip_proxy = default_ip_proxy
@ -51,7 +54,7 @@ class BaiduTieBaClient(AbstractApiClient):
async with httpx.AsyncClient(proxies=actual_proxies) as client:
response = await client.request(
method, url, timeout=self.timeout,
**kwargs
headers=self.headers, **kwargs
)
if response.status_code != 200:
@ -99,7 +102,7 @@ class BaiduTieBaClient(AbstractApiClient):
self.default_ip_proxy = proxies
return res
utils.logger.error(f"[BaiduTieBaClient.get] 达到了最大重试次数,请尝试更换新的IP代理: {e}")
utils.logger.error(f"[BaiduTieBaClient.get] 达到了最大重试次数,IP已经被Block请尝试更换新的IP代理: {e}")
raise e
async def post(self, uri: str, data: dict, **kwargs) -> Dict:
@ -154,7 +157,6 @@ class BaiduTieBaClient(AbstractApiClient):
page_size: int = 10,
sort: SearchSortType = SearchSortType.TIME_DESC,
note_type: SearchNoteType = SearchNoteType.FIXED_THREAD,
random_sleep: bool = True
) -> List[TiebaNote]:
"""
根据关键词搜索贴吧帖子
@ -164,8 +166,6 @@ class BaiduTieBaClient(AbstractApiClient):
page_size: 每页大小
sort: 结果排序方式
note_type: 帖子类型主题贴主题+回复混合模式
random_sleep: 是否随机休眠
Returns:
"""
@ -179,8 +179,6 @@ class BaiduTieBaClient(AbstractApiClient):
"only_thread": note_type.value
}
page_content = await self.get(uri, params=params, return_ori_content=True)
if random_sleep:
random.randint(1, 5)
return self._page_extractor.extract_search_note_list(page_content)
async def get_note_by_id(self, note_id: str) -> TiebaNote:
@ -216,17 +214,20 @@ class BaiduTieBaClient(AbstractApiClient):
"pn": current_page
}
page_content = await self.get(uri, params=params, return_ori_content=True)
comments = self._page_extractor.extract_tieba_note_parment_comments(page_content, note_id=note_detail.note_id)
comments = self._page_extractor.extract_tieba_note_parment_comments(page_content,
note_id=note_detail.note_id)
if not comments:
break
if callback:
await callback(note_detail.note_id, comments)
result.extend(comments)
# 获取所有子评论
await self.get_comments_all_sub_comments(comments, crawl_interval=crawl_interval, callback=callback)
await asyncio.sleep(crawl_interval)
current_page += 1
return result
async def get_comments_all_sub_comments(self, comments: List[Dict], crawl_interval: float = 1.0,
async def get_comments_all_sub_comments(self, comments: List[TiebaComment], crawl_interval: float = 1.0,
callback: Optional[Callable] = None) -> List[TiebaComment]:
"""
获取指定评论下的所有子评论
@ -238,7 +239,37 @@ class BaiduTieBaClient(AbstractApiClient):
Returns:
"""
uri = "/p/comment"
if not config.ENABLE_GET_SUB_COMMENTS:
return []
# todo 未完成子评论的爬取
# # 贴吧获取所有子评论需要登录态
# if self.headers.get("Cookies") == "" or not self.pong():
# raise Exception(f"[BaiduTieBaClient.pong] Cookies is empty, please login first...")
all_sub_comments: List[TiebaComment] = []
for comment in comments:
if comment.sub_comment_count == 0:
continue
current_page = 1
max_sub_page_num = comment.sub_comment_count // 10 + 1
while max_sub_page_num >= current_page:
params = {
"tid": comment.note_id, # 帖子ID
"pid": comment.comment_id, # 父级评论ID
"fid": comment.tieba_id, # 贴吧ID
"pn": current_page # 页码
}
page_content = await self.get(uri, params=params, return_ori_content=True)
sub_comments = self._page_extractor.extract_tieba_note_sub_comments(page_content,
parent_comment=comment)
if not sub_comments:
break
if callback:
await callback(comment.note_id, sub_comments)
all_sub_comments.extend(sub_comments)
await asyncio.sleep(crawl_interval)
current_page += 1
return all_sub_comments

View File

@ -100,7 +100,7 @@ class TieBaExtractor:
comment_field_value: Dict = self.extract_data_field_value(comment_selector)
if not comment_field_value:
continue
tieba_name = comment_selector.xpath("//a[@class='card_title_fname']/text()").get(default='').strip()
other_info_content = comment_selector.xpath(".//div[@class='post-tail-wrap']").get(default="").strip()
ip_location, publish_time = self.extract_ip_and_pub_time(other_info_content)
tieba_comment = TiebaComment(
@ -108,12 +108,15 @@ class TieBaExtractor:
sub_comment_count=comment_field_value.get("content").get("comment_num"),
content=utils.extract_text_from_html(comment_field_value.get("content").get("content")),
note_url=const.TIEBA_URL + f"/p/{note_id}",
user_link=const.TIEBA_URL + comment_selector.xpath(".//a[@class='p_author_face ']/@href").get(default='').strip(),
user_link=const.TIEBA_URL + comment_selector.xpath(".//a[@class='p_author_face ']/@href").get(
default='').strip(),
user_nickname=comment_selector.xpath(".//a[@class='p_author_name j_user_card']/text()").get(
default='').strip(),
user_avatar=comment_selector.xpath(".//a[@class='p_author_face ']/img/@src").get(
default='').strip(),
tieba_name=comment_selector.xpath("//a[@class='card_title_fname']/text()").get(default='').strip(),
tieba_id=str(comment_field_value.get("content").get("forum_id", "")),
tieba_name=tieba_name,
tieba_link=f"https://tieba.baidu.com/f?kw={tieba_name}",
ip_location=ip_location,
publish_time=publish_time,
note_id=note_id,
@ -121,6 +124,45 @@ class TieBaExtractor:
result.append(tieba_comment)
return result
def extract_tieba_note_sub_comments(self,page_content: str, parent_comment: TiebaComment) -> List[TiebaComment]:
"""
提取贴吧帖子二级评论
Args:
page_content:
parent_comment:
Returns:
"""
selector = Selector(page_content)
comments = []
comment_ele_list = selector.xpath("//li[@class='lzl_single_post j_lzl_s_p first_no_border']")
comment_ele_list.extend(selector.xpath("//li[@class='lzl_single_post j_lzl_s_p ']"))
for comment_ele in comment_ele_list:
comment_value = self.extract_data_field_value(comment_ele)
if not comment_value:
continue
comment_user_a_selector = comment_ele.xpath("./a[@class='j_user_card lzl_p_p']")[0]
content = utils.extract_text_from_html(comment_ele.xpath(".//span[@class='lzl_content_main']").get(default=""))
comment = TiebaComment(
comment_id=str(comment_value.get("spid")),
content=content,
user_link=comment_user_a_selector.xpath("./@href").get(default=""),
user_nickname=comment_value.get("showname"),
user_avatar=comment_user_a_selector.xpath("./img/@src").get(default=""),
publish_time=comment_ele.xpath(".//span[@class='lzl_time']/text()").get(default="").strip(),
parent_comment_id=parent_comment.comment_id,
note_id=parent_comment.note_id,
note_url=parent_comment.note_url,
tieba_id=parent_comment.tieba_id,
tieba_name=parent_comment.tieba_name,
tieba_link=parent_comment.tieba_link
)
comments.append(comment)
return comments
@staticmethod
def extract_ip_and_pub_time(html_content: str) -> Tuple[str, str]:
"""
@ -162,8 +204,6 @@ class TieBaExtractor:
return data_field_dict_value
def test_extract_search_note_list():
with open("test_data/search_keyword_notes.html", "r", encoding="utf-8") as f:
content = f.read()
@ -179,6 +219,7 @@ def test_extract_note_detail():
result = extractor.extract_note_detail(content)
print(result.model_dump())
def test_extract_tieba_note_parment_comments():
with open("test_data/note_comments.html", "r", encoding="utf-8") as f:
content = f.read()
@ -186,7 +227,28 @@ def test_extract_tieba_note_parment_comments():
result = extractor.extract_tieba_note_parment_comments(content, "123456")
print(result)
def test_extract_tieba_note_sub_comments():
with open("test_data/note_sub_comments.html", "r", encoding="utf-8") as f:
content = f.read()
extractor = TieBaExtractor()
fake_parment_comment = TiebaComment(
comment_id="123456",
content="content",
user_link="user_link",
user_nickname="user_nickname",
user_avatar="user_avatar",
publish_time="publish_time",
parent_comment_id="parent_comment_id",
note_id="note_id",
note_url="note_url",
tieba_id="tieba_id",
tieba_name="tieba_name",
)
result = extractor.extract_tieba_note_sub_comments(content,fake_parment_comment)
print(result)
if __name__ == '__main__':
# test_extract_search_note_list()
# test_extract_note_detail()
test_extract_tieba_note_parment_comments()
# test_extract_tieba_note_parment_comments()
test_extract_tieba_note_sub_comments()

View File

@ -0,0 +1,189 @@
<li class="lzl_single_post j_lzl_s_p first_no_border" data-field='{&quot;spid&quot;:150726504693,&quot;showname&quot;:&quot;heinzfrentzen&quot;,&quot;user_name&quot;:&quot;heinzfrentzen&quot;,&quot;portrait&quot;:&quot;tb.1.b08d8f12.IR-tbLlZ2GkD6ARA-mfGOA&quot;}'>
<a rel="noopener" name="150726504693"></a>
<a rel="noopener" data-field='{&quot;un&quot;:&quot;heinzfrentzen&quot;,&quot;id&quot;:&quot;tb.1.b08d8f12.IR-tbLlZ2GkD6ARA-mfGOA&quot;}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.b08d8f12.IR-tbLlZ2GkD6ARA-mfGOA&fr=pb" username="heinzfrentzen">
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.b08d8f12.IR-tbLlZ2GkD6ARA-mfGOA"/>
</a>
<div class="lzl_cnt" data-field='{&quot;iconArr&quot;:[],&quot;free_flag&quot;:null}'>
<a rel="noopener" class="at j_user_card " data-field='{&quot;un&quot;:&quot;heinzfrentzen&quot;,&quot;id&quot;:&quot;tb.1.b08d8f12.IR-tbLlZ2GkD6ARA-mfGOA&quot;}' href="/home/main?id=tb.1.b08d8f12.IR-tbLlZ2GkD6ARA-mfGOA&ie=utf-8&fr=pb" target="_blank" username="heinzfrentzen">heinzfrentzen</a>
:
<span class="lzl_content_main" data-username="">
<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon25.png">
<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon25.png">
</span>
<div class="lzl_content_reply">
<span class="lzl_jb" style="display:none;"></span>
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
<span class="lzl_time">2024-8-6 22:11</span>
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
</div>
</div>
</li>
<li class="lzl_single_post j_lzl_s_p " data-field='{&quot;spid&quot;:150726506822,&quot;showname&quot;:&quot;\u53ef\u7231\u7684\u642c\u8fd0\u5de594&quot;,&quot;user_name&quot;:&quot;\u53ef\u7231\u7684\u642c\u8fd0\u5de594&quot;,&quot;portrait&quot;:&quot;tb.1.f1b47a84.Rixjf6fMP-PfH8fnS1CgRA&quot;}'>
<a rel="noopener" name="150726506822"></a>
<a rel="noopener" data-field='{&quot;un&quot;:&quot;\u53ef\u7231\u7684\u642c\u8fd0\u5de594&quot;,&quot;id&quot;:&quot;tb.1.f1b47a84.Rixjf6fMP-PfH8fnS1CgRA&quot;}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.f1b47a84.Rixjf6fMP-PfH8fnS1CgRA&fr=pb" username="可爱的搬运工94">
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.f1b47a84.Rixjf6fMP-PfH8fnS1CgRA"/>
</a>
<div class="lzl_cnt" data-field='{&quot;iconArr&quot;:[],&quot;free_flag&quot;:null}'>
<a rel="noopener" class="at j_user_card " data-field='{&quot;un&quot;:&quot;\u53ef\u7231\u7684\u642c\u8fd0\u5de594&quot;,&quot;id&quot;:&quot;tb.1.f1b47a84.Rixjf6fMP-PfH8fnS1CgRA&quot;}' href="/home/main?id=tb.1.f1b47a84.Rixjf6fMP-PfH8fnS1CgRA&ie=utf-8&fr=pb" target="_blank" username="可爱的搬运工94">可爱的搬运工94</a>
:<span class="lzl_content_main" data-username="">陈芋汐水花也不小 </span>
<div class="lzl_content_reply">
<span class="lzl_jb" style="display:none;"></span>
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
<span class="lzl_time">2024-8-6 22:12</span>
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
</div>
</div>
</li>
<li class="lzl_single_post j_lzl_s_p " data-field='{&quot;spid&quot;:150726508024,&quot;showname&quot;:&quot;\u56fd\u9645\u4f53\u575b\u5de8\u661f\u9752\u6912\u8089\u4e1d&quot;,&quot;user_name&quot;:&quot;\u8682\u8681\u96c5\u864e\u54c8\u54c8&quot;,&quot;portrait&quot;:&quot;tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg&quot;}'>
<a rel="noopener" name="150726508024"></a>
<a rel="noopener" data-field='{&quot;un&quot;:&quot;\u8682\u8681\u96c5\u864e\u54c8\u54c8&quot;,&quot;id&quot;:&quot;tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg&quot;}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg&fr=pb" username="蚂蚁雅虎哈哈">
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg"/>
</a>
<div class="lzl_cnt" data-field='{&quot;iconArr&quot;:[],&quot;free_flag&quot;:null}'>
<a rel="noopener" class="at j_user_card " data-field='{&quot;un&quot;:&quot;\u8682\u8681\u96c5\u864e\u54c8\u54c8&quot;,&quot;id&quot;:&quot;tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg&quot;}' href="/home/main?id=tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg&ie=utf-8&fr=pb" target="_blank" username="蚂蚁雅虎哈哈">国际体坛巨星青椒肉丝</a>
:<span class="lzl_content_main" data-username="">你怀孕了吗 老是呕吐 </span>
<div class="lzl_content_reply">
<span class="lzl_jb" style="display:none;"></span>
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
<span class="lzl_time">2024-8-6 22:12</span>
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
</div>
</div>
</li>
<li class="lzl_single_post j_lzl_s_p " data-field='{&quot;spid&quot;:150726509762,&quot;showname&quot;:&quot;\u8317\u82b1\u5c11\u5e05&quot;,&quot;user_name&quot;:&quot;\u8317\u82b1\u5c11\u5e05&quot;,&quot;portrait&quot;:&quot;tb.1.a0b6ca3c.54TCKizU2c9oSYWqNF7NqA&quot;}'>
<a rel="noopener" name="150726509762"></a>
<a rel="noopener" data-field='{&quot;un&quot;:&quot;\u8317\u82b1\u5c11\u5e05&quot;,&quot;id&quot;:&quot;tb.1.a0b6ca3c.54TCKizU2c9oSYWqNF7NqA&quot;}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.a0b6ca3c.54TCKizU2c9oSYWqNF7NqA&fr=pb" username="茗花少帅">
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.a0b6ca3c.54TCKizU2c9oSYWqNF7NqA"/>
</a>
<div class="lzl_cnt" data-field='{&quot;iconArr&quot;:{&quot;all_level&quot;:{&quot;2&quot;:{&quot;end_time&quot;:&quot;1421248220&quot;,&quot;level&quot;:2,&quot;pic_url&quot;:&quot;http:\/\/imgsrc.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg&quot;,&quot;score_limit&quot;:8000}},&quot;level&quot;:{&quot;end_time&quot;:&quot;1421248220&quot;,&quot;pic_url&quot;:&quot;http:\/\/imgsrc.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg&quot;,&quot;props_id&quot;:2}},&quot;free_flag&quot;:null}'>
<a rel="noopener" class="at j_user_card " data-field='{&quot;un&quot;:&quot;\u8317\u82b1\u5c11\u5e05&quot;,&quot;id&quot;:&quot;tb.1.a0b6ca3c.54TCKizU2c9oSYWqNF7NqA&quot;}' href="/home/main?id=tb.1.a0b6ca3c.54TCKizU2c9oSYWqNF7NqA&ie=utf-8&fr=pb" target="_blank" username="茗花少帅">茗花少帅</a>
:<span class="lzl_content_main" data-username="">你就只看水花不看空中姿态吗 </span>
<div class="lzl_content_reply">
<span class="lzl_jb" style="display:none;"></span>
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
<span class="lzl_time">2024-8-6 22:12</span>
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
</div>
</div>
</li>
<li class="lzl_single_post j_lzl_s_p " data-field='{&quot;spid&quot;:150726510645,&quot;showname&quot;:&quot;\u4e1c\u534e\u6b66\u5170&quot;,&quot;user_name&quot;:&quot;\u897f\u5b89\u4ea4\u5927\u524d\u4e00\u767e&quot;,&quot;portrait&quot;:&quot;tb.1.774316af.RqsfwTN2w3AJQFmXAO_MHw&quot;}'>
<a rel="noopener" name="150726510645"></a>
<a rel="noopener" data-field='{&quot;un&quot;:&quot;\u897f\u5b89\u4ea4\u5927\u524d\u4e00\u767e&quot;,&quot;id&quot;:&quot;tb.1.774316af.RqsfwTN2w3AJQFmXAO_MHw&quot;}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.774316af.RqsfwTN2w3AJQFmXAO_MHw&fr=pb" username="西安交大前一百">
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.774316af.RqsfwTN2w3AJQFmXAO_MHw"/>
</a>
<div class="lzl_cnt" data-field='{&quot;iconArr&quot;:{&quot;all_level&quot;:{&quot;2&quot;:{&quot;end_time&quot;:&quot;1644033630&quot;,&quot;level&quot;:2,&quot;pic_url&quot;:&quot;http:\/\/imgsrc.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg&quot;,&quot;score_limit&quot;:8000}},&quot;level&quot;:{&quot;end_time&quot;:&quot;1644033630&quot;,&quot;pic_url&quot;:&quot;http:\/\/imgsrc.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg&quot;,&quot;props_id&quot;:2}},&quot;free_flag&quot;:null}'>
<a rel="noopener" class="at j_user_card " data-field='{&quot;un&quot;:&quot;\u897f\u5b89\u4ea4\u5927\u524d\u4e00\u767e&quot;,&quot;id&quot;:&quot;tb.1.774316af.RqsfwTN2w3AJQFmXAO_MHw&quot;}' href="/home/main?id=tb.1.774316af.RqsfwTN2w3AJQFmXAO_MHw&ie=utf-8&fr=pb" target="_blank" username="西安交大前一百">东华武兰</a>
:<span class="lzl_content_main" data-username="">经典只看水花 </span>
<div class="lzl_content_reply">
<span class="lzl_jb" style="display:none;"></span>
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
<span class="lzl_time">2024-8-6 22:12</span>
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
</div>
</div>
</li>
<li class="lzl_single_post j_lzl_s_p " data-field='{&quot;spid&quot;:150726514057,&quot;showname&quot;:&quot;\u4e0a\u4e0b\u73ed\u8981\u6ce8\u610f&quot;,&quot;user_name&quot;:&quot;\u4e0a\u4e0b\u73ed\u8981\u6ce8\u610f&quot;,&quot;portrait&quot;:&quot;tb.1.bcab9641.aHxSViAprkm6E0KQWrw3pg&quot;}'>
<a rel="noopener" name="150726514057"></a>
<a rel="noopener" data-field='{&quot;un&quot;:&quot;\u4e0a\u4e0b\u73ed\u8981\u6ce8\u610f&quot;,&quot;id&quot;:&quot;tb.1.bcab9641.aHxSViAprkm6E0KQWrw3pg&quot;}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.bcab9641.aHxSViAprkm6E0KQWrw3pg&fr=pb" username="上下班要注意">
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.bcab9641.aHxSViAprkm6E0KQWrw3pg"/>
</a>
<div class="lzl_cnt" data-field='{&quot;iconArr&quot;:[],&quot;free_flag&quot;:null}'>
<a rel="noopener" class="at j_user_card " data-field='{&quot;un&quot;:&quot;\u4e0a\u4e0b\u73ed\u8981\u6ce8\u610f&quot;,&quot;id&quot;:&quot;tb.1.bcab9641.aHxSViAprkm6E0KQWrw3pg&quot;}' href="/home/main?id=tb.1.bcab9641.aHxSViAprkm6E0KQWrw3pg&ie=utf-8&fr=pb" target="_blank" username="上下班要注意">上下班要注意</a>
:<span class="lzl_content_main" data-username="">分数正常吧 </span>
<div class="lzl_content_reply">
<span class="lzl_jb" style="display:none;"></span>
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
<span class="lzl_time">2024-8-6 22:13</span>
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
</div>
</div>
</li>
<li class="lzl_single_post j_lzl_s_p " data-field='{&quot;spid&quot;:150726520372,&quot;showname&quot;:&quot;\u9759\u770b\u8682\u8681\u4e0a\u6811&quot;,&quot;user_name&quot;:&quot;\u9759\u770b\u8682\u8681\u4e0a\u6811&quot;,&quot;portrait&quot;:&quot;tb.1.7ea539b2.dHz6uxKdbItmtGkwZeV6oQ&quot;}'>
<a rel="noopener" name="150726520372"></a>
<a rel="noopener" data-field='{&quot;un&quot;:&quot;\u9759\u770b\u8682\u8681\u4e0a\u6811&quot;,&quot;id&quot;:&quot;tb.1.7ea539b2.dHz6uxKdbItmtGkwZeV6oQ&quot;}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.7ea539b2.dHz6uxKdbItmtGkwZeV6oQ&fr=pb" username="静看蚂蚁上树">
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.7ea539b2.dHz6uxKdbItmtGkwZeV6oQ"/>
</a>
<div class="lzl_cnt" data-field='{&quot;iconArr&quot;:[],&quot;free_flag&quot;:null}'>
<a rel="noopener" class="at j_user_card " data-field='{&quot;un&quot;:&quot;\u9759\u770b\u8682\u8681\u4e0a\u6811&quot;,&quot;id&quot;:&quot;tb.1.7ea539b2.dHz6uxKdbItmtGkwZeV6oQ&quot;}' href="/home/main?id=tb.1.7ea539b2.dHz6uxKdbItmtGkwZeV6oQ&ie=utf-8&fr=pb" target="_blank" username="静看蚂蚁上树">静看蚂蚁上树</a>
:
<span class="lzl_content_main" data-username="">
回复 <a href="http://tieba.baidu.com/i/sys/jump?un= " onclick="Stats.sendRequest('fr=tb0_forum&st_mod=pb&st_value=atlink');" onmouseover="showattip(this)" onmouseout="hideattip(this)" username=" " portrait="tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg" target="_blank" class="at">国际体坛巨星青椒肉丝</a>
:吃酸黄瓜吃多了<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon22.png">
<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon22.png">
<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon22.png">
</span>
<div class="lzl_content_reply">
<span class="lzl_jb" style="display:none;"></span>
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
<span class="lzl_time">2024-8-6 22:14</span>
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
</div>
</div>
</li>
<li class="lzl_single_post j_lzl_s_p " data-field='{&quot;spid&quot;:150726524963,&quot;showname&quot;:&quot;\u4e0d\u61c2\u53d6\u5565\u540d\u5b57\ud83d\ude1c&quot;,&quot;user_name&quot;:&quot;\u9ec4\u5c0f\u6e2forz&quot;,&quot;portrait&quot;:&quot;tb.1.e74fa44d.lLp46IIhj8NhhHk12z_qRA&quot;}'>
<a rel="noopener" name="150726524963"></a>
<a rel="noopener" data-field='{&quot;un&quot;:&quot;\u9ec4\u5c0f\u6e2forz&quot;,&quot;id&quot;:&quot;tb.1.e74fa44d.lLp46IIhj8NhhHk12z_qRA&quot;}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.e74fa44d.lLp46IIhj8NhhHk12z_qRA&fr=pb" username="黄小港orz">
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.e74fa44d.lLp46IIhj8NhhHk12z_qRA"/>
</a>
<div class="lzl_cnt" data-field='{&quot;iconArr&quot;:[],&quot;free_flag&quot;:null}'>
<a rel="noopener" class="at j_user_card " data-field='{&quot;un&quot;:&quot;\u9ec4\u5c0f\u6e2forz&quot;,&quot;id&quot;:&quot;tb.1.e74fa44d.lLp46IIhj8NhhHk12z_qRA&quot;}' href="/home/main?id=tb.1.e74fa44d.lLp46IIhj8NhhHk12z_qRA&ie=utf-8&fr=pb" target="_blank" username="黄小港orz">不懂取啥名字😜</a>
:
<span class="lzl_content_main" data-username="">
请你去跟国际泳联投诉<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon22.png">
</span>
<div class="lzl_content_reply">
<span class="lzl_jb" style="display:none;"></span>
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
<span class="lzl_time">2024-8-6 22:15</span>
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
</div>
</div>
</li>
<li class="lzl_single_post j_lzl_s_p " data-field='{&quot;spid&quot;:150726535666,&quot;showname&quot;:&quot;\ud83d\udcab\u6cfd\u8d6b\u62c9\ud83d\udcaf&quot;,&quot;user_name&quot;:&quot;\u5feb\u770b\u5361\u5361\u5361\u5361&quot;,&quot;portrait&quot;:&quot;tb.1.5f510507.B4GLS91flqmWc5QXoaRCoQ&quot;}'>
<a rel="noopener" name="150726535666"></a>
<a rel="noopener" data-field='{&quot;un&quot;:&quot;\u5feb\u770b\u5361\u5361\u5361\u5361&quot;,&quot;id&quot;:&quot;tb.1.5f510507.B4GLS91flqmWc5QXoaRCoQ&quot;}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.5f510507.B4GLS91flqmWc5QXoaRCoQ&fr=pb" username="快看卡卡卡卡">
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.5f510507.B4GLS91flqmWc5QXoaRCoQ"/>
</a>
<div class="lzl_cnt" data-field='{&quot;iconArr&quot;:{&quot;all_level&quot;:{&quot;2&quot;:{&quot;end_time&quot;:&quot;1539783937&quot;,&quot;level&quot;:2,&quot;pic_url&quot;:&quot;http:\/\/imgsrc.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg&quot;,&quot;score_limit&quot;:8000}},&quot;level&quot;:{&quot;end_time&quot;:&quot;1539783937&quot;,&quot;pic_url&quot;:&quot;http:\/\/imgsrc.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg&quot;,&quot;props_id&quot;:2}},&quot;free_flag&quot;:null}'>
<a rel="noopener" class="at j_user_card " data-field='{&quot;un&quot;:&quot;\u5feb\u770b\u5361\u5361\u5361\u5361&quot;,&quot;id&quot;:&quot;tb.1.5f510507.B4GLS91flqmWc5QXoaRCoQ&quot;}' href="/home/main?id=tb.1.5f510507.B4GLS91flqmWc5QXoaRCoQ&ie=utf-8&fr=pb" target="_blank" username="快看卡卡卡卡">💫泽赫拉💯</a>
:<span class="lzl_content_main" data-username="">第五跳陈空中分腿了空中姿态明显全红婵更好 </span>
<div class="lzl_content_reply">
<span class="lzl_jb" style="display:none;"></span>
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
<span class="lzl_time">2024-8-6 22:17</span>
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
</div>
</div>
</li>
<li class="lzl_single_post j_lzl_s_p " data-field='{&quot;spid&quot;:150726536076,&quot;showname&quot;:&quot;\u55ef\u55ef\u54e6\u54e6\u554a\u554a\ud83d\udc36&quot;,&quot;user_name&quot;:&quot;\u55ef\u55ef\u54e6\u54e6\u554a\u554a\u54fc&quot;,&quot;portrait&quot;:&quot;tb.1.ba071e03._M1o8S5FX4p57pZBJa91CQ&quot;}'>
<a rel="noopener" name="150726536076"></a>
<a rel="noopener" data-field='{&quot;un&quot;:&quot;\u55ef\u55ef\u54e6\u54e6\u554a\u554a\u54fc&quot;,&quot;id&quot;:&quot;tb.1.ba071e03._M1o8S5FX4p57pZBJa91CQ&quot;}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.ba071e03._M1o8S5FX4p57pZBJa91CQ&fr=pb" username="嗯嗯哦哦啊啊哼">
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.ba071e03._M1o8S5FX4p57pZBJa91CQ"/>
</a>
<div class="lzl_cnt" data-field='{&quot;iconArr&quot;:null,&quot;free_flag&quot;:null}'>
<a rel="noopener" class="at j_user_card " data-field='{&quot;un&quot;:&quot;\u55ef\u55ef\u54e6\u54e6\u554a\u554a\u54fc&quot;,&quot;id&quot;:&quot;tb.1.ba071e03._M1o8S5FX4p57pZBJa91CQ&quot;}' href="/home/main?id=tb.1.ba071e03._M1o8S5FX4p57pZBJa91CQ&ie=utf-8&fr=pb" target="_blank" username="嗯嗯哦哦啊啊哼">嗯嗯哦哦啊啊🐶</a>
:
<span class="lzl_content_main" data-username="">
回复 <a href="http://tieba.baidu.com/i/sys/jump?un= " onclick="Stats.sendRequest('fr=tb0_forum&st_mod=pb&st_value=atlink');" onmouseover="showattip(this)" onmouseout="hideattip(this)" username=" " portrait="tb.1.84497425.b5GLK5lGm90mTB2BhjrgpA" target="_blank" class="at">美味蟹黄堡💞</a>
:你不会看起跳高度和空中姿态
</span>
<div class="lzl_content_reply">
<span class="lzl_jb" style="display:none;"></span>
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
<span class="lzl_time">2024-8-6 22:17</span>
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
</div>
</div>
</li>
<li class="lzl_li_pager j_lzl_l_p lzl_li_pager_s" data-field='{&quot;total_num&quot;:16,&quot;total_page&quot;:2}'>
<a rel="noopener" class="j_lzl_p btn-sub btn-small pull-right" href="##">
<i class="icon-reply"></i>
我也说一句
</a>
<p class="j_pager l_pager pager_theme_2">
<span class="tP">1</span>
<a href="#2">2</a>
<a href="#2">下一页</a>
<a href="#2">尾页</a>
</p>
</li>

View File

@ -28,7 +28,7 @@ class TiebaComment(BaseModel):
"""
comment_id: str = Field(..., description="评论ID")
parment_comment_id: str = Field(default="", description="父评论ID")
parent_comment_id: str = Field(default="", description="父评论ID")
content: str = Field(..., description="评论内容")
user_link: str = Field(default="", description="用户主页链接")
user_nickname: str = Field(default="", description="用户昵称")
@ -38,5 +38,7 @@ class TiebaComment(BaseModel):
sub_comment_count: int = Field(default=0, description="子评论数")
note_id: str = Field(..., description="帖子ID")
note_url: str = Field(..., description="帖子链接")
tieba_id: str = Field(..., description="所属的贴吧ID")
tieba_name: str = Field(..., description="所属的贴吧名称")
tieba_link: str = Field(..., description="贴吧链接")

View File

@ -359,9 +359,10 @@ CREATE TABLE tieba_note
`desc` TEXT COMMENT '帖子描述',
note_url VARCHAR(255) NOT NULL COMMENT '帖子链接',
publish_time VARCHAR(255) NOT NULL COMMENT '发布时间',
user_link VARCHAR(255) NOT NULL COMMENT '用户主页链接',
user_nickname VARCHAR(255) NOT NULL COMMENT '用户昵称',
user_avatar VARCHAR(255) NOT NULL COMMENT '用户头像地址',
user_link VARCHAR(255) DEFAULT '' COMMENT '用户主页链接',
user_nickname VARCHAR(255) DEFAULT '' COMMENT '用户昵称',
user_avatar VARCHAR(255) DEFAULT '' COMMENT '用户头像地址',
tieba_id VARCHAR(255) DEFAULT '' COMMENT '贴吧ID',
tieba_name VARCHAR(255) NOT NULL COMMENT '贴吧名称',
tieba_link VARCHAR(255) NOT NULL COMMENT '贴吧链接',
total_replay_num INT DEFAULT 0 COMMENT '帖子回复总数',
@ -378,17 +379,19 @@ CREATE TABLE tieba_comment
(
id BIGINT AUTO_INCREMENT PRIMARY KEY,
comment_id VARCHAR(255) NOT NULL COMMENT '评论ID',
parment_comment_id VARCHAR(255) DEFAULT '' COMMENT '父评论ID',
parent_comment_id VARCHAR(255) DEFAULT '' COMMENT '父评论ID',
content TEXT NOT NULL COMMENT '评论内容',
user_link VARCHAR(255) DEFAULT '' COMMENT '用户主页链接',
user_nickname VARCHAR(255) DEFAULT '' COMMENT '用户昵称',
user_avatar VARCHAR(255) DEFAULT '' COMMENT '用户头像地址',
tieba_id VARCHAR(255) DEFAULT '' COMMENT '贴吧ID',
tieba_name VARCHAR(255) NOT NULL COMMENT '贴吧名称',
tieba_link VARCHAR(255) NOT NULL COMMENT '贴吧链接',
publish_time VARCHAR(255) DEFAULT '' COMMENT '发布时间',
ip_location VARCHAR(255) DEFAULT '' COMMENT 'IP地理位置',
sub_comment_count INT DEFAULT 0 COMMENT '子评论数',
note_id VARCHAR(255) NOT NULL COMMENT '帖子ID',
note_url VARCHAR(255) NOT NULL COMMENT '帖子链接',
tieba_name VARCHAR(255) NOT NULL COMMENT '所属的贴吧名称',
add_ts BIGINT NOT NULL COMMENT '添加时间戳',
last_modify_ts BIGINT NOT NULL COMMENT '最后修改时间戳',
KEY `idx_tieba_comment_comment_id` (`note_id`),