feat: 百度贴吧子评论done
This commit is contained in:
parent
1208682a9a
commit
df0f5c1113
|
@ -1,6 +1,6 @@
|
|||
# 基础配置
|
||||
PLATFORM = "xhs"
|
||||
KEYWORDS = "缅甸边境,缅北边境,缅北边境线,缅甸边境线"
|
||||
KEYWORDS = "编程副业,编程兼职"
|
||||
LOGIN_TYPE = "qrcode" # qrcode or phone or cookie
|
||||
COOKIES = ""
|
||||
# 具体值参见media_platform.xxx.field下的枚举值,暂时只支持小红书
|
||||
|
@ -50,7 +50,7 @@ ENABLE_GET_COMMENTS = True
|
|||
|
||||
# 是否开启爬二级评论模式, 默认不开启爬二级评论
|
||||
# 老版本项目使用了 db, 则需参考 schema/tables.sql line 287 增加表字段
|
||||
ENABLE_GET_SUB_COMMENTS = False
|
||||
ENABLE_GET_SUB_COMMENTS = True
|
||||
|
||||
# 指定小红书需要爬虫的笔记ID列表
|
||||
XHS_SPECIFIED_ID_LIST = [
|
||||
|
|
|
@ -28,7 +28,10 @@ class BaiduTieBaClient(AbstractApiClient):
|
|||
):
|
||||
self.ip_pool: Optional[ProxyIpPool] = ip_pool
|
||||
self.timeout = timeout
|
||||
self.headers = utils.get_user_agent()
|
||||
self.headers = {
|
||||
"User-Agent": utils.get_user_agent(),
|
||||
"Cookies": "",
|
||||
}
|
||||
self._host = "https://tieba.baidu.com"
|
||||
self._page_extractor = TieBaExtractor()
|
||||
self.default_ip_proxy = default_ip_proxy
|
||||
|
@ -51,7 +54,7 @@ class BaiduTieBaClient(AbstractApiClient):
|
|||
async with httpx.AsyncClient(proxies=actual_proxies) as client:
|
||||
response = await client.request(
|
||||
method, url, timeout=self.timeout,
|
||||
**kwargs
|
||||
headers=self.headers, **kwargs
|
||||
)
|
||||
|
||||
if response.status_code != 200:
|
||||
|
@ -99,7 +102,7 @@ class BaiduTieBaClient(AbstractApiClient):
|
|||
self.default_ip_proxy = proxies
|
||||
return res
|
||||
|
||||
utils.logger.error(f"[BaiduTieBaClient.get] 达到了最大重试次数,请尝试更换新的IP代理: {e}")
|
||||
utils.logger.error(f"[BaiduTieBaClient.get] 达到了最大重试次数,IP已经被Block,请尝试更换新的IP代理: {e}")
|
||||
raise e
|
||||
|
||||
async def post(self, uri: str, data: dict, **kwargs) -> Dict:
|
||||
|
@ -154,7 +157,6 @@ class BaiduTieBaClient(AbstractApiClient):
|
|||
page_size: int = 10,
|
||||
sort: SearchSortType = SearchSortType.TIME_DESC,
|
||||
note_type: SearchNoteType = SearchNoteType.FIXED_THREAD,
|
||||
random_sleep: bool = True
|
||||
) -> List[TiebaNote]:
|
||||
"""
|
||||
根据关键词搜索贴吧帖子
|
||||
|
@ -164,8 +166,6 @@ class BaiduTieBaClient(AbstractApiClient):
|
|||
page_size: 每页大小
|
||||
sort: 结果排序方式
|
||||
note_type: 帖子类型(主题贴|主题+回复混合模式)
|
||||
random_sleep: 是否随机休眠
|
||||
|
||||
Returns:
|
||||
|
||||
"""
|
||||
|
@ -179,8 +179,6 @@ class BaiduTieBaClient(AbstractApiClient):
|
|||
"only_thread": note_type.value
|
||||
}
|
||||
page_content = await self.get(uri, params=params, return_ori_content=True)
|
||||
if random_sleep:
|
||||
random.randint(1, 5)
|
||||
return self._page_extractor.extract_search_note_list(page_content)
|
||||
|
||||
async def get_note_by_id(self, note_id: str) -> TiebaNote:
|
||||
|
@ -216,17 +214,20 @@ class BaiduTieBaClient(AbstractApiClient):
|
|||
"pn": current_page
|
||||
}
|
||||
page_content = await self.get(uri, params=params, return_ori_content=True)
|
||||
comments = self._page_extractor.extract_tieba_note_parment_comments(page_content, note_id=note_detail.note_id)
|
||||
comments = self._page_extractor.extract_tieba_note_parment_comments(page_content,
|
||||
note_id=note_detail.note_id)
|
||||
if not comments:
|
||||
break
|
||||
if callback:
|
||||
await callback(note_detail.note_id, comments)
|
||||
result.extend(comments)
|
||||
# 获取所有子评论
|
||||
await self.get_comments_all_sub_comments(comments, crawl_interval=crawl_interval, callback=callback)
|
||||
await asyncio.sleep(crawl_interval)
|
||||
current_page += 1
|
||||
return result
|
||||
|
||||
async def get_comments_all_sub_comments(self, comments: List[Dict], crawl_interval: float = 1.0,
|
||||
async def get_comments_all_sub_comments(self, comments: List[TiebaComment], crawl_interval: float = 1.0,
|
||||
callback: Optional[Callable] = None) -> List[TiebaComment]:
|
||||
"""
|
||||
获取指定评论下的所有子评论
|
||||
|
@ -238,7 +239,37 @@ class BaiduTieBaClient(AbstractApiClient):
|
|||
Returns:
|
||||
|
||||
"""
|
||||
uri = "/p/comment"
|
||||
if not config.ENABLE_GET_SUB_COMMENTS:
|
||||
return []
|
||||
|
||||
# todo 未完成子评论的爬取
|
||||
# # 贴吧获取所有子评论需要登录态
|
||||
# if self.headers.get("Cookies") == "" or not self.pong():
|
||||
# raise Exception(f"[BaiduTieBaClient.pong] Cookies is empty, please login first...")
|
||||
|
||||
all_sub_comments: List[TiebaComment] = []
|
||||
for comment in comments:
|
||||
if comment.sub_comment_count == 0:
|
||||
continue
|
||||
|
||||
current_page = 1
|
||||
max_sub_page_num = comment.sub_comment_count // 10 + 1
|
||||
while max_sub_page_num >= current_page:
|
||||
params = {
|
||||
"tid": comment.note_id, # 帖子ID
|
||||
"pid": comment.comment_id, # 父级评论ID
|
||||
"fid": comment.tieba_id, # 贴吧ID
|
||||
"pn": current_page # 页码
|
||||
}
|
||||
page_content = await self.get(uri, params=params, return_ori_content=True)
|
||||
sub_comments = self._page_extractor.extract_tieba_note_sub_comments(page_content,
|
||||
parent_comment=comment)
|
||||
|
||||
if not sub_comments:
|
||||
break
|
||||
if callback:
|
||||
await callback(comment.note_id, sub_comments)
|
||||
all_sub_comments.extend(sub_comments)
|
||||
await asyncio.sleep(crawl_interval)
|
||||
current_page += 1
|
||||
return all_sub_comments
|
||||
|
|
|
@ -100,7 +100,7 @@ class TieBaExtractor:
|
|||
comment_field_value: Dict = self.extract_data_field_value(comment_selector)
|
||||
if not comment_field_value:
|
||||
continue
|
||||
|
||||
tieba_name = comment_selector.xpath("//a[@class='card_title_fname']/text()").get(default='').strip()
|
||||
other_info_content = comment_selector.xpath(".//div[@class='post-tail-wrap']").get(default="").strip()
|
||||
ip_location, publish_time = self.extract_ip_and_pub_time(other_info_content)
|
||||
tieba_comment = TiebaComment(
|
||||
|
@ -108,12 +108,15 @@ class TieBaExtractor:
|
|||
sub_comment_count=comment_field_value.get("content").get("comment_num"),
|
||||
content=utils.extract_text_from_html(comment_field_value.get("content").get("content")),
|
||||
note_url=const.TIEBA_URL + f"/p/{note_id}",
|
||||
user_link=const.TIEBA_URL + comment_selector.xpath(".//a[@class='p_author_face ']/@href").get(default='').strip(),
|
||||
user_link=const.TIEBA_URL + comment_selector.xpath(".//a[@class='p_author_face ']/@href").get(
|
||||
default='').strip(),
|
||||
user_nickname=comment_selector.xpath(".//a[@class='p_author_name j_user_card']/text()").get(
|
||||
default='').strip(),
|
||||
user_avatar=comment_selector.xpath(".//a[@class='p_author_face ']/img/@src").get(
|
||||
default='').strip(),
|
||||
tieba_name=comment_selector.xpath("//a[@class='card_title_fname']/text()").get(default='').strip(),
|
||||
tieba_id=str(comment_field_value.get("content").get("forum_id", "")),
|
||||
tieba_name=tieba_name,
|
||||
tieba_link=f"https://tieba.baidu.com/f?kw={tieba_name}",
|
||||
ip_location=ip_location,
|
||||
publish_time=publish_time,
|
||||
note_id=note_id,
|
||||
|
@ -121,6 +124,45 @@ class TieBaExtractor:
|
|||
result.append(tieba_comment)
|
||||
return result
|
||||
|
||||
|
||||
def extract_tieba_note_sub_comments(self, page_content: str, parent_comment: TiebaComment) -> List[TiebaComment]:
    """
    Extract the second-level (sub) comments from a Tieba sub-comment page.

    Args:
        page_content: HTML of the sub-comment list belonging to one parent comment.
        parent_comment: The first-level comment the replies belong to; its comment ID
            becomes ``parent_comment_id`` and its note/tieba fields are copied onto
            every extracted sub-comment.

    Returns:
        One TiebaComment per reply ``<li>`` for which ``extract_data_field_value``
        returns a non-empty value. Items without it are skipped.
    """
    selector = Selector(page_content)
    comments = []
    # The class attribute is matched exactly: one variant ends with "first_no_border",
    # the other ends with a trailing space.
    comment_ele_list = selector.xpath("//li[@class='lzl_single_post j_lzl_s_p first_no_border']")
    comment_ele_list.extend(selector.xpath("//li[@class='lzl_single_post j_lzl_s_p ']"))
    for comment_ele in comment_ele_list:
        comment_value = self.extract_data_field_value(comment_ele)
        if not comment_value:
            continue

        # Keep the SelectorList instead of indexing [0]: a reply without the avatar
        # anchor then yields empty strings below rather than raising IndexError and
        # discarding every comment on the page.
        comment_user_a_selector = comment_ele.xpath("./a[@class='j_user_card lzl_p_p']")

        # The anchor href is site-relative ("/home/main?id=..."); store it as an
        # absolute URL, as the first-level comment extractor does.
        user_link = comment_user_a_selector.xpath("./@href").get(default="").strip()
        if user_link.startswith("/"):
            user_link = const.TIEBA_URL + user_link

        content = utils.extract_text_from_html(
            comment_ele.xpath(".//span[@class='lzl_content_main']").get(default=""))
        comment = TiebaComment(
            comment_id=str(comment_value.get("spid")),
            content=content,
            user_link=user_link,
            # Fall back to "" so a missing "showname" does not put None in a str field.
            user_nickname=comment_value.get("showname") or "",
            user_avatar=comment_user_a_selector.xpath("./img/@src").get(default=""),
            publish_time=comment_ele.xpath(".//span[@class='lzl_time']/text()").get(default="").strip(),
            parent_comment_id=parent_comment.comment_id,
            note_id=parent_comment.note_id,
            note_url=parent_comment.note_url,
            tieba_id=parent_comment.tieba_id,
            tieba_name=parent_comment.tieba_name,
            tieba_link=parent_comment.tieba_link
        )
        comments.append(comment)

    return comments
|
||||
|
||||
@staticmethod
|
||||
def extract_ip_and_pub_time(html_content: str) -> Tuple[str, str]:
|
||||
"""
|
||||
|
@ -162,8 +204,6 @@ class TieBaExtractor:
|
|||
return data_field_dict_value
|
||||
|
||||
|
||||
|
||||
|
||||
def test_extract_search_note_list():
|
||||
with open("test_data/search_keyword_notes.html", "r", encoding="utf-8") as f:
|
||||
content = f.read()
|
||||
|
@ -179,6 +219,7 @@ def test_extract_note_detail():
|
|||
result = extractor.extract_note_detail(content)
|
||||
print(result.model_dump())
|
||||
|
||||
|
||||
def test_extract_tieba_note_parment_comments():
|
||||
with open("test_data/note_comments.html", "r", encoding="utf-8") as f:
|
||||
content = f.read()
|
||||
|
@ -186,7 +227,28 @@ def test_extract_tieba_note_parment_comments():
|
|||
result = extractor.extract_tieba_note_parment_comments(content, "123456")
|
||||
print(result)
|
||||
|
||||
def test_extract_tieba_note_sub_comments():
    """Manual smoke test: parse the saved sub-comment fixture page and print the result."""
    with open("test_data/note_sub_comments.html", "r", encoding="utf-8") as f:
        content = f.read()
    extractor = TieBaExtractor()
    # Placeholder parent comment; only its fields are copied onto the extracted replies.
    fake_parent_comment = TiebaComment(
        comment_id="123456",
        content="content",
        user_link="user_link",
        user_nickname="user_nickname",
        user_avatar="user_avatar",
        publish_time="publish_time",
        parent_comment_id="parent_comment_id",
        note_id="note_id",
        note_url="note_url",
        tieba_id="tieba_id",
        tieba_name="tieba_name",
        # tieba_link is a required TiebaComment field; omitting it makes the model
        # constructor fail before the extractor is ever exercised.
        tieba_link="tieba_link",
    )
    result = extractor.extract_tieba_note_sub_comments(content, fake_parent_comment)
    print(result)
|
||||
|
||||
if __name__ == '__main__':
|
||||
# test_extract_search_note_list()
|
||||
# test_extract_note_detail()
|
||||
test_extract_tieba_note_parment_comments()
|
||||
# test_extract_tieba_note_parment_comments()
|
||||
test_extract_tieba_note_sub_comments()
|
||||
|
|
|
@ -0,0 +1,189 @@
|
|||
<li class="lzl_single_post j_lzl_s_p first_no_border" data-field='{"spid":150726504693,"showname":"heinzfrentzen","user_name":"heinzfrentzen","portrait":"tb.1.b08d8f12.IR-tbLlZ2GkD6ARA-mfGOA"}'>
|
||||
<a rel="noopener" name="150726504693"></a>
|
||||
<a rel="noopener" data-field='{"un":"heinzfrentzen","id":"tb.1.b08d8f12.IR-tbLlZ2GkD6ARA-mfGOA"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.b08d8f12.IR-tbLlZ2GkD6ARA-mfGOA&fr=pb" username="heinzfrentzen">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.b08d8f12.IR-tbLlZ2GkD6ARA-mfGOA"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":[],"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"heinzfrentzen","id":"tb.1.b08d8f12.IR-tbLlZ2GkD6ARA-mfGOA"}' href="/home/main?id=tb.1.b08d8f12.IR-tbLlZ2GkD6ARA-mfGOA&ie=utf-8&fr=pb" target="_blank" username="heinzfrentzen">heinzfrentzen</a>
|
||||
:
|
||||
<span class="lzl_content_main" data-username="">
|
||||
<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon25.png">
|
||||
<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon25.png">
|
||||
</span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:11</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_single_post j_lzl_s_p " data-field='{"spid":150726506822,"showname":"\u53ef\u7231\u7684\u642c\u8fd0\u5de594","user_name":"\u53ef\u7231\u7684\u642c\u8fd0\u5de594","portrait":"tb.1.f1b47a84.Rixjf6fMP-PfH8fnS1CgRA"}'>
|
||||
<a rel="noopener" name="150726506822"></a>
|
||||
<a rel="noopener" data-field='{"un":"\u53ef\u7231\u7684\u642c\u8fd0\u5de594","id":"tb.1.f1b47a84.Rixjf6fMP-PfH8fnS1CgRA"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.f1b47a84.Rixjf6fMP-PfH8fnS1CgRA&fr=pb" username="可爱的搬运工94">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.f1b47a84.Rixjf6fMP-PfH8fnS1CgRA"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":[],"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"\u53ef\u7231\u7684\u642c\u8fd0\u5de594","id":"tb.1.f1b47a84.Rixjf6fMP-PfH8fnS1CgRA"}' href="/home/main?id=tb.1.f1b47a84.Rixjf6fMP-PfH8fnS1CgRA&ie=utf-8&fr=pb" target="_blank" username="可爱的搬运工94">可爱的搬运工94</a>
|
||||
:<span class="lzl_content_main" data-username="">陈芋汐水花也不小 </span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:12</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_single_post j_lzl_s_p " data-field='{"spid":150726508024,"showname":"\u56fd\u9645\u4f53\u575b\u5de8\u661f\u9752\u6912\u8089\u4e1d","user_name":"\u8682\u8681\u96c5\u864e\u54c8\u54c8","portrait":"tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg"}'>
|
||||
<a rel="noopener" name="150726508024"></a>
|
||||
<a rel="noopener" data-field='{"un":"\u8682\u8681\u96c5\u864e\u54c8\u54c8","id":"tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg&fr=pb" username="蚂蚁雅虎哈哈">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":[],"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"\u8682\u8681\u96c5\u864e\u54c8\u54c8","id":"tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg"}' href="/home/main?id=tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg&ie=utf-8&fr=pb" target="_blank" username="蚂蚁雅虎哈哈">国际体坛巨星青椒肉丝</a>
|
||||
:<span class="lzl_content_main" data-username="">你怀孕了吗 老是呕吐 </span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:12</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_single_post j_lzl_s_p " data-field='{"spid":150726509762,"showname":"\u8317\u82b1\u5c11\u5e05","user_name":"\u8317\u82b1\u5c11\u5e05","portrait":"tb.1.a0b6ca3c.54TCKizU2c9oSYWqNF7NqA"}'>
|
||||
<a rel="noopener" name="150726509762"></a>
|
||||
<a rel="noopener" data-field='{"un":"\u8317\u82b1\u5c11\u5e05","id":"tb.1.a0b6ca3c.54TCKizU2c9oSYWqNF7NqA"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.a0b6ca3c.54TCKizU2c9oSYWqNF7NqA&fr=pb" username="茗花少帅">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.a0b6ca3c.54TCKizU2c9oSYWqNF7NqA"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":{"all_level":{"2":{"end_time":"1421248220","level":2,"pic_url":"http:\/\/imgsrc.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg","score_limit":8000}},"level":{"end_time":"1421248220","pic_url":"http:\/\/imgsrc.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg","props_id":2}},"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"\u8317\u82b1\u5c11\u5e05","id":"tb.1.a0b6ca3c.54TCKizU2c9oSYWqNF7NqA"}' href="/home/main?id=tb.1.a0b6ca3c.54TCKizU2c9oSYWqNF7NqA&ie=utf-8&fr=pb" target="_blank" username="茗花少帅">茗花少帅</a>
|
||||
:<span class="lzl_content_main" data-username="">你就只看水花,不看空中姿态吗 </span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:12</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_single_post j_lzl_s_p " data-field='{"spid":150726510645,"showname":"\u4e1c\u534e\u6b66\u5170","user_name":"\u897f\u5b89\u4ea4\u5927\u524d\u4e00\u767e","portrait":"tb.1.774316af.RqsfwTN2w3AJQFmXAO_MHw"}'>
|
||||
<a rel="noopener" name="150726510645"></a>
|
||||
<a rel="noopener" data-field='{"un":"\u897f\u5b89\u4ea4\u5927\u524d\u4e00\u767e","id":"tb.1.774316af.RqsfwTN2w3AJQFmXAO_MHw"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.774316af.RqsfwTN2w3AJQFmXAO_MHw&fr=pb" username="西安交大前一百">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.774316af.RqsfwTN2w3AJQFmXAO_MHw"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":{"all_level":{"2":{"end_time":"1644033630","level":2,"pic_url":"http:\/\/imgsrc.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg","score_limit":8000}},"level":{"end_time":"1644033630","pic_url":"http:\/\/imgsrc.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg","props_id":2}},"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"\u897f\u5b89\u4ea4\u5927\u524d\u4e00\u767e","id":"tb.1.774316af.RqsfwTN2w3AJQFmXAO_MHw"}' href="/home/main?id=tb.1.774316af.RqsfwTN2w3AJQFmXAO_MHw&ie=utf-8&fr=pb" target="_blank" username="西安交大前一百">东华武兰</a>
|
||||
:<span class="lzl_content_main" data-username="">经典只看水花 </span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:12</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_single_post j_lzl_s_p " data-field='{"spid":150726514057,"showname":"\u4e0a\u4e0b\u73ed\u8981\u6ce8\u610f","user_name":"\u4e0a\u4e0b\u73ed\u8981\u6ce8\u610f","portrait":"tb.1.bcab9641.aHxSViAprkm6E0KQWrw3pg"}'>
|
||||
<a rel="noopener" name="150726514057"></a>
|
||||
<a rel="noopener" data-field='{"un":"\u4e0a\u4e0b\u73ed\u8981\u6ce8\u610f","id":"tb.1.bcab9641.aHxSViAprkm6E0KQWrw3pg"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.bcab9641.aHxSViAprkm6E0KQWrw3pg&fr=pb" username="上下班要注意">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.bcab9641.aHxSViAprkm6E0KQWrw3pg"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":[],"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"\u4e0a\u4e0b\u73ed\u8981\u6ce8\u610f","id":"tb.1.bcab9641.aHxSViAprkm6E0KQWrw3pg"}' href="/home/main?id=tb.1.bcab9641.aHxSViAprkm6E0KQWrw3pg&ie=utf-8&fr=pb" target="_blank" username="上下班要注意">上下班要注意</a>
|
||||
:<span class="lzl_content_main" data-username="">额,分数正常吧 </span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:13</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_single_post j_lzl_s_p " data-field='{"spid":150726520372,"showname":"\u9759\u770b\u8682\u8681\u4e0a\u6811","user_name":"\u9759\u770b\u8682\u8681\u4e0a\u6811","portrait":"tb.1.7ea539b2.dHz6uxKdbItmtGkwZeV6oQ"}'>
|
||||
<a rel="noopener" name="150726520372"></a>
|
||||
<a rel="noopener" data-field='{"un":"\u9759\u770b\u8682\u8681\u4e0a\u6811","id":"tb.1.7ea539b2.dHz6uxKdbItmtGkwZeV6oQ"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.7ea539b2.dHz6uxKdbItmtGkwZeV6oQ&fr=pb" username="静看蚂蚁上树">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.7ea539b2.dHz6uxKdbItmtGkwZeV6oQ"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":[],"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"\u9759\u770b\u8682\u8681\u4e0a\u6811","id":"tb.1.7ea539b2.dHz6uxKdbItmtGkwZeV6oQ"}' href="/home/main?id=tb.1.7ea539b2.dHz6uxKdbItmtGkwZeV6oQ&ie=utf-8&fr=pb" target="_blank" username="静看蚂蚁上树">静看蚂蚁上树</a>
|
||||
:
|
||||
<span class="lzl_content_main" data-username="">
|
||||
回复 <a href="http://tieba.baidu.com/i/sys/jump?un= " onclick="Stats.sendRequest('fr=tb0_forum&st_mod=pb&st_value=atlink');" onmouseover="showattip(this)" onmouseout="hideattip(this)" username=" " portrait="tb.1.c5c485ab.Cf5aDgd1NxLxZlej8r4LWg" target="_blank" class="at">国际体坛巨星青椒肉丝</a>
|
||||
:吃酸黄瓜吃多了<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon22.png">
|
||||
<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon22.png">
|
||||
<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon22.png">
|
||||
</span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:14</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_single_post j_lzl_s_p " data-field='{"spid":150726524963,"showname":"\u4e0d\u61c2\u53d6\u5565\u540d\u5b57\ud83d\ude1c","user_name":"\u9ec4\u5c0f\u6e2forz","portrait":"tb.1.e74fa44d.lLp46IIhj8NhhHk12z_qRA"}'>
|
||||
<a rel="noopener" name="150726524963"></a>
|
||||
<a rel="noopener" data-field='{"un":"\u9ec4\u5c0f\u6e2forz","id":"tb.1.e74fa44d.lLp46IIhj8NhhHk12z_qRA"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.e74fa44d.lLp46IIhj8NhhHk12z_qRA&fr=pb" username="黄小港orz">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.e74fa44d.lLp46IIhj8NhhHk12z_qRA"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":[],"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"\u9ec4\u5c0f\u6e2forz","id":"tb.1.e74fa44d.lLp46IIhj8NhhHk12z_qRA"}' href="/home/main?id=tb.1.e74fa44d.lLp46IIhj8NhhHk12z_qRA&ie=utf-8&fr=pb" target="_blank" username="黄小港orz">不懂取啥名字😜</a>
|
||||
:
|
||||
<span class="lzl_content_main" data-username="">
|
||||
请你去跟国际泳联投诉<img class="BDE_Smiley" width="30" height="30" changedsize="false" src="https://gsp0.baidu.com/5aAHeD3nKhI2p27j8IqW0jdnxx1xbK/tb/editor/images/client/image_emoticon22.png">
|
||||
</span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:15</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_single_post j_lzl_s_p " data-field='{"spid":150726535666,"showname":"\ud83d\udcab\u6cfd\u8d6b\u62c9\ud83d\udcaf","user_name":"\u5feb\u770b\u5361\u5361\u5361\u5361","portrait":"tb.1.5f510507.B4GLS91flqmWc5QXoaRCoQ"}'>
|
||||
<a rel="noopener" name="150726535666"></a>
|
||||
<a rel="noopener" data-field='{"un":"\u5feb\u770b\u5361\u5361\u5361\u5361","id":"tb.1.5f510507.B4GLS91flqmWc5QXoaRCoQ"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.5f510507.B4GLS91flqmWc5QXoaRCoQ&fr=pb" username="快看卡卡卡卡">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.5f510507.B4GLS91flqmWc5QXoaRCoQ"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":{"all_level":{"2":{"end_time":"1539783937","level":2,"pic_url":"http:\/\/imgsrc.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg","score_limit":8000}},"level":{"end_time":"1539783937","pic_url":"http:\/\/imgsrc.baidu.com\/forum\/pic\/item\/6afa80cb39dbb6fdf9de234d0b24ab18962b37f0.jpg","props_id":2}},"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"\u5feb\u770b\u5361\u5361\u5361\u5361","id":"tb.1.5f510507.B4GLS91flqmWc5QXoaRCoQ"}' href="/home/main?id=tb.1.5f510507.B4GLS91flqmWc5QXoaRCoQ&ie=utf-8&fr=pb" target="_blank" username="快看卡卡卡卡">💫泽赫拉💯</a>
|
||||
:<span class="lzl_content_main" data-username="">第五跳陈空中分腿了,空中姿态明显全红婵更好 </span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:17</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_single_post j_lzl_s_p " data-field='{"spid":150726536076,"showname":"\u55ef\u55ef\u54e6\u54e6\u554a\u554a\ud83d\udc36","user_name":"\u55ef\u55ef\u54e6\u54e6\u554a\u554a\u54fc","portrait":"tb.1.ba071e03._M1o8S5FX4p57pZBJa91CQ"}'>
|
||||
<a rel="noopener" name="150726536076"></a>
|
||||
<a rel="noopener" data-field='{"un":"\u55ef\u55ef\u54e6\u54e6\u554a\u554a\u54fc","id":"tb.1.ba071e03._M1o8S5FX4p57pZBJa91CQ"}' target="_blank" class="j_user_card lzl_p_p" href="/home/main?id=tb.1.ba071e03._M1o8S5FX4p57pZBJa91CQ&fr=pb" username="嗯嗯哦哦啊啊哼">
|
||||
<img src="https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/tb.1.ba071e03._M1o8S5FX4p57pZBJa91CQ"/>
|
||||
</a>
|
||||
<div class="lzl_cnt" data-field='{"iconArr":null,"free_flag":null}'>
|
||||
<a rel="noopener" class="at j_user_card " data-field='{"un":"\u55ef\u55ef\u54e6\u54e6\u554a\u554a\u54fc","id":"tb.1.ba071e03._M1o8S5FX4p57pZBJa91CQ"}' href="/home/main?id=tb.1.ba071e03._M1o8S5FX4p57pZBJa91CQ&ie=utf-8&fr=pb" target="_blank" username="嗯嗯哦哦啊啊哼">嗯嗯哦哦啊啊🐶</a>
|
||||
:
|
||||
<span class="lzl_content_main" data-username="">
|
||||
回复 <a href="http://tieba.baidu.com/i/sys/jump?un= " onclick="Stats.sendRequest('fr=tb0_forum&st_mod=pb&st_value=atlink');" onmouseover="showattip(this)" onmouseout="hideattip(this)" username=" " portrait="tb.1.84497425.b5GLK5lGm90mTB2BhjrgpA" target="_blank" class="at">美味蟹黄堡💞</a>
|
||||
:你不会看起跳高度和空中姿态?
|
||||
</span>
|
||||
<div class="lzl_content_reply">
|
||||
<span class="lzl_jb" style="display:none;"></span>
|
||||
<span class="lzl_op_list j_lzl_o_l" style="display:none;"></span>
|
||||
<span class="lzl_time">2024-8-6 22:17</span>
|
||||
<a rel="noopener" href="#" class="lzl_s_r">回复</a>
|
||||
</div>
|
||||
</div>
|
||||
</li>
|
||||
<li class="lzl_li_pager j_lzl_l_p lzl_li_pager_s" data-field='{"total_num":16,"total_page":2}'>
|
||||
<a rel="noopener" class="j_lzl_p btn-sub btn-small pull-right" href="##">
|
||||
<i class="icon-reply"></i>
|
||||
我也说一句
|
||||
</a>
|
||||
<p class="j_pager l_pager pager_theme_2">
|
||||
<span class="tP">1</span>
|
||||
<a href="#2">2</a>
|
||||
<a href="#2">下一页</a>
|
||||
<a href="#2">尾页</a>
|
||||
</p>
|
||||
</li>
|
|
@ -28,7 +28,7 @@ class TiebaComment(BaseModel):
|
|||
"""
|
||||
|
||||
comment_id: str = Field(..., description="评论ID")
|
||||
parment_comment_id: str = Field(default="", description="父评论ID")
|
||||
parent_comment_id: str = Field(default="", description="父评论ID")
|
||||
content: str = Field(..., description="评论内容")
|
||||
user_link: str = Field(default="", description="用户主页链接")
|
||||
user_nickname: str = Field(default="", description="用户昵称")
|
||||
|
@ -38,5 +38,7 @@ class TiebaComment(BaseModel):
|
|||
sub_comment_count: int = Field(default=0, description="子评论数")
|
||||
note_id: str = Field(..., description="帖子ID")
|
||||
note_url: str = Field(..., description="帖子链接")
|
||||
tieba_id: str = Field(..., description="所属的贴吧ID")
|
||||
tieba_name: str = Field(..., description="所属的贴吧名称")
|
||||
tieba_link: str = Field(..., description="贴吧链接")
|
||||
|
||||
|
|
|
@ -359,9 +359,10 @@ CREATE TABLE tieba_note
|
|||
`desc` TEXT COMMENT '帖子描述',
|
||||
note_url VARCHAR(255) NOT NULL COMMENT '帖子链接',
|
||||
publish_time VARCHAR(255) NOT NULL COMMENT '发布时间',
|
||||
user_link VARCHAR(255) NOT NULL COMMENT '用户主页链接',
|
||||
user_nickname VARCHAR(255) NOT NULL COMMENT '用户昵称',
|
||||
user_avatar VARCHAR(255) NOT NULL COMMENT '用户头像地址',
|
||||
user_link VARCHAR(255) DEFAULT '' COMMENT '用户主页链接',
|
||||
user_nickname VARCHAR(255) DEFAULT '' COMMENT '用户昵称',
|
||||
user_avatar VARCHAR(255) DEFAULT '' COMMENT '用户头像地址',
|
||||
tieba_id VARCHAR(255) DEFAULT '' COMMENT '贴吧ID',
|
||||
tieba_name VARCHAR(255) NOT NULL COMMENT '贴吧名称',
|
||||
tieba_link VARCHAR(255) NOT NULL COMMENT '贴吧链接',
|
||||
total_replay_num INT DEFAULT 0 COMMENT '帖子回复总数',
|
||||
|
@ -378,17 +379,19 @@ CREATE TABLE tieba_comment
|
|||
(
|
||||
id BIGINT AUTO_INCREMENT PRIMARY KEY,
|
||||
comment_id VARCHAR(255) NOT NULL COMMENT '评论ID',
|
||||
parment_comment_id VARCHAR(255) DEFAULT '' COMMENT '父评论ID',
|
||||
parent_comment_id VARCHAR(255) DEFAULT '' COMMENT '父评论ID',
|
||||
content TEXT NOT NULL COMMENT '评论内容',
|
||||
user_link VARCHAR(255) DEFAULT '' COMMENT '用户主页链接',
|
||||
user_nickname VARCHAR(255) DEFAULT '' COMMENT '用户昵称',
|
||||
user_avatar VARCHAR(255) DEFAULT '' COMMENT '用户头像地址',
|
||||
tieba_id VARCHAR(255) DEFAULT '' COMMENT '贴吧ID',
|
||||
tieba_name VARCHAR(255) NOT NULL COMMENT '贴吧名称',
|
||||
tieba_link VARCHAR(255) NOT NULL COMMENT '贴吧链接',
|
||||
publish_time VARCHAR(255) DEFAULT '' COMMENT '发布时间',
|
||||
ip_location VARCHAR(255) DEFAULT '' COMMENT 'IP地理位置',
|
||||
sub_comment_count INT DEFAULT 0 COMMENT '子评论数',
|
||||
note_id VARCHAR(255) NOT NULL COMMENT '帖子ID',
|
||||
note_url VARCHAR(255) NOT NULL COMMENT '帖子链接',
|
||||
tieba_name VARCHAR(255) NOT NULL COMMENT '所属的贴吧名称',
|
||||
add_ts BIGINT NOT NULL COMMENT '添加时间戳',
|
||||
last_modify_ts BIGINT NOT NULL COMMENT '最后修改时间戳',
|
||||
KEY `idx_tieba_comment_comment_id` (`note_id`),
|
||||
|
|
Loading…
Reference in New Issue