commit aba9f14f50
parent 273c9a316b
@@ -34,7 +34,7 @@ MAX_COMMENTS_PER_POST = 10
 
 # Comment keyword filter (only comments containing a keyword are kept; an empty list means no restriction)
 COMMENT_KEYWORDS = [
-    "我"
+    # "真棒"
     # ........................
 ]
 
@@ -56,9 +56,10 @@ DY_SPECIFIED_ID_LIST = [
 # List of Kuaishou IDs to crawl
 KS_SPECIFIED_ID_LIST = []
 
-# List of Bilibili video IDs to crawl
+# List of Bilibili video bvids to crawl
 BILI_SPECIFIED_ID_LIST = [
-    "416252543",
-    "976148468"
+    "BV1d54y1g7db",
+    "BV1Sz4y1U77N",
+    "BV14Q4y1n7jz",
     # ........................
 ]
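Note: the COMMENT_KEYWORDS / MAX_COMMENTS_PER_POST options above feed a comment filter that this diff only references later (see `filtered_comments` in the Bilibili crawler). A minimal sketch of what such a filter can look like, assuming each comment is a dict with a `content` field; the helper name is hypothetical, not code from the repository:

```python
from typing import Dict, List

def filter_comments(comments: List[Dict], keywords: List[str], max_count: int) -> List[Dict]:
    """Keep comments whose content contains any keyword; an empty keyword
    list keeps everything. Cap the result at max_count per post."""
    if keywords:
        comments = [c for c in comments
                    if any(kw in c.get("content", "") for kw in keywords)]
    return comments[:max_count]

# example: only comments containing "我", at most 10 per post
print(filter_comments([{"content": "我觉得真棒"}, {"content": "first"}], ["我"], 10))
```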
db.py
@@ -16,7 +16,7 @@ async def init_db(create_db: bool = False) -> None:
 async def init():
     await init_db(create_db=True)
     await Tortoise.generate_schemas()
-    utils.logger.info("Init DB Success!")
+    utils.logger.info("[db.init] Init DB Success!")
 
 
 if __name__ == '__main__':
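Note: the rest of this commit applies the same pattern as the `[db.init]` change above: every log message gains a hand-written `[Class.method]` or `[module.function]` prefix so a message can be grepped back to its call site. A tiny stand-alone illustration of the convention (plain `logging` here; the project routes messages through `utils.logger`):

```python
import logging

logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
logger = logging.getLogger("MediaCrawler")

def demo() -> None:
    # before this commit: logger.info("Init DB Success!")
    logger.info("[db.init] Init DB Success!")  # prefix names the call site

demo()
```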
@@ -4,7 +4,7 @@
 # @Desc    : bilibili request client
 import asyncio
 import json
-from typing import Any, Callable, Dict, List, Optional, Tuple
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 from urllib.parse import urlencode
 
 import httpx
@@ -94,16 +94,16 @@ class BilibiliClient:
 
     async def pong(self) -> bool:
         """get a note to check if login state is ok"""
-        utils.logger.info("Begin pong bilibili...")
+        utils.logger.info("[BilibiliClient.pong] Begin pong bilibili...")
         ping_flag = False
         try:
             check_login_uri = "/x/web-interface/nav"
             response = await self.get(check_login_uri)
             if response.get("isLogin"):
-                utils.logger.info("use cache login state get web interface successfull!")
+                utils.logger.info("[BilibiliClient.pong] Use cache login state get web interface successfull!")
                 ping_flag = True
         except Exception as e:
-            utils.logger.error(f"Pong bilibili failed: {e}, and try to login again...")
+            utils.logger.error(f"[BilibiliClient.pong] Pong bilibili failed: {e}, and try to login again...")
             ping_flag = False
         return ping_flag
 
@@ -132,16 +132,22 @@ class BilibiliClient:
         }
         return await self.get(uri, post_data)
 
-    async def get_video_info(self, video_id: str) -> Dict:
+    async def get_video_info(self, aid: Union[int, None] = None, bvid: Union[str, None] = None) -> Dict:
         """
-        Bilibili web video detail api
-        :param video_id:
+        Bilibili web video detail api; pass either aid or bvid
+        :param aid: the video's avid
+        :param bvid: the video's bvid
         :return:
         """
+        if not aid and not bvid:
+            raise ValueError("请提供 aid 或 bvid 中的至少一个参数")
+
         uri = "/x/web-interface/view/detail"
-        params = {
-            "aid": video_id
-        }
+        params = dict()
+        if aid:
+            params.update({"aid": aid})
+        else:
+            params.update({"bvid": bvid})
         return await self.get(uri, params, enable_params_sign=False)
 
     async def get_video_comments(self,
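Note: a self-contained sketch of the endpoint the new `get_video_info` signature wraps. The `/x/web-interface/view/detail` URI comes from the diff; the `api.bilibili.com` host and the headers are assumptions, and the real client routes the call through its shared `get()` helper (with signing disabled for this endpoint) and its proxy settings:

```python
import asyncio
from typing import Dict, Optional

import httpx

API = "https://api.bilibili.com/x/web-interface/view/detail"

async def fetch_video_detail(aid: Optional[int] = None, bvid: Optional[str] = None) -> Dict:
    """Pass either aid or bvid, mirroring BilibiliClient.get_video_info."""
    if not aid and not bvid:
        raise ValueError("provide at least one of aid / bvid")
    params = {"aid": aid} if aid else {"bvid": bvid}
    async with httpx.AsyncClient() as client:
        response = await client.get(API, params=params,
                                    headers={"User-Agent": "Mozilla/5.0"})
        response.raise_for_status()
        return response.json()

# asyncio.run(fetch_video_detail(bvid="BV1d54y1g7db"))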
@@ -8,7 +8,7 @@ import os
 import random
 import time
 from asyncio import Task
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Optional, Tuple, Union
 
 from playwright.async_api import (BrowserContext, BrowserType, Page,
                                   async_playwright)
@@ -94,10 +94,10 @@ class BilibiliCrawler(AbstractCrawler):
         search bilibili video with keywords
         :return:
         """
-        utils.logger.info("Begin search bilibli keywords")
+        utils.logger.info("[BilibiliCrawler.search] Begin search bilibli keywords")
         bili_limit_count = 20  # bilibili limit page fixed value
         for keyword in config.KEYWORDS.split(","):
-            utils.logger.info(f"Current search keyword: {keyword}")
+            utils.logger.info(f"[BilibiliCrawler.search] Current search keyword: {keyword}")
             page = 1
             while page * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
                 video_id_list: List[str] = []
@@ -111,7 +111,7 @@ class BilibiliCrawler(AbstractCrawler):
 
                 semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
                 task_list = [
-                    self.get_video_info_task(video_item.get("aid"), semaphore)
+                    self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore)
                     for video_item in video_list
                 ]
                 video_items = await asyncio.gather(*task_list)
@@ -129,7 +129,7 @@ class BilibiliCrawler(AbstractCrawler):
         :param video_id_list:
         :return:
         """
-        utils.logger.info(f"[batch_get_video_comments] video ids:{video_id_list}")
+        utils.logger.info(f"[BilibiliCrawler.batch_get_video_comments] video ids:{video_id_list}")
         semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
         task_list: List[Task] = []
         for video_id in video_id_list:
@@ -146,7 +146,7 @@ class BilibiliCrawler(AbstractCrawler):
         """
         async with semaphore:
             try:
-                utils.logger.info(f"[get_comments] begin get video_id: {video_id} comments ...")
+                utils.logger.info(f"[BilibiliCrawler.get_comments] begin get video_id: {video_id} comments ...")
                 # Read keyword and quantity from config
                 keywords = config.COMMENT_KEYWORDS
                 max_comments = config.MAX_COMMENTS_PER_POST
@@ -174,9 +174,9 @@ class BilibiliCrawler(AbstractCrawler):
                 await bilibili.batch_update_bilibili_video_comments(video_id, filtered_comments)
 
             except DataFetchError as ex:
-                utils.logger.error(f"[get_comments] get video_id: {video_id} comment error: {ex}")
+                utils.logger.error(f"[BilibiliCrawler.get_comments] get video_id: {video_id} comment error: {ex}")
             except Exception as e:
-                utils.logger.error(f"[get_comments] may be been blocked, err:{e}")
+                utils.logger.error(f"[BilibiliCrawler.get_comments] may be been blocked, err:{e}")
 
     async def get_specified_videos(self):
         """
@@ -185,35 +185,42 @@ class BilibiliCrawler(AbstractCrawler):
         """
         semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
         task_list = [
-            self.get_video_info_task(video_id=video_id, semaphore=semaphore) for video_id in config.BILI_SPECIFIED_ID_LIST
+            self.get_video_info_task(aid=0, bvid=video_id, semaphore=semaphore) for video_id in
+            config.BILI_SPECIFIED_ID_LIST
         ]
         video_details = await asyncio.gather(*task_list)
+        video_aids_list = []
         for video_detail in video_details:
             if video_detail is not None:
+                video_item_view: Dict = video_detail.get("View")
+                video_aid: str = video_item_view.get("aid")
+                if video_aid:
+                    video_aids_list.append(video_aid)
                 await bilibili.update_bilibili_video(video_detail)
-        await self.batch_get_video_comments(config.BILI_SPECIFIED_ID_LIST)
+        await self.batch_get_video_comments(video_aids_list)
 
-    async def get_video_info_task(self, video_id: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
+    async def get_video_info_task(self, aid: int, bvid: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
         """
         Get video detail task
-        :param video_id:
+        :param aid:
+        :param bvid:
         :param semaphore:
         :return:
         """
         async with semaphore:
             try:
-                result = await self.bili_client.get_video_info(video_id)
+                result = await self.bili_client.get_video_info(aid=aid, bvid=bvid)
                 return result
             except DataFetchError as ex:
-                utils.logger.error(f"Get video detail error: {ex}")
+                utils.logger.error(f"[BilibiliCrawler.get_video_info_task] Get video detail error: {ex}")
                 return None
             except KeyError as ex:
-                utils.logger.error(f"have not fund note detail video_id:{video_id}, err: {ex}")
+                utils.logger.error(f"[BilibiliCrawler.get_video_info_task] have not fund note detail video_id:{bvid}, err: {ex}")
                 return None
 
     async def create_bilibili_client(self, httpx_proxy: Optional[str]) -> BilibiliClient:
         """Create xhs client"""
-        utils.logger.info("Begin create xiaohongshu API client ...")
+        utils.logger.info("[BilibiliCrawler.create_bilibili_client] Begin create xiaohongshu API client ...")
         cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
         bilibili_client_obj = BilibiliClient(
             proxies=httpx_proxy,
@@ -250,7 +257,7 @@ class BilibiliCrawler(AbstractCrawler):
             headless: bool = True
     ) -> BrowserContext:
         """Launch browser and create browser context"""
-        utils.logger.info("Begin create browser context ...")
+        utils.logger.info("[BilibiliCrawler.launch_browser] Begin create browser context ...")
         if config.SAVE_LOGIN_STATE:
             # feat issue #14
             # we will save login state to avoid login every time
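Note: `get_specified_videos` above shows the concurrency pattern that recurs throughout these crawlers: one `asyncio.Semaphore` shared by all tasks, fan-out with `asyncio.gather`, and `None` results filtered out afterwards. Stripped of crawler details, the shape is (the worker body is a placeholder):

```python
import asyncio
from typing import Optional

MAX_CONCURRENCY_NUM = 4  # stands in for config.MAX_CONCURRENCY_NUM

async def fetch_one(item: str, semaphore: asyncio.Semaphore) -> Optional[str]:
    async with semaphore:          # at most MAX_CONCURRENCY_NUM run at once
        await asyncio.sleep(0.1)   # placeholder for the real API call
        return item.upper()

async def main() -> None:
    semaphore = asyncio.Semaphore(MAX_CONCURRENCY_NUM)
    tasks = [fetch_one(v, semaphore) for v in ["BV1d54y1g7db", "BV1Sz4y1U77N"]]
    results = [r for r in await asyncio.gather(*tasks) if r is not None]
    print(results)

asyncio.run(main())
```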
@@ -34,7 +34,7 @@ class BilibiliLogin(AbstractLogin):
 
     async def begin(self):
         """Start login xiaohongshu"""
-        utils.logger.info("Begin login Bilibili ...")
+        utils.logger.info("[BilibiliLogin.begin] Begin login Bilibili ...")
         if self.login_type == "qrcode":
             await self.login_by_qrcode()
         elif self.login_type == "phone":
@@ -42,7 +42,7 @@ class BilibiliLogin(AbstractLogin):
         elif self.login_type == "cookie":
             await self.login_by_cookies()
         else:
-            raise ValueError("Invalid Login Type Currently only supported qrcode or phone or cookie ...")
+            raise ValueError("[BilibiliLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")
 
     @retry(stop=stop_after_attempt(20), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
     async def check_login_state(self) -> bool:
@@ -59,7 +59,7 @@ class BilibiliLogin(AbstractLogin):
 
     async def login_by_qrcode(self):
         """login bilibili website and keep webdriver login state"""
-        utils.logger.info("Begin login bilibili by qrcode ...")
+        utils.logger.info("[BilibiliLogin.login_by_qrcode] Begin login bilibili by qrcode ...")
 
         # click login button
         login_button_ele = self.context_page.locator(
@@ -74,29 +74,29 @@ class BilibiliLogin(AbstractLogin):
             selector=qrcode_img_selector
         )
         if not base64_qrcode_img:
-            utils.logger.info("login failed , have not found qrcode please check ....")
+            utils.logger.info("[BilibiliLogin.login_by_qrcode] login failed , have not found qrcode please check ....")
             sys.exit()
 
         # show login qrcode
         partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
         asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)
 
-        utils.logger.info(f"Waiting for scan code login, remaining time is 20s")
+        utils.logger.info(f"[BilibiliLogin.login_by_qrcode] Waiting for scan code login, remaining time is 20s")
         try:
             await self.check_login_state()
         except RetryError:
-            utils.logger.info("Login bilibili failed by qrcode login method ...")
+            utils.logger.info("[BilibiliLogin.login_by_qrcode] Login bilibili failed by qrcode login method ...")
             sys.exit()
 
         wait_redirect_seconds = 5
-        utils.logger.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
+        utils.logger.info(f"[BilibiliLogin.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
         await asyncio.sleep(wait_redirect_seconds)
 
     async def login_by_mobile(self):
         pass
 
     async def login_by_cookies(self):
-        utils.logger.info("Begin login bilibili by cookie ...")
+        utils.logger.info("[BilibiliLogin.login_by_qrcode] Begin login bilibili by cookie ...")
         for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
             await self.browser_context.add_cookies([{
                 'name': key,
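Note: the `@retry(... retry_if_result(lambda value: value is False))` decorator on `check_login_state` above is tenacity's poll-until-true idiom: retry while the coroutine returns False, up to 20 attempts one second apart, then raise `RetryError`. A minimal stand-alone version; the fake state check is an assumption for the demo:

```python
import asyncio

from tenacity import RetryError, retry, retry_if_result, stop_after_attempt, wait_fixed

attempts = 0

@retry(stop=stop_after_attempt(20), wait=wait_fixed(1),
       retry=retry_if_result(lambda value: value is False))
async def check_login_state() -> bool:
    global attempts
    attempts += 1
    return attempts >= 3  # pretend the QR code gets scanned on the third poll

async def main() -> None:
    try:
        await check_login_state()
        print(f"logged in after {attempts} polls")
    except RetryError:
        print("never logged in within 20 polls")

asyncio.run(main())
```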
@@ -75,12 +75,12 @@ class DouYinCrawler(AbstractCrawler):
         # Get the information and comments of the specified post
         await self.get_specified_awemes()
 
-        utils.logger.info("Douyin Crawler finished ...")
+        utils.logger.info("[DouYinCrawler.start] Douyin Crawler finished ...")
 
     async def search(self) -> None:
-        utils.logger.info("Begin search douyin keywords")
+        utils.logger.info("[DouYinCrawler.search] Begin search douyin keywords")
         for keyword in config.KEYWORDS.split(","):
-            utils.logger.info(f"Current keyword: {keyword}")
+            utils.logger.info(f"[DouYinCrawler.search] Current keyword: {keyword}")
             aweme_list: List[str] = []
             dy_limit_count = 10
             page = 0
@@ -89,7 +89,7 @@ class DouYinCrawler(AbstractCrawler):
                 posts_res = await self.dy_client.search_info_by_keyword(keyword=keyword,
                                                                         offset=page * dy_limit_count)
             except DataFetchError:
-                utils.logger.error(f"search douyin keyword: {keyword} failed")
+                utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed")
                 break
             page += 1
             for post_item in posts_res.get("data"):
@@ -100,7 +100,7 @@ class DouYinCrawler(AbstractCrawler):
                     continue
                 aweme_list.append(aweme_info.get("aweme_id", ""))
                 await douyin.update_douyin_aweme(aweme_item=aweme_info)
-            utils.logger.info(f"keyword:{keyword}, aweme_list:{aweme_list}")
+            utils.logger.info(f"[DouYinCrawler.search] keyword:{keyword}, aweme_list:{aweme_list}")
             await self.batch_get_note_comments(aweme_list)
 
     async def get_specified_awemes(self):
@@ -121,10 +121,10 @@ class DouYinCrawler(AbstractCrawler):
         try:
             return await self.dy_client.get_video_by_id(aweme_id)
         except DataFetchError as ex:
-            utils.logger.error(f"Get aweme detail error: {ex}")
+            utils.logger.error(f"[DouYinCrawler.get_aweme_detail] Get aweme detail error: {ex}")
             return None
         except KeyError as ex:
-            utils.logger.error(f"have not fund note detail aweme_id:{aweme_id}, err: {ex}")
+            utils.logger.error(f"[DouYinCrawler.get_aweme_detail] have not fund note detail aweme_id:{aweme_id}, err: {ex}")
             return None
 
     async def batch_get_note_comments(self, aweme_list: List[str]) -> None:
@@ -147,9 +147,9 @@ class DouYinCrawler(AbstractCrawler):
                 )
                 # the comments returned here are already keyword-filtered
                 await douyin.batch_update_dy_aweme_comments(aweme_id, comments)
-                utils.logger.info(f"aweme_id: {aweme_id} comments have all been obtained and filtered ...")
+                utils.logger.info(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} comments have all been obtained and filtered ...")
             except DataFetchError as e:
-                utils.logger.error(f"aweme_id: {aweme_id} get comments failed, error: {e}")
+                utils.logger.error(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} get comments failed, error: {e}")
 
     @staticmethod
     def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]:
@@ -213,4 +213,4 @@ class DouYinCrawler(AbstractCrawler):
     async def close(self) -> None:
         """Close browser context"""
         await self.browser_context.close()
-        utils.logger.info("Browser context closed ...")
+        utils.logger.info("[DouYinCrawler.close] Browser context closed ...")
@@ -47,7 +47,7 @@ class DouYinLogin(AbstractLogin):
         elif self.login_type == "cookie":
             await self.login_by_cookies()
         else:
-            raise ValueError("Invalid Login Type Currently only supported qrcode or phone or cookie ...")
+            raise ValueError("[DouYinLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")
 
         # if the page redirects to the slider captcha, the slider must be moved again
         await asyncio.sleep(6)
@@ -56,16 +56,16 @@ class DouYinLogin(AbstractLogin):
         await self.check_page_display_slider(move_step=3, slider_level="hard")
 
         # check login state
-        utils.logger.info(f"login finished then check login state ...")
+        utils.logger.info(f"[DouYinLogin.begin] login finished then check login state ...")
         try:
             await self.check_login_state()
         except RetryError:
-            utils.logger.info("login failed please confirm ...")
+            utils.logger.info("[DouYinLogin.begin] login failed please confirm ...")
             sys.exit()
 
         # wait for redirect
         wait_redirect_seconds = 5
-        utils.logger.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
+        utils.logger.info(f"[DouYinLogin.begin] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
         await asyncio.sleep(wait_redirect_seconds)
 
     @retry(stop=stop_after_attempt(20), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
@@ -84,21 +84,21 @@ class DouYinLogin(AbstractLogin):
             # check dialog box is auto popup and wait for 10 seconds
             await self.context_page.wait_for_selector(dialog_selector, timeout=1000 * 10)
         except Exception as e:
-            utils.logger.error(f"login dialog box does not pop up automatically, error: {e}")
-            utils.logger.info("login dialog box does not pop up automatically, we will manually click the login button")
+            utils.logger.error(f"[DouYinLogin.popup_login_dialog] login dialog box does not pop up automatically, error: {e}")
+            utils.logger.info("[DouYinLogin.popup_login_dialog] login dialog box does not pop up automatically, we will manually click the login button")
             login_button_ele = self.context_page.locator("xpath=//p[text() = '登录']")
             await login_button_ele.click()
             await asyncio.sleep(0.5)
 
     async def login_by_qrcode(self):
-        utils.logger.info("Begin login douyin by qrcode...")
+        utils.logger.info("[DouYinLogin.login_by_qrcode] Begin login douyin by qrcode...")
         qrcode_img_selector = "xpath=//article[@class='web-login']//img"
         base64_qrcode_img = await utils.find_login_qrcode(
             self.context_page,
             selector=qrcode_img_selector
         )
         if not base64_qrcode_img:
-            utils.logger.info("login qrcode not found please confirm ...")
+            utils.logger.info("[DouYinLogin.login_by_qrcode] login qrcode not found please confirm ...")
             sys.exit()
 
         # show login qrcode
@@ -109,7 +109,7 @@ class DouYinLogin(AbstractLogin):
         await asyncio.sleep(2)
 
     async def login_by_mobile(self):
-        utils.logger.info("Begin login douyin by mobile ...")
+        utils.logger.info("[DouYinLogin.login_by_mobile] Begin login douyin by mobile ...")
         mobile_tap_ele = self.context_page.locator("xpath=//li[text() = '验证码登录']")
         await mobile_tap_ele.click()
         await self.context_page.wait_for_selector("xpath=//article[@class='web-login-mobile-code']")
@@ -124,7 +124,7 @@ class DouYinLogin(AbstractLogin):
         redis_obj = redis.Redis(host=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD)
         max_get_sms_code_time = 60 * 2  # wait at most 2 minutes for the sms code
         while max_get_sms_code_time > 0:
-            utils.logger.info(f"get douyin sms code from redis remaining time {max_get_sms_code_time}s ...")
+            utils.logger.info(f"[DouYinLogin.login_by_mobile] get douyin sms code from redis remaining time {max_get_sms_code_time}s ...")
             await asyncio.sleep(1)
             sms_code_key = f"dy_{self.login_phone}"
             sms_code_value = redis_obj.get(sms_code_key)
@@ -157,7 +157,7 @@ class DouYinLogin(AbstractLogin):
         slider_verify_success = False
         while not slider_verify_success:
             if max_slider_try_times <= 0:
-                utils.logger.error("slider verify failed ...")
+                utils.logger.error("[DouYinLogin.check_page_display_slider] slider verify failed ...")
                 sys.exit()
             try:
                 await self.move_slider(back_selector, gap_selector, move_step, slider_level)
@@ -166,20 +166,20 @@ class DouYinLogin(AbstractLogin):
                 # if the slide was too slow or verification failed, the page shows an "operation too slow" hint; click the refresh button
                 page_content = await self.context_page.content()
                 if "操作过慢" in page_content or "提示重新操作" in page_content:
-                    utils.logger.info("slider verify failed, retry ...")
+                    utils.logger.info("[DouYinLogin.check_page_display_slider] slider verify failed, retry ...")
                     await self.context_page.click(selector="//a[contains(@class, 'secsdk_captcha_refresh')]")
                     continue
 
                 # after a successful slide, wait for the slider to disappear
                 await self.context_page.wait_for_selector(selector=back_selector, state="hidden", timeout=1000)
                 # if the slider disappeared, verification succeeded and the loop exits; if not, the line above raises, the exception is caught, and the captcha is retried
-                utils.logger.info("slider verify success ...")
+                utils.logger.info("[DouYinLogin.check_page_display_slider] slider verify success ...")
                 slider_verify_success = True
             except Exception as e:
-                utils.logger.error(f"slider verify failed, error: {e}")
+                utils.logger.error(f"[DouYinLogin.check_page_display_slider] slider verify failed, error: {e}")
                 await asyncio.sleep(1)
                 max_slider_try_times -= 1
-                utils.logger.info(f"remaining slider try times: {max_slider_try_times}")
+                utils.logger.info(f"[DouYinLogin.check_page_display_slider] remaining slider try times: {max_slider_try_times}")
                 continue
 
     async def move_slider(self, back_selector: str, gap_selector: str, move_step: int = 10, slider_level="easy"):
@@ -236,7 +236,7 @@ class DouYinLogin(AbstractLogin):
         await self.context_page.mouse.up()
 
     async def login_by_cookies(self):
-        utils.logger.info("Begin login douyin by cookie ...")
+        utils.logger.info("[DouYinLogin.login_by_cookies] Begin login douyin by cookie ...")
         for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
             await self.browser_context.add_cookies([{
                 'name': key,
@@ -59,12 +59,12 @@ class KuaiShouClient:
     @staticmethod
     async def pong() -> bool:
         """get a note to check if login state is ok"""
-        utils.logger.info("Begin pong kuaishou...")
+        utils.logger.info("[KuaiShouClient.pong] Begin pong kuaishou...")
         ping_flag = False
         try:
             pass
         except Exception as e:
-            utils.logger.error(f"Pong kuaishou failed: {e}, and try to login again...")
+            utils.logger.error(f"[KuaiShouClient.pong] Pong kuaishou failed: {e}, and try to login again...")
            ping_flag = False
        return ping_flag
@@ -81,13 +81,13 @@ class KuaishouCrawler(AbstractCrawler):
         else:
             pass
 
-        utils.logger.info("Kuaishou Crawler finished ...")
+        utils.logger.info("[KuaishouCrawler.start] Kuaishou Crawler finished ...")
 
     async def search(self):
-        utils.logger.info("Begin search kuaishou keywords")
+        utils.logger.info("[KuaishouCrawler.search] Begin search kuaishou keywords")
         ks_limit_count = 20  # kuaishou limit page fixed value
         for keyword in config.KEYWORDS.split(","):
-            utils.logger.info(f"Current search keyword: {keyword}")
+            utils.logger.info(f"[KuaishouCrawler.search] Current search keyword: {keyword}")
             page = 1
             while page * ks_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
                 video_id_list: List[str] = []
@@ -96,12 +96,12 @@ class KuaishouCrawler(AbstractCrawler):
                     pcursor=str(page),
                 )
                 if not videos_res:
-                    utils.logger.error(f"search info by keyword:{keyword} not found data")
+                    utils.logger.error(f"[KuaishouCrawler.search] search info by keyword:{keyword} not found data")
                     continue
 
                 vision_search_photo: Dict = videos_res.get("visionSearchPhoto")
                 if vision_search_photo.get("result") != 1:
-                    utils.logger.error(f"search info by keyword:{keyword} not found data ")
+                    utils.logger.error(f"[KuaishouCrawler.search] search info by keyword:{keyword} not found data ")
                     continue
 
                 for video_detail in vision_search_photo.get("feeds"):
@@ -129,13 +129,13 @@ class KuaishouCrawler(AbstractCrawler):
         async with semaphore:
             try:
                 result = await self.ks_client.get_video_info(video_id)
-                utils.logger.info(f"Get video_id:{video_id} info result: {result} ...")
+                utils.logger.info(f"[KuaishouCrawler.get_video_info_task] Get video_id:{video_id} info result: {result} ...")
                 return result.get("visionVideoDetail")
             except DataFetchError as ex:
-                utils.logger.error(f"Get video detail error: {ex}")
+                utils.logger.error(f"[KuaishouCrawler.get_video_info_task] Get video detail error: {ex}")
                 return None
             except KeyError as ex:
-                utils.logger.error(f"have not fund note detail video_id:{video_id}, err: {ex}")
+                utils.logger.error(f"[KuaishouCrawler.get_video_info_task] have not fund note detail video_id:{video_id}, err: {ex}")
                 return None
 
     async def batch_get_video_comments(self, video_id_list: List[str]):
@@ -144,7 +144,7 @@ class KuaishouCrawler(AbstractCrawler):
         :param video_id_list:
         :return:
         """
-        utils.logger.info(f"[batch_get_video_comments] video ids:{video_id_list}")
+        utils.logger.info(f"[KuaishouCrawler.batch_get_video_comments] video ids:{video_id_list}")
         semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
         task_list: List[Task] = []
         for video_id in video_id_list:
@@ -163,16 +163,16 @@ class KuaishouCrawler(AbstractCrawler):
         """
         async with semaphore:
             try:
-                utils.logger.info(f"[get_comments] bengin get video_id: {video_id} comments ...")
+                utils.logger.info(f"[KuaishouCrawler.get_comments] begin get video_id: {video_id} comments ...")
                 await self.ks_client.get_video_all_comments(
                     photo_id=video_id,
                     crawl_interval=random.random(),
                     callback=kuaishou.batch_update_ks_video_comments
                 )
             except DataFetchError as ex:
-                utils.logger.error(f"[get_comments] get video_id: {video_id} comment error: {ex}")
+                utils.logger.error(f"[KuaishouCrawler.get_comments] get video_id: {video_id} comment error: {ex}")
             except Exception as e:
-                utils.logger.error(f"[get_comments] may be been blocked, err:{e}")
+                utils.logger.error(f"[KuaishouCrawler.get_comments] may be been blocked, err:{e}")
                 # use time.sleeep block main coroutine instead of asyncio.sleep and cacel running comment task
                 # maybe kuaishou block our request, we will take a nap and update the cookie again
                 current_running_tasks = comment_tasks_var.get()
@@ -197,7 +197,7 @@ class KuaishouCrawler(AbstractCrawler):
 
     async def create_ks_client(self, httpx_proxy: Optional[str]) -> KuaiShouClient:
         """Create xhs client"""
-        utils.logger.info("Begin create kuaishou API client ...")
+        utils.logger.info("[KuaishouCrawler.create_ks_client] Begin create kuaishou API client ...")
         cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
         xhs_client_obj = KuaiShouClient(
             proxies=httpx_proxy,
@@ -221,7 +221,7 @@ class KuaishouCrawler(AbstractCrawler):
             headless: bool = True
     ) -> BrowserContext:
         """Launch browser and create browser context"""
-        utils.logger.info("Begin create browser context ...")
+        utils.logger.info("[KuaishouCrawler.launch_browser] Begin create browser context ...")
         if config.SAVE_LOGIN_STATE:
             user_data_dir = os.path.join(os.getcwd(), "browser_data",
                                          config.USER_DATA_DIR % self.platform)  # type: ignore
@@ -245,4 +245,4 @@ class KuaishouCrawler(AbstractCrawler):
     async def close(self):
         """Close browser context"""
         await self.browser_context.close()
-        utils.logger.info("Browser context closed ...")
+        utils.logger.info("[KuaishouCrawler.close] Browser context closed ...")
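Note: the `SAVE_LOGIN_STATE` branch in `launch_browser` above builds a per-platform `browser_data` directory; with Playwright that means a persistent context, so cookies survive restarts and the QR-code login is only needed once. A hedged sketch of the idea; the directory naming and site URL are assumptions, not code from the repository:

```python
import asyncio
import os

from playwright.async_api import async_playwright

async def demo(platform: str = "kuaishou") -> None:
    user_data_dir = os.path.join(os.getcwd(), "browser_data", f"{platform}_user_data_dir")
    async with async_playwright() as p:
        # a persistent context keeps cookies/localStorage under user_data_dir,
        # so a scanned QR login survives the next run
        context = await p.chromium.launch_persistent_context(
            user_data_dir, headless=True)
        page = await context.new_page()
        await page.goto("https://www.kuaishou.com")
        await context.close()

# asyncio.run(demo())
```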
@@ -29,7 +29,7 @@ class KuaishouLogin(AbstractLogin):
 
     async def begin(self):
         """Start login xiaohongshu"""
-        utils.logger.info("Begin login kuaishou ...")
+        utils.logger.info("[KuaishouLogin.begin] Begin login kuaishou ...")
         if self.login_type == "qrcode":
             await self.login_by_qrcode()
         elif self.login_type == "phone":
@@ -37,7 +37,7 @@ class KuaishouLogin(AbstractLogin):
         elif self.login_type == "cookie":
             await self.login_by_cookies()
         else:
-            raise ValueError("Invalid Login Type Currently only supported qrcode or phone or cookie ...")
+            raise ValueError("[KuaishouLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")
 
     @retry(stop=stop_after_attempt(20), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
     async def check_login_state(self) -> bool:
@@ -55,7 +55,7 @@ class KuaishouLogin(AbstractLogin):
 
     async def login_by_qrcode(self):
         """login kuaishou website and keep webdriver login state"""
-        utils.logger.info("Begin login kuaishou by qrcode ...")
+        utils.logger.info("[KuaishouLogin.login_by_qrcode] Begin login kuaishou by qrcode ...")
 
         # click login button
         login_button_ele = self.context_page.locator(
@@ -70,7 +70,7 @@ class KuaishouLogin(AbstractLogin):
             selector=qrcode_img_selector
         )
         if not base64_qrcode_img:
-            utils.logger.info("login failed , have not found qrcode please check ....")
+            utils.logger.info("[KuaishouLogin.login_by_qrcode] login failed , have not found qrcode please check ....")
             sys.exit()
 
 
@@ -78,22 +78,22 @@ class KuaishouLogin(AbstractLogin):
         partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
         asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)
 
-        utils.logger.info(f"waiting for scan code login, remaining time is 20s")
+        utils.logger.info(f"[KuaishouLogin.login_by_qrcode] waiting for scan code login, remaining time is 20s")
         try:
             await self.check_login_state()
         except RetryError:
-            utils.logger.info("Login kuaishou failed by qrcode login method ...")
+            utils.logger.info("[KuaishouLogin.login_by_qrcode] Login kuaishou failed by qrcode login method ...")
             sys.exit()
 
         wait_redirect_seconds = 5
-        utils.logger.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
+        utils.logger.info(f"[KuaishouLogin.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
         await asyncio.sleep(wait_redirect_seconds)
 
     async def login_by_mobile(self):
         pass
 
     async def login_by_cookies(self):
-        utils.logger.info("Begin login kuaishou by cookie ...")
+        utils.logger.info("[KuaishouLogin.login_by_cookies] Begin login kuaishou by cookie ...")
         for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
             await self.browser_context.add_cookies([{
                 'name': key,
@@ -83,14 +83,14 @@ class XHSClient:
 
     async def pong(self) -> bool:
         """get a note to check if login state is ok"""
-        utils.logger.info("Begin to pong xhs...")
+        utils.logger.info("[XHSClient.pong] Begin to pong xhs...")
         ping_flag = False
         try:
             note_card: Dict = await self.get_note_by_keyword(keyword="小红书")
             if note_card.get("items"):
                 ping_flag = True
         except Exception as e:
-            utils.logger.error(f"Ping xhs failed: {e}, and try to login again...")
+            utils.logger.error(f"[XHSClient.pong] Ping xhs failed: {e}, and try to login again...")
             ping_flag = False
         return ping_flag
@@ -136,7 +136,7 @@ class XHSClient:
         if res and res.get("items"):
             res_dict: Dict = res["items"][0]["note_card"]
             return res_dict
-        utils.logger.error(f"[xhs.client.get_note_by_id] get note empty and res:{res}")
+        utils.logger.error(f"[XHSClient.get_note_by_id] get note empty and res:{res}")
         return dict()
 
     async def get_note_comments(self, note_id: str, cursor: str = "") -> Dict:
@@ -195,7 +195,7 @@ class XHSClient:
                 # Handle the absence of 'comments' key appropriately
                 # For example, log an error message, break from the loop, etc.
                 # This is just an example:
-                print(f"No 'comments' key found in response: {comments_res}")
+                utils.logger.info(f"[XHSClient.get_note_all_comments] No 'comments' key found in response: {comments_res}")
                 break
             comments = comments_res["comments"]
             if not is_fetch_sub_comments:
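Note: the `get_note_all_comments` hunk above guards against a missing `comments` key while paging. The surrounding loop (not fully shown in this diff) is ordinary cursor pagination; a generic sketch under that assumption, with the page fetcher left abstract:

```python
import asyncio
from typing import Awaitable, Callable, Dict, List

async def get_all_comments(fetch_page: Callable[[str], Awaitable[Dict]],
                           crawl_interval: float = 0.5) -> List[Dict]:
    """Follow the cursor until the API reports no more pages."""
    result: List[Dict] = []
    cursor = ""
    has_more = True
    while has_more:
        page = await fetch_page(cursor)
        if "comments" not in page:
            # mirror the guard added in the diff: stop instead of raising KeyError
            break
        result.extend(page["comments"])
        has_more = bool(page.get("has_more", False))
        cursor = page.get("cursor", "")
        await asyncio.sleep(crawl_interval)  # be polite between pages
    return result
```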
@@ -87,14 +87,14 @@ class XiaoHongShuCrawler(AbstractCrawler):
         else:
             pass
 
-        utils.logger.info("Xhs Crawler finished ...")
+        utils.logger.info("[XiaoHongShuCrawler.start] Xhs Crawler finished ...")
 
     async def search(self) -> None:
         """Search for notes and retrieve their comment information."""
-        utils.logger.info("Begin search xiaohongshu keywords")
+        utils.logger.info("[XiaoHongShuCrawler.search] Begin search xiaohongshu keywords")
         xhs_limit_count = 20  # xhs limit page fixed value
         for keyword in config.KEYWORDS.split(","):
-            utils.logger.info(f"Current search keyword: {keyword}")
+            utils.logger.info(f"[XiaoHongShuCrawler.search] Current search keyword: {keyword}")
             page = 1
             while page * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
                 note_id_list: List[str] = []
@@ -102,7 +102,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
                     keyword=keyword,
                     page=page,
                 )
-                utils.logger.info(f"Search notes res:{notes_res}")
+                utils.logger.info(f"[XiaoHongShuCrawler.search] Search notes res:{notes_res}")
                 semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
                 task_list = [
                     self.get_note_detail(post_item.get("id"), semaphore)
@@ -115,7 +115,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
                     await xhs_model.update_xhs_note(note_detail)
                     note_id_list.append(note_detail.get("note_id"))
                 page += 1
-                utils.logger.info(f"Note details: {note_details}")
+                utils.logger.info(f"[XiaoHongShuCrawler.search] Note details: {note_details}")
                 await self.batch_get_note_comments(note_id_list)
 
     async def get_specified_notes(self):
@@ -136,15 +136,15 @@ class XiaoHongShuCrawler(AbstractCrawler):
         try:
             return await self.xhs_client.get_note_by_id(note_id)
         except DataFetchError as ex:
-            utils.logger.error(f"Get note detail error: {ex}")
+            utils.logger.error(f"[XiaoHongShuCrawler.get_note_detail] Get note detail error: {ex}")
             return None
         except KeyError as ex:
-            utils.logger.error(f"have not fund note detail note_id:{note_id}, err: {ex}")
+            utils.logger.error(f"[XiaoHongShuCrawler.get_note_detail] have not fund note detail note_id:{note_id}, err: {ex}")
             return None
 
     async def batch_get_note_comments(self, note_list: List[str]):
         """Batch get note comments"""
-        utils.logger.info(f"Begin batch get note comments, note list: {note_list}")
+        utils.logger.info(f"[XiaoHongShuCrawler.batch_get_note_comments] Begin batch get note comments, note list: {note_list}")
         semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
         task_list: List[Task] = []
         for note_id in note_list:
@@ -155,7 +155,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
     async def get_comments(self, note_id: str, semaphore: asyncio.Semaphore):
         """Get note comments with keyword filtering and quantity limitation"""
         async with semaphore:
-            utils.logger.info(f"Begin get note id comments {note_id}")
+            utils.logger.info(f"[XiaoHongShuCrawler.get_comments] Begin get note id comments {note_id}")
             all_comments = await self.xhs_client.get_note_all_comments(note_id=note_id, crawl_interval=random.random())
 
             # read keywords and the quantity limit from the config file
@@ -191,7 +191,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
 
     async def create_xhs_client(self, httpx_proxy: Optional[str]) -> XHSClient:
         """Create xhs client"""
-        utils.logger.info("Begin create xiaohongshu API client ...")
+        utils.logger.info("[XiaoHongShuCrawler.create_xhs_client] Begin create xiaohongshu API client ...")
         cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
         xhs_client_obj = XHSClient(
             proxies=httpx_proxy,
@@ -215,7 +215,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
             headless: bool = True
     ) -> BrowserContext:
         """Launch browser and create browser context"""
-        utils.logger.info("Begin create browser context ...")
+        utils.logger.info("[XiaoHongShuCrawler.launch_browser] Begin create browser context ...")
         if config.SAVE_LOGIN_STATE:
             # feat issue #14
             # we will save login state to avoid login every time
@@ -241,4 +241,4 @@ class XiaoHongShuCrawler(AbstractCrawler):
     async def close(self):
         """Close browser context"""
         await self.browser_context.close()
-        utils.logger.info("Browser context closed ...")
+        utils.logger.info("[XiaoHongShuCrawler.close] Browser context closed ...")
@@ -37,7 +37,7 @@ class XHSLogin(AbstractLogin):
         """
 
         if "请通过验证" in await self.context_page.content():
-            utils.logger.info("登录过程中出现验证码,请手动验证")
+            utils.logger.info("[XHSLogin.check_login_state] 登录过程中出现验证码,请手动验证")
 
         current_cookie = await self.browser_context.cookies()
         _, cookie_dict = utils.convert_cookies(current_cookie)
@@ -48,7 +48,7 @@ class XHSLogin(AbstractLogin):
 
     async def begin(self):
         """Start login xiaohongshu"""
-        utils.logger.info("Begin login xiaohongshu ...")
+        utils.logger.info("[XHSLogin.begin] Begin login xiaohongshu ...")
         if self.login_type == "qrcode":
             await self.login_by_qrcode()
         elif self.login_type == "phone":
@@ -56,11 +56,11 @@ class XHSLogin(AbstractLogin):
         elif self.login_type == "cookie":
             await self.login_by_cookies()
         else:
-            raise ValueError("Invalid Login Type Currently only supported qrcode or phone or cookies ...")
+            raise ValueError("[XHSLogin.begin]I nvalid Login Type Currently only supported qrcode or phone or cookies ...")
 
     async def login_by_mobile(self):
         """Login xiaohongshu by mobile"""
-        utils.logger.info("Begin login xiaohongshu by mobile ...")
+        utils.logger.info("[XHSLogin.login_by_mobile] Begin login xiaohongshu by mobile ...")
         await asyncio.sleep(1)
         try:
             # the login dialog may not pop up automatically on the xiaohongshu homepage; click the login button manually
@@ -77,7 +77,7 @@ class XHSLogin(AbstractLogin):
             )
             await element.click()
         except Exception as e:
-            utils.logger.info("have not found mobile button icon and keep going ...")
+            utils.logger.info("[XHSLogin.login_by_mobile] have not found mobile button icon and keep going ...")
 
         await asyncio.sleep(1)
         login_container_ele = await self.context_page.wait_for_selector("div.login-container")
@@ -93,7 +93,7 @@ class XHSLogin(AbstractLogin):
         max_get_sms_code_time = 60 * 2  # wait at most 2 minutes for the sms code
         no_logged_in_session = ""
         while max_get_sms_code_time > 0:
-            utils.logger.info(f"get sms code from redis remaining time {max_get_sms_code_time}s ...")
+            utils.logger.info(f"[XHSLogin.login_by_mobile] get sms code from redis remaining time {max_get_sms_code_time}s ...")
             await asyncio.sleep(1)
             sms_code_key = f"xhs_{self.login_phone}"
             sms_code_value = redis_obj.get(sms_code_key)
@@ -119,16 +119,16 @@ class XHSLogin(AbstractLogin):
         try:
             await self.check_login_state(no_logged_in_session)
         except RetryError:
-            utils.logger.info("Login xiaohongshu failed by mobile login method ...")
+            utils.logger.info("[XHSLogin.login_by_mobile] Login xiaohongshu failed by mobile login method ...")
             sys.exit()
 
         wait_redirect_seconds = 5
-        utils.logger.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
+        utils.logger.info(f"[XHSLogin.login_by_mobile] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
         await asyncio.sleep(wait_redirect_seconds)
 
     async def login_by_qrcode(self):
         """login xiaohongshu website and keep webdriver login state"""
-        utils.logger.info("Begin login xiaohongshu by qrcode ...")
+        utils.logger.info("[XHSLogin.login_by_qrcode] Begin login xiaohongshu by qrcode ...")
         # login_selector = "div.login-container > div.left > div.qrcode > img"
         qrcode_img_selector = "xpath=//img[@class='qrcode-img']"
         # find login qrcode
@@ -137,7 +137,7 @@ class XHSLogin(AbstractLogin):
             selector=qrcode_img_selector
         )
         if not base64_qrcode_img:
-            utils.logger.info("login failed , have not found qrcode please check ....")
+            utils.logger.info("[XHSLogin.login_by_qrcode] login failed , have not found qrcode please check ....")
            # if this website does not automatically popup login dialog box, we will manual click login button
            await asyncio.sleep(0.5)
            login_button_ele = self.context_page.locator("xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button")
@@ -161,20 +161,20 @@ class XHSLogin(AbstractLogin):
         partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
         asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)
 
-        utils.logger.info(f"waiting for scan code login, remaining time is 120s")
+        utils.logger.info(f"[XHSLogin.login_by_qrcode] waiting for scan code login, remaining time is 120s")
         try:
             await self.check_login_state(no_logged_in_session)
         except RetryError:
-            utils.logger.info("Login xiaohongshu failed by qrcode login method ...")
+            utils.logger.info("[XHSLogin.login_by_qrcode] Login xiaohongshu failed by qrcode login method ...")
             sys.exit()
 
         wait_redirect_seconds = 5
-        utils.logger.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
+        utils.logger.info(f"[XHSLogin.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
         await asyncio.sleep(wait_redirect_seconds)
 
     async def login_by_cookies(self):
         """login xiaohongshu website by cookies"""
-        utils.logger.info("Begin login xiaohongshu by cookie ...")
+        utils.logger.info("[XHSLogin.login_by_cookies] Begin login xiaohongshu by cookie ...")
         for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
             if key != "web_session":  # only set web_session cookie attr
                 continue
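Note: every `login_by_cookies` in this commit shares one mechanism: split a `key=value; ...` cookie string and feed it to Playwright's `BrowserContext.add_cookies` (the XHS variant keeps only `web_session`). A stand-alone sketch of that conversion; the parsing helper is a simplified stand-in for `utils.convert_str_cookie_to_dict`, and the domain/path fields are assumptions since the diff elides them:

```python
from typing import Dict

def convert_str_cookie_to_dict(cookie_str: str) -> Dict[str, str]:
    """'a=1; b=2' -> {'a': '1', 'b': '2'} (simplified stand-in)."""
    pairs = (chunk.split("=", 1) for chunk in cookie_str.split("; ") if "=" in chunk)
    return {k: v for k, v in pairs}

async def login_by_cookies(browser_context, cookie_str: str) -> None:
    # browser_context is a playwright.async_api.BrowserContext
    for key, value in convert_str_cookie_to_dict(cookie_str).items():
        await browser_context.add_cookies([{
            "name": key,
            "value": value,
            "domain": ".xiaohongshu.com",  # assumption: not shown in the diff
            "path": "/",
        }])
```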
@@ -85,7 +85,7 @@ async def update_bilibili_video(video_item: Dict):
         "video_url": f"https://www.bilibili.com/video/av{video_id}",
         "video_cover_url": video_item_view.get("pic", ""),
     }
-    utils.logger.info(f"bilibili video id:{video_id}, title:{local_db_item.get('title')}")
+    utils.logger.info(f"[models.bilibili.update_bilibili_video] bilibili video id:{video_id}, title:{local_db_item.get('title')}")
     if config.IS_SAVED_DATABASED:
         if not await BilibiliVideo.filter(video_id=video_id).exists():
             local_db_item["add_ts"] = utils.get_current_timestamp()
@@ -131,7 +131,7 @@ async def update_bilibili_video_comment(video_id: str, comment_item: Dict):
         "sub_comment_count": str(comment_item.get("rcount", 0)),
         "last_modify_ts": utils.get_current_timestamp(),
     }
-    utils.logger.info(f"Bilibili video comment: {comment_id}, content: {local_db_item.get('content')}")
+    utils.logger.info(f"[models.bilibili.update_bilibili_video_comment] Bilibili video comment: {comment_id}, content: {local_db_item.get('content')}")
     if config.IS_SAVED_DATABASED:
         if not await BilibiliComment.filter(comment_id=comment_id).exists():
             local_db_item["add_ts"] = utils.get_current_timestamp()
@@ -88,7 +88,7 @@ async def update_douyin_aweme(aweme_item: Dict):
         "last_modify_ts": utils.get_current_timestamp(),
         "aweme_url": f"https://www.douyin.com/video/{aweme_id}"
     }
-    print(f"douyin aweme id:{aweme_id}, title:{local_db_item.get('title')}")
+    utils.logger.info(f"[models.douyin.update_douyin_aweme] douyin aweme id:{aweme_id}, title:{local_db_item.get('title')}")
     if config.IS_SAVED_DATABASED:
         if not await DouyinAweme.filter(aweme_id=aweme_id).exists():
             local_db_item["add_ts"] = utils.get_current_timestamp()
@@ -123,7 +123,7 @@ async def batch_update_dy_aweme_comments(aweme_id: str, comments: List[Dict]):
 async def update_dy_aweme_comment(aweme_id: str, comment_item: Dict):
     comment_aweme_id = comment_item.get("aweme_id")
     if aweme_id != comment_aweme_id:
-        print(f"comment_aweme_id: {comment_aweme_id} != aweme_id: {aweme_id}")
+        utils.logger.error(f"[models.douyin.update_dy_aweme_comment] comment_aweme_id: {comment_aweme_id} != aweme_id: {aweme_id}")
         return
     user_info = comment_item.get("user", {})
     comment_id = comment_item.get("cid")
@@ -145,7 +145,7 @@ async def update_dy_aweme_comment(aweme_id: str, comment_item: Dict):
         "sub_comment_count": str(comment_item.get("reply_comment_total", 0)),
         "last_modify_ts": utils.get_current_timestamp(),
     }
-    print(f"douyin aweme comment: {comment_id}, content: {local_db_item.get('content')}")
+    utils.logger.info(f"[models.douyin.update_dy_aweme_comment] douyin aweme comment: {comment_id}, content: {local_db_item.get('content')}")
     if config.IS_SAVED_DATABASED:
         if not await DouyinAwemeComment.filter(comment_id=comment_id).exists():
             local_db_item["add_ts"] = utils.get_current_timestamp()
@@ -80,7 +80,7 @@ async def update_kuaishou_video(video_item: Dict):
         "video_cover_url": photo_info.get("coverUrl", ""),
         "video_play_url": photo_info.get("photoUrl", ""),
     }
-    print(f"Kuaishou video id:{video_id}, title:{local_db_item.get('title')}")
+    utils.logger.info(f"[models.kuaishou.update_kuaishou_video] Kuaishou video id:{video_id}, title:{local_db_item.get('title')}")
     if config.IS_SAVED_DATABASED:
         if not await KuaishouVideo.filter(video_id=video_id).exists():
             local_db_item["add_ts"] = utils.get_current_timestamp()
@@ -106,7 +106,7 @@ async def update_kuaishou_video(video_item: Dict):
 
 
 async def batch_update_ks_video_comments(video_id: str, comments: List[Dict]):
-    utils.logger.info(f"[batch_update_ks_video_comments] video_id:{video_id}, comments:{comments}")
+    utils.logger.info(f"[KuaishouVideoComment.batch_update_ks_video_comments] video_id:{video_id}, comments:{comments}")
     if not comments:
         return
     for comment_item in comments:
@@ -126,7 +126,7 @@ async def update_ks_video_comment(video_id: str, comment_item: Dict):
         "sub_comment_count": str(comment_item.get("subCommentCount", 0)),
         "last_modify_ts": utils.get_current_timestamp(),
     }
-    print(f"Kuaishou video comment: {comment_id}, content: {local_db_item.get('content')}")
+    utils.logger.info(f"[models.kuaishou.update_ks_video_comment] Kuaishou video comment: {comment_id}, content: {local_db_item.get('content')}")
     if config.IS_SAVED_DATABASED:
         if not await KuaishouVideoComment.filter(comment_id=comment_id).exists():
             local_db_item["add_ts"] = utils.get_current_timestamp()
@@ -86,7 +86,7 @@ async def update_xhs_note(note_item: Dict):
         "last_modify_ts": utils.get_current_timestamp(),
         "note_url": f"https://www.xiaohongshu.com/explore/{note_id}"
     }
-    print("xhs note:", local_db_item)
+    utils.logger.info(f"[models.xiaohongshu.update_xhs_note] xhs note: {local_db_item}")
     if config.IS_SAVED_DATABASED:
         if not await XHSNote.filter(note_id=note_id).first():
             local_db_item["add_ts"] = utils.get_current_timestamp()
@@ -125,7 +125,7 @@ async def update_xhs_note_comment(note_id: str, comment_item: Dict):
         "sub_comment_count": comment_item.get("sub_comment_count"),
         "last_modify_ts": utils.get_current_timestamp(),
     }
-    print("xhs note comment:", local_db_item)
+    utils.logger.info(f"[models.xiaohongshu.update_xhs_note_comment] xhs note comment:{local_db_item}")
     if config.IS_SAVED_DATABASED:
         if not await XHSNoteComment.filter(comment_id=comment_id).first():
             local_db_item["add_ts"] = utils.get_current_timestamp()
@@ -115,7 +115,7 @@ class JiSuHttpProxy(ProxyProvider):
         ip_infos = []
         async with httpx.AsyncClient() as client:
             url = self.api_path + "/fetchips" + '?' + urlencode(self.params)
-            utils.logger.info(f"[JiSuHttpProxy] get ip proxy url:{url}")
+            utils.logger.info(f"[JiSuHttpProxy.get_proxies] get ip proxy url:{url}")
             response = await client.get(url, headers={
                 "User-Agent": "MediaCrawler https://github.com/NanmiCoder/MediaCrawler"})
             res_dict: Dict = response.json()