refactor: standardize log output

feat: crawl specified Bilibili videos by ID (bvid)
Relakkes 2023-12-23 01:04:08 +08:00
parent 273c9a316b
commit aba9f14f50
18 changed files with 147 additions and 133 deletions
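
The refactor half of this commit applies one convention everywhere: each log message is prefixed with its origin, "[ClassName.method]" for methods or "[module.function]" for module-level helpers, so a log line can be grepped back to its call site. A minimal sketch of the pattern, with stdlib logging standing in for the project's utils.logger (an assumption for self-containment):

```python
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("media_crawler")  # stand-in for utils.logger (assumption)

# The convention: "[ClassName.method]" prefix, then the message.
logger.info("[BilibiliClient.pong] Begin pong bilibili...")
logger.error("[BilibiliCrawler.get_comments] get video_id: %s comment error: %s",
             "BV1d54y1g7db", "DataFetchError")
```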

View File

@ -34,7 +34,7 @@ MAX_COMMENTS_PER_POST = 10
# Comment keyword filter (only comments containing a keyword are kept; empty means no filtering)
COMMENT_KEYWORDS = [
""
# "真棒"
# ........................
]
@ -56,9 +56,10 @@ DY_SPECIFIED_ID_LIST = [
# List of Kuaishou video IDs to crawl
KS_SPECIFIED_ID_LIST = []
# List of Bilibili video IDs to crawl
# List of Bilibili video bvids to crawl
BILI_SPECIFIED_ID_LIST = [
"416252543",
"976148468"
"BV1d54y1g7db",
"BV1Sz4y1U77N",
"BV14Q4y1n7jz",
# ........................
]
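
The switch from bare avids ("416252543") to bvids ("BV1d54y1g7db") is what the new client signature below consumes. As a hedged sanity check, not part of this commit, entries could be validated against the public BV format (assumed here to be "BV" plus 10 alphanumerics):

```python
import re

# Assumed shape of a bvid; avids, by contrast, are bare integers.
BVID_PATTERN = re.compile(r"^BV[0-9A-Za-z]{10}$")

BILI_SPECIFIED_ID_LIST = ["BV1d54y1g7db", "BV1Sz4y1U77N", "BV14Q4y1n7jz"]
for vid in BILI_SPECIFIED_ID_LIST:
    assert BVID_PATTERN.match(vid), f"not a bvid: {vid}"
```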

db.py
View File

@ -16,7 +16,7 @@ async def init_db(create_db: bool = False) -> None:
async def init():
await init_db(create_db=True)
await Tortoise.generate_schemas()
utils.logger.info("Init DB Success!")
utils.logger.info("[db.init] Init DB Success!")
if __name__ == '__main__':

View File

@ -4,7 +4,7 @@
# @Desc : bilibili request client
import asyncio
import json
from typing import Any, Callable, Dict, List, Optional, Tuple
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from urllib.parse import urlencode
import httpx
@ -94,16 +94,16 @@ class BilibiliClient:
async def pong(self) -> bool:
"""get a note to check if login state is ok"""
utils.logger.info("Begin pong bilibili...")
utils.logger.info("[BilibiliClient.pong] Begin pong bilibili...")
ping_flag = False
try:
check_login_uri = "/x/web-interface/nav"
response = await self.get(check_login_uri)
if response.get("isLogin"):
utils.logger.info("use cache login state get web interface successfull!")
utils.logger.info("[BilibiliClient.pong] Use cache login state get web interface successfull!")
ping_flag = True
except Exception as e:
utils.logger.error(f"Pong bilibili failed: {e}, and try to login again...")
utils.logger.error(f"[BilibiliClient.pong] Pong bilibili failed: {e}, and try to login again...")
ping_flag = False
return ping_flag
@ -132,16 +132,22 @@ class BilibiliClient:
}
return await self.get(uri, post_data)
async def get_video_info(self, video_id: str) -> Dict:
async def get_video_info(self, aid: Union[int, None] = None, bvid: Union[str, None] = None) -> Dict:
"""
Bilibli web video detail api
:param video_id:
Bilibili web video detail api; pass either aid or bvid
:param aid: video avid
:param bvid: video bvid
:return:
"""
if not aid and not bvid:
raise ValueError("请提供 aid 或 bvid 中的至少一个参数")
uri = "/x/web-interface/view/detail"
params = {
"aid": video_id
}
params = dict()
if aid:
params.update({"aid": aid})
else:
params.update({"bvid": bvid})
return await self.get(uri, params, enable_params_sign=False)
async def get_video_comments(self,
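
A hedged usage sketch of the updated method, assuming an already-constructed client (the "View" key and the sample ids come from elsewhere in this diff):

```python
async def fetch_detail(client: "BilibiliClient") -> None:
    # Either identifier works; aid takes precedence when both are given.
    by_bvid = await client.get_video_info(bvid="BV1d54y1g7db")
    by_aid = await client.get_video_info(aid=416252543)
    # the detail payload nests the video metadata under "View"
    print(by_bvid.get("View", {}).get("aid"), by_aid.get("View", {}).get("bvid"))
```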

View File

@ -8,7 +8,7 @@ import os
import random
import time
from asyncio import Task
from typing import Dict, List, Optional, Tuple
from typing import Dict, List, Optional, Tuple, Union
from playwright.async_api import (BrowserContext, BrowserType, Page,
async_playwright)
@ -94,10 +94,10 @@ class BilibiliCrawler(AbstractCrawler):
search bilibili video with keywords
:return:
"""
utils.logger.info("Begin search bilibli keywords")
utils.logger.info("[BilibiliCrawler.search] Begin search bilibli keywords")
bili_limit_count = 20 # bilibili limit page fixed value
for keyword in config.KEYWORDS.split(","):
utils.logger.info(f"Current search keyword: {keyword}")
utils.logger.info(f"[BilibiliCrawler.search] Current search keyword: {keyword}")
page = 1
while page * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
video_id_list: List[str] = []
@ -111,7 +111,7 @@ class BilibiliCrawler(AbstractCrawler):
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list = [
self.get_video_info_task(video_item.get("aid"), semaphore)
self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore)
for video_item in video_list
]
video_items = await asyncio.gather(*task_list)
@ -129,7 +129,7 @@ class BilibiliCrawler(AbstractCrawler):
:param video_id_list:
:return:
"""
utils.logger.info(f"[batch_get_video_comments] video ids:{video_id_list}")
utils.logger.info(f"[BilibiliCrawler.batch_get_video_comments] video ids:{video_id_list}")
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list: List[Task] = []
for video_id in video_id_list:
@ -146,7 +146,7 @@ class BilibiliCrawler(AbstractCrawler):
"""
async with semaphore:
try:
utils.logger.info(f"[get_comments] begin get video_id: {video_id} comments ...")
utils.logger.info(f"[BilibiliCrawler.get_comments] begin get video_id: {video_id} comments ...")
# Read keyword and quantity from config
keywords = config.COMMENT_KEYWORDS
max_comments = config.MAX_COMMENTS_PER_POST
@ -174,9 +174,9 @@ class BilibiliCrawler(AbstractCrawler):
await bilibili.batch_update_bilibili_video_comments(video_id, filtered_comments)
except DataFetchError as ex:
utils.logger.error(f"[get_comments] get video_id: {video_id} comment error: {ex}")
utils.logger.error(f"[BilibiliCrawler.get_comments] get video_id: {video_id} comment error: {ex}")
except Exception as e:
utils.logger.error(f"[get_comments] may be been blocked, err:{e}")
utils.logger.error(f"[BilibiliCrawler.get_comments] may be been blocked, err:{e}")
async def get_specified_videos(self):
"""
@ -185,35 +185,42 @@ class BilibiliCrawler(AbstractCrawler):
"""
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list = [
self.get_video_info_task(video_id=video_id, semaphore=semaphore) for video_id in config.BILI_SPECIFIED_ID_LIST
self.get_video_info_task(aid=0, bvid=video_id, semaphore=semaphore) for video_id in
config.BILI_SPECIFIED_ID_LIST
]
video_details = await asyncio.gather(*task_list)
video_aids_list = []
for video_detail in video_details:
if video_detail is not None:
video_item_view: Dict = video_detail.get("View")
video_aid: str = video_item_view.get("aid")
if video_aid:
video_aids_list.append(video_aid)
await bilibili.update_bilibili_video(video_detail)
await self.batch_get_video_comments(config.BILI_SPECIFIED_ID_LIST)
await self.batch_get_video_comments(video_aids_list)
async def get_video_info_task(self, video_id: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
async def get_video_info_task(self, aid: int, bvid: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
"""
Get video detail task
:param video_id:
:param aid:
:param bvid:
:param semaphore:
:return:
"""
async with semaphore:
try:
result = await self.bili_client.get_video_info(video_id)
result = await self.bili_client.get_video_info(aid=aid, bvid=bvid)
return result
except DataFetchError as ex:
utils.logger.error(f"Get video detail error: {ex}")
utils.logger.error(f"[BilibiliCrawler.get_video_info_task] Get video detail error: {ex}")
return None
except KeyError as ex:
utils.logger.error(f"have not fund note detail video_id:{video_id}, err: {ex}")
utils.logger.error(f"[BilibiliCrawler.get_video_info_task] have not fund note detail video_id:{bvid}, err: {ex}")
return None
async def create_bilibili_client(self, httpx_proxy: Optional[str]) -> BilibiliClient:
"""Create xhs client"""
utils.logger.info("Begin create xiaohongshu API client ...")
utils.logger.info("[BilibiliCrawler.create_bilibili_client] Begin create xiaohongshu API client ...")
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
bilibili_client_obj = BilibiliClient(
proxies=httpx_proxy,
@ -250,7 +257,7 @@ class BilibiliCrawler(AbstractCrawler):
headless: bool = True
) -> BrowserContext:
"""Launch browser and create browser context"""
utils.logger.info("Begin create browser context ...")
utils.logger.info("[BilibiliCrawler.launch_browser] Begin create browser context ...")
if config.SAVE_LOGIN_STATE:
# feat issue #14
# we will save login state to avoid login every time
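
The crux of the bvid feature sits in get_specified_videos above: details are fetched by bvid, but the comment API is keyed by aid, so each detail's "View" payload is mined for its aid before comments are requested. A condensed sketch of that pipeline (names from this diff; `config` is assumed importable as elsewhere in this repo; error handling elided):

```python
import asyncio

import config  # the project's config module (assumption: top-level importable)

async def crawl_specified(crawler: "BilibiliCrawler") -> None:
    semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
    details = await asyncio.gather(*[
        crawler.get_video_info_task(aid=0, bvid=bvid, semaphore=semaphore)
        for bvid in config.BILI_SPECIFIED_ID_LIST
    ])
    # resolve bvid -> aid from each "View" payload, then fetch comments by aid
    aids = [d["View"]["aid"] for d in details if d]
    await crawler.batch_get_video_comments(aids)
```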

View File

@ -34,7 +34,7 @@ class BilibiliLogin(AbstractLogin):
async def begin(self):
"""Start login xiaohongshu"""
utils.logger.info("Begin login Bilibili ...")
utils.logger.info("[BilibiliLogin.begin] Begin login Bilibili ...")
if self.login_type == "qrcode":
await self.login_by_qrcode()
elif self.login_type == "phone":
@ -42,7 +42,7 @@ class BilibiliLogin(AbstractLogin):
elif self.login_type == "cookie":
await self.login_by_cookies()
else:
raise ValueError("Invalid Login Type Currently only supported qrcode or phone or cookie ...")
raise ValueError("[BilibiliLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")
@retry(stop=stop_after_attempt(20), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
async def check_login_state(self) -> bool:
@ -59,7 +59,7 @@ class BilibiliLogin(AbstractLogin):
async def login_by_qrcode(self):
"""login bilibili website and keep webdriver login state"""
utils.logger.info("Begin login bilibili by qrcode ...")
utils.logger.info("[BilibiliLogin.login_by_qrcode] Begin login bilibili by qrcode ...")
# click login button
login_button_ele = self.context_page.locator(
@ -74,29 +74,29 @@ class BilibiliLogin(AbstractLogin):
selector=qrcode_img_selector
)
if not base64_qrcode_img:
utils.logger.info("login failed , have not found qrcode please check ....")
utils.logger.info("[BilibiliLogin.login_by_qrcode] login failed , have not found qrcode please check ....")
sys.exit()
# show login qrcode
partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)
utils.logger.info(f"Waiting for scan code login, remaining time is 20s")
utils.logger.info(f"[BilibiliLogin.login_by_qrcode] Waiting for scan code login, remaining time is 20s")
try:
await self.check_login_state()
except RetryError:
utils.logger.info("Login bilibili failed by qrcode login method ...")
utils.logger.info("[BilibiliLogin.login_by_qrcode] Login bilibili failed by qrcode login method ...")
sys.exit()
wait_redirect_seconds = 5
utils.logger.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
utils.logger.info(f"[BilibiliLogin.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
await asyncio.sleep(wait_redirect_seconds)
async def login_by_mobile(self):
pass
async def login_by_cookies(self):
utils.logger.info("Begin login bilibili by cookie ...")
utils.logger.info("[BilibiliLogin.login_by_qrcode] Begin login bilibili by cookie ...")
for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
await self.browser_context.add_cookies([{
'name': key,

View File

@ -75,12 +75,12 @@ class DouYinCrawler(AbstractCrawler):
# Get the information and comments of the specified post
await self.get_specified_awemes()
utils.logger.info("Douyin Crawler finished ...")
utils.logger.info("[DouYinCrawler.start] Douyin Crawler finished ...")
async def search(self) -> None:
utils.logger.info("Begin search douyin keywords")
utils.logger.info("[DouYinCrawler.search] Begin search douyin keywords")
for keyword in config.KEYWORDS.split(","):
utils.logger.info(f"Current keyword: {keyword}")
utils.logger.info(f"[DouYinCrawler.search] Current keyword: {keyword}")
aweme_list: List[str] = []
dy_limit_count = 10
page = 0
@ -89,7 +89,7 @@ class DouYinCrawler(AbstractCrawler):
posts_res = await self.dy_client.search_info_by_keyword(keyword=keyword,
offset=page * dy_limit_count)
except DataFetchError:
utils.logger.error(f"search douyin keyword: {keyword} failed")
utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed")
break
page += 1
for post_item in posts_res.get("data"):
@ -100,7 +100,7 @@ class DouYinCrawler(AbstractCrawler):
continue
aweme_list.append(aweme_info.get("aweme_id", ""))
await douyin.update_douyin_aweme(aweme_item=aweme_info)
utils.logger.info(f"keyword:{keyword}, aweme_list:{aweme_list}")
utils.logger.info(f"[DouYinCrawler.search] keyword:{keyword}, aweme_list:{aweme_list}")
await self.batch_get_note_comments(aweme_list)
async def get_specified_awemes(self):
@ -121,10 +121,10 @@ class DouYinCrawler(AbstractCrawler):
try:
return await self.dy_client.get_video_by_id(aweme_id)
except DataFetchError as ex:
utils.logger.error(f"Get aweme detail error: {ex}")
utils.logger.error(f"[DouYinCrawler.get_aweme_detail] Get aweme detail error: {ex}")
return None
except KeyError as ex:
utils.logger.error(f"have not fund note detail aweme_id:{aweme_id}, err: {ex}")
utils.logger.error(f"[DouYinCrawler.get_aweme_detail] have not fund note detail aweme_id:{aweme_id}, err: {ex}")
return None
async def batch_get_note_comments(self, aweme_list: List[str]) -> None:
@ -147,9 +147,9 @@ class DouYinCrawler(AbstractCrawler):
)
# The returned comments are already filtered by keyword
await douyin.batch_update_dy_aweme_comments(aweme_id, comments)
utils.logger.info(f"aweme_id: {aweme_id} comments have all been obtained and filtered ...")
utils.logger.info(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} comments have all been obtained and filtered ...")
except DataFetchError as e:
utils.logger.error(f"aweme_id: {aweme_id} get comments failed, error: {e}")
utils.logger.error(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} get comments failed, error: {e}")
@staticmethod
def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]:
@ -213,4 +213,4 @@ class DouYinCrawler(AbstractCrawler):
async def close(self) -> None:
"""Close browser context"""
await self.browser_context.close()
utils.logger.info("Browser context closed ...")
utils.logger.info("[DouYinCrawler.close] Browser context closed ...")

View File

@ -47,7 +47,7 @@ class DouYinLogin(AbstractLogin):
elif self.login_type == "cookie":
await self.login_by_cookies()
else:
raise ValueError("Invalid Login Type Currently only supported qrcode or phone or cookie ...")
raise ValueError("[DouYinLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")
# If the page redirects to the slider captcha page, the slider must be moved again
await asyncio.sleep(6)
@ -56,16 +56,16 @@ class DouYinLogin(AbstractLogin):
await self.check_page_display_slider(move_step=3, slider_level="hard")
# check login state
utils.logger.info(f"login finished then check login state ...")
utils.logger.info(f"[DouYinLogin.begin] login finished then check login state ...")
try:
await self.check_login_state()
except RetryError:
utils.logger.info("login failed please confirm ...")
utils.logger.info("[DouYinLogin.begin] login failed please confirm ...")
sys.exit()
# wait for redirect
wait_redirect_seconds = 5
utils.logger.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
utils.logger.info(f"[DouYinLogin.begin] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
await asyncio.sleep(wait_redirect_seconds)
@retry(stop=stop_after_attempt(20), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
@ -84,21 +84,21 @@ class DouYinLogin(AbstractLogin):
# check dialog box is auto popup and wait for 10 seconds
await self.context_page.wait_for_selector(dialog_selector, timeout=1000 * 10)
except Exception as e:
utils.logger.error(f"login dialog box does not pop up automatically, error: {e}")
utils.logger.info("login dialog box does not pop up automatically, we will manually click the login button")
utils.logger.error(f"[DouYinLogin.popup_login_dialog] login dialog box does not pop up automatically, error: {e}")
utils.logger.info("[DouYinLogin.popup_login_dialog] login dialog box does not pop up automatically, we will manually click the login button")
login_button_ele = self.context_page.locator("xpath=//p[text() = '登录']")
await login_button_ele.click()
await asyncio.sleep(0.5)
async def login_by_qrcode(self):
utils.logger.info("Begin login douyin by qrcode...")
utils.logger.info("[DouYinLogin.login_by_qrcode] Begin login douyin by qrcode...")
qrcode_img_selector = "xpath=//article[@class='web-login']//img"
base64_qrcode_img = await utils.find_login_qrcode(
self.context_page,
selector=qrcode_img_selector
)
if not base64_qrcode_img:
utils.logger.info("login qrcode not found please confirm ...")
utils.logger.info("[DouYinLogin.login_by_qrcode] login qrcode not found please confirm ...")
sys.exit()
# show login qrcode
@ -109,7 +109,7 @@ class DouYinLogin(AbstractLogin):
await asyncio.sleep(2)
async def login_by_mobile(self):
utils.logger.info("Begin login douyin by mobile ...")
utils.logger.info("[DouYinLogin.login_by_mobile] Begin login douyin by mobile ...")
mobile_tap_ele = self.context_page.locator("xpath=//li[text() = '验证码登录']")
await mobile_tap_ele.click()
await self.context_page.wait_for_selector("xpath=//article[@class='web-login-mobile-code']")
@ -124,7 +124,7 @@ class DouYinLogin(AbstractLogin):
redis_obj = redis.Redis(host=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD)
max_get_sms_code_time = 60 * 2  # wait at most 2 minutes for the SMS code
while max_get_sms_code_time > 0:
utils.logger.info(f"get douyin sms code from redis remaining time {max_get_sms_code_time}s ...")
utils.logger.info(f"[DouYinLogin.login_by_mobile] get douyin sms code from redis remaining time {max_get_sms_code_time}s ...")
await asyncio.sleep(1)
sms_code_key = f"dy_{self.login_phone}"
sms_code_value = redis_obj.get(sms_code_key)
@ -157,7 +157,7 @@ class DouYinLogin(AbstractLogin):
slider_verify_success = False
while not slider_verify_success:
if max_slider_try_times <= 0:
utils.logger.error("slider verify failed ...")
utils.logger.error("[DouYinLogin.check_page_display_slider] slider verify failed ...")
sys.exit()
try:
await self.move_slider(back_selector, gap_selector, move_step, slider_level)
@ -166,20 +166,20 @@ class DouYinLogin(AbstractLogin):
# If the slide was too slow or verification failed, the page shows a "too slow" hint; click the refresh button here
page_content = await self.context_page.content()
if "操作过慢" in page_content or "提示重新操作" in page_content:
utils.logger.info("slider verify failed, retry ...")
utils.logger.info("[DouYinLogin.check_page_display_slider] slider verify failed, retry ...")
await self.context_page.click(selector="//a[contains(@class, 'secsdk_captcha_refresh')]")
continue
# After a successful slide, wait for the slider to disappear
await self.context_page.wait_for_selector(selector=back_selector, state="hidden", timeout=1000)
# If the slider disappeared, verification succeeded and the loop exits; if not, the line above raises, is caught, and the captcha is retried
utils.logger.info("slider verify success ...")
utils.logger.info("[DouYinLogin.check_page_display_slider] slider verify success ...")
slider_verify_success = True
except Exception as e:
utils.logger.error(f"slider verify failed, error: {e}")
utils.logger.error(f"[DouYinLogin.check_page_display_slider] slider verify failed, error: {e}")
await asyncio.sleep(1)
max_slider_try_times -= 1
utils.logger.info(f"remaining slider try times: {max_slider_try_times}")
utils.logger.info(f"[DouYinLogin.check_page_display_slider] remaining slider try times: {max_slider_try_times}")
continue
async def move_slider(self, back_selector: str, gap_selector: str, move_step: int = 10, slider_level="easy"):
@ -236,7 +236,7 @@ class DouYinLogin(AbstractLogin):
await self.context_page.mouse.up()
async def login_by_cookies(self):
utils.logger.info("Begin login douyin by cookie ...")
utils.logger.info("[DouYinLogin.login_by_cookies] Begin login douyin by cookie ...")
for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
await self.browser_context.add_cookies([{
'name': key,

View File

@ -59,12 +59,12 @@ class KuaiShouClient:
@staticmethod
async def pong() -> bool:
"""get a note to check if login state is ok"""
utils.logger.info("Begin pong kuaishou...")
utils.logger.info("[KuaiShouClient.pong] Begin pong kuaishou...")
ping_flag = False
try:
pass
except Exception as e:
utils.logger.error(f"Pong kuaishou failed: {e}, and try to login again...")
utils.logger.error(f"[KuaiShouClient.pong] Pong kuaishou failed: {e}, and try to login again...")
ping_flag = False
return ping_flag

View File

@ -81,13 +81,13 @@ class KuaishouCrawler(AbstractCrawler):
else:
pass
utils.logger.info("Kuaishou Crawler finished ...")
utils.logger.info("[KuaishouCrawler.start] Kuaishou Crawler finished ...")
async def search(self):
utils.logger.info("Begin search kuaishou keywords")
utils.logger.info("[KuaishouCrawler.search] Begin search kuaishou keywords")
ks_limit_count = 20 # kuaishou limit page fixed value
for keyword in config.KEYWORDS.split(","):
utils.logger.info(f"Current search keyword: {keyword}")
utils.logger.info(f"[KuaishouCrawler.search] Current search keyword: {keyword}")
page = 1
while page * ks_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
video_id_list: List[str] = []
@ -96,12 +96,12 @@ class KuaishouCrawler(AbstractCrawler):
pcursor=str(page),
)
if not videos_res:
utils.logger.error(f"search info by keyword:{keyword} not found data")
utils.logger.error(f"[KuaishouCrawler.search] search info by keyword:{keyword} not found data")
continue
vision_search_photo: Dict = videos_res.get("visionSearchPhoto")
if vision_search_photo.get("result") != 1:
utils.logger.error(f"search info by keyword:{keyword} not found data ")
utils.logger.error(f"[KuaishouCrawler.search] search info by keyword:{keyword} not found data ")
continue
for video_detail in vision_search_photo.get("feeds"):
@ -129,13 +129,13 @@ class KuaishouCrawler(AbstractCrawler):
async with semaphore:
try:
result = await self.ks_client.get_video_info(video_id)
utils.logger.info(f"Get video_id:{video_id} info result: {result} ...")
utils.logger.info(f"[KuaishouCrawler.get_video_info_task] Get video_id:{video_id} info result: {result} ...")
return result.get("visionVideoDetail")
except DataFetchError as ex:
utils.logger.error(f"Get video detail error: {ex}")
utils.logger.error(f"[KuaishouCrawler.get_video_info_task] Get video detail error: {ex}")
return None
except KeyError as ex:
utils.logger.error(f"have not fund note detail video_id:{video_id}, err: {ex}")
utils.logger.error(f"[KuaishouCrawler.get_video_info_task] have not fund note detail video_id:{video_id}, err: {ex}")
return None
async def batch_get_video_comments(self, video_id_list: List[str]):
@ -144,7 +144,7 @@ class KuaishouCrawler(AbstractCrawler):
:param video_id_list:
:return:
"""
utils.logger.info(f"[batch_get_video_comments] video ids:{video_id_list}")
utils.logger.info(f"[KuaishouCrawler.batch_get_video_comments] video ids:{video_id_list}")
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list: List[Task] = []
for video_id in video_id_list:
@ -163,16 +163,16 @@ class KuaishouCrawler(AbstractCrawler):
"""
async with semaphore:
try:
utils.logger.info(f"[get_comments] bengin get video_id: {video_id} comments ...")
utils.logger.info(f"[KuaishouCrawler.get_comments] begin get video_id: {video_id} comments ...")
await self.ks_client.get_video_all_comments(
photo_id=video_id,
crawl_interval=random.random(),
callback=kuaishou.batch_update_ks_video_comments
)
except DataFetchError as ex:
utils.logger.error(f"[get_comments] get video_id: {video_id} comment error: {ex}")
utils.logger.error(f"[KuaishouCrawler.get_comments] get video_id: {video_id} comment error: {ex}")
except Exception as e:
utils.logger.error(f"[get_comments] may be been blocked, err:{e}")
utils.logger.error(f"[KuaishouCrawler.get_comments] may be been blocked, err:{e}")
# use time.sleep to block the main coroutine instead of asyncio.sleep, and cancel running comment tasks
# kuaishou may have blocked our requests, so take a nap and refresh the cookie
current_running_tasks = comment_tasks_var.get()
@ -197,7 +197,7 @@ class KuaishouCrawler(AbstractCrawler):
async def create_ks_client(self, httpx_proxy: Optional[str]) -> KuaiShouClient:
"""Create xhs client"""
utils.logger.info("Begin create kuaishou API client ...")
utils.logger.info("[KuaishouCrawler.create_ks_client] Begin create kuaishou API client ...")
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
xhs_client_obj = KuaiShouClient(
proxies=httpx_proxy,
@ -221,7 +221,7 @@ class KuaishouCrawler(AbstractCrawler):
headless: bool = True
) -> BrowserContext:
"""Launch browser and create browser context"""
utils.logger.info("Begin create browser context ...")
utils.logger.info("[KuaishouCrawler.launch_browser] Begin create browser context ...")
if config.SAVE_LOGIN_STATE:
user_data_dir = os.path.join(os.getcwd(), "browser_data",
config.USER_DATA_DIR % self.platform) # type: ignore
@ -245,4 +245,4 @@ class KuaishouCrawler(AbstractCrawler):
async def close(self):
"""Close browser context"""
await self.browser_context.close()
utils.logger.info("Browser context closed ...")
utils.logger.info("[KuaishouCrawler.close] Browser context closed ...")

View File

@ -29,7 +29,7 @@ class KuaishouLogin(AbstractLogin):
async def begin(self):
"""Start login xiaohongshu"""
utils.logger.info("Begin login kuaishou ...")
utils.logger.info("[KuaishouLogin.begin] Begin login kuaishou ...")
if self.login_type == "qrcode":
await self.login_by_qrcode()
elif self.login_type == "phone":
@ -37,7 +37,7 @@ class KuaishouLogin(AbstractLogin):
elif self.login_type == "cookie":
await self.login_by_cookies()
else:
raise ValueError("Invalid Login Type Currently only supported qrcode or phone or cookie ...")
raise ValueError("[KuaishouLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")
@retry(stop=stop_after_attempt(20), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
async def check_login_state(self) -> bool:
@ -55,7 +55,7 @@ class KuaishouLogin(AbstractLogin):
async def login_by_qrcode(self):
"""login kuaishou website and keep webdriver login state"""
utils.logger.info("Begin login kuaishou by qrcode ...")
utils.logger.info("[KuaishouLogin.login_by_qrcode] Begin login kuaishou by qrcode ...")
# click login button
login_button_ele = self.context_page.locator(
@ -70,7 +70,7 @@ class KuaishouLogin(AbstractLogin):
selector=qrcode_img_selector
)
if not base64_qrcode_img:
utils.logger.info("login failed , have not found qrcode please check ....")
utils.logger.info("[KuaishouLogin.login_by_qrcode] login failed , have not found qrcode please check ....")
sys.exit()
@ -78,22 +78,22 @@ class KuaishouLogin(AbstractLogin):
partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)
utils.logger.info(f"waiting for scan code login, remaining time is 20s")
utils.logger.info(f"[KuaishouLogin.login_by_qrcode] waiting for scan code login, remaining time is 20s")
try:
await self.check_login_state()
except RetryError:
utils.logger.info("Login kuaishou failed by qrcode login method ...")
utils.logger.info("[KuaishouLogin.login_by_qrcode] Login kuaishou failed by qrcode login method ...")
sys.exit()
wait_redirect_seconds = 5
utils.logger.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
utils.logger.info(f"[KuaishouLogin.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
await asyncio.sleep(wait_redirect_seconds)
async def login_by_mobile(self):
pass
async def login_by_cookies(self):
utils.logger.info("Begin login kuaishou by cookie ...")
utils.logger.info("[KuaishouLogin.login_by_cookies] Begin login kuaishou by cookie ...")
for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
await self.browser_context.add_cookies([{
'name': key,

View File

@ -83,14 +83,14 @@ class XHSClient:
async def pong(self) -> bool:
"""get a note to check if login state is ok"""
utils.logger.info("Begin to pong xhs...")
utils.logger.info("[XHSClient.pong] Begin to pong xhs...")
ping_flag = False
try:
note_card: Dict = await self.get_note_by_keyword(keyword="小红书")
if note_card.get("items"):
ping_flag = True
except Exception as e:
utils.logger.error(f"Ping xhs failed: {e}, and try to login again...")
utils.logger.error(f"[XHSClient.pong] Ping xhs failed: {e}, and try to login again...")
ping_flag = False
return ping_flag
@ -136,7 +136,7 @@ class XHSClient:
if res and res.get("items"):
res_dict: Dict = res["items"][0]["note_card"]
return res_dict
utils.logger.error(f"[xhs.client.get_note_by_id] get note empty and res:{res}")
utils.logger.error(f"[XHSClient.get_note_by_id] get note empty and res:{res}")
return dict()
async def get_note_comments(self, note_id: str, cursor: str = "") -> Dict:
@ -195,7 +195,7 @@ class XHSClient:
# Handle the absence of 'comments' key appropriately
# For example, log an error message, break from the loop, etc.
# This is just an example:
print(f"No 'comments' key found in response: {comments_res}")
utils.logger.info(f"[XHSClient.get_note_all_comments] No 'comments' key found in response: {comments_res}")
break
comments = comments_res["comments"]
if not is_fetch_sub_comments:

View File

@ -87,14 +87,14 @@ class XiaoHongShuCrawler(AbstractCrawler):
else:
pass
utils.logger.info("Xhs Crawler finished ...")
utils.logger.info("[XiaoHongShuCrawler.start] Xhs Crawler finished ...")
async def search(self) -> None:
"""Search for notes and retrieve their comment information."""
utils.logger.info("Begin search xiaohongshu keywords")
utils.logger.info("[XiaoHongShuCrawler.search] Begin search xiaohongshu keywords")
xhs_limit_count = 20 # xhs limit page fixed value
for keyword in config.KEYWORDS.split(","):
utils.logger.info(f"Current search keyword: {keyword}")
utils.logger.info(f"[XiaoHongShuCrawler.search] Current search keyword: {keyword}")
page = 1
while page * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
note_id_list: List[str] = []
@ -102,7 +102,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
keyword=keyword,
page=page,
)
utils.logger.info(f"Search notes res:{notes_res}")
utils.logger.info(f"[XiaoHongShuCrawler.search] Search notes res:{notes_res}")
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list = [
self.get_note_detail(post_item.get("id"), semaphore)
@ -115,7 +115,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
await xhs_model.update_xhs_note(note_detail)
note_id_list.append(note_detail.get("note_id"))
page += 1
utils.logger.info(f"Note details: {note_details}")
utils.logger.info(f"[XiaoHongShuCrawler.search] Note details: {note_details}")
await self.batch_get_note_comments(note_id_list)
async def get_specified_notes(self):
@ -136,15 +136,15 @@ class XiaoHongShuCrawler(AbstractCrawler):
try:
return await self.xhs_client.get_note_by_id(note_id)
except DataFetchError as ex:
utils.logger.error(f"Get note detail error: {ex}")
utils.logger.error(f"[XiaoHongShuCrawler.get_note_detail] Get note detail error: {ex}")
return None
except KeyError as ex:
utils.logger.error(f"have not fund note detail note_id:{note_id}, err: {ex}")
utils.logger.error(f"[XiaoHongShuCrawler.get_note_detail] have not fund note detail note_id:{note_id}, err: {ex}")
return None
async def batch_get_note_comments(self, note_list: List[str]):
"""Batch get note comments"""
utils.logger.info(f"Begin batch get note comments, note list: {note_list}")
utils.logger.info(f"[XiaoHongShuCrawler.batch_get_note_comments] Begin batch get note comments, note list: {note_list}")
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list: List[Task] = []
for note_id in note_list:
@ -155,7 +155,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
async def get_comments(self, note_id: str, semaphore: asyncio.Semaphore):
"""Get note comments with keyword filtering and quantity limitation"""
async with semaphore:
utils.logger.info(f"Begin get note id comments {note_id}")
utils.logger.info(f"[XiaoHongShuCrawler.get_comments] Begin get note id comments {note_id}")
all_comments = await self.xhs_client.get_note_all_comments(note_id=note_id, crawl_interval=random.random())
# Read the keyword filter and quantity limit from the config file
@ -191,7 +191,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
async def create_xhs_client(self, httpx_proxy: Optional[str]) -> XHSClient:
"""Create xhs client"""
utils.logger.info("Begin create xiaohongshu API client ...")
utils.logger.info("[XiaoHongShuCrawler.create_xhs_client] Begin create xiaohongshu API client ...")
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
xhs_client_obj = XHSClient(
proxies=httpx_proxy,
@ -215,7 +215,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
headless: bool = True
) -> BrowserContext:
"""Launch browser and create browser context"""
utils.logger.info("Begin create browser context ...")
utils.logger.info("[XiaoHongShuCrawler.launch_browser] Begin create browser context ...")
if config.SAVE_LOGIN_STATE:
# feat issue #14
# we will save login state to avoid login every time
@ -241,4 +241,4 @@ class XiaoHongShuCrawler(AbstractCrawler):
async def close(self):
"""Close browser context"""
await self.browser_context.close()
utils.logger.info("Browser context closed ...")
utils.logger.info("[XiaoHongShuCrawler.close] Browser context closed ...")

View File

@ -37,7 +37,7 @@ class XHSLogin(AbstractLogin):
"""
if "请通过验证" in await self.context_page.content():
utils.logger.info("登录过程中出现验证码,请手动验证")
utils.logger.info("[XHSLogin.check_login_state] 登录过程中出现验证码,请手动验证")
current_cookie = await self.browser_context.cookies()
_, cookie_dict = utils.convert_cookies(current_cookie)
@ -48,7 +48,7 @@ class XHSLogin(AbstractLogin):
async def begin(self):
"""Start login xiaohongshu"""
utils.logger.info("Begin login xiaohongshu ...")
utils.logger.info("[XHSLogin.begin] Begin login xiaohongshu ...")
if self.login_type == "qrcode":
await self.login_by_qrcode()
elif self.login_type == "phone":
@ -56,11 +56,11 @@ class XHSLogin(AbstractLogin):
elif self.login_type == "cookie":
await self.login_by_cookies()
else:
raise ValueError("Invalid Login Type Currently only supported qrcode or phone or cookies ...")
raise ValueError("[XHSLogin.begin]I nvalid Login Type Currently only supported qrcode or phone or cookies ...")
async def login_by_mobile(self):
"""Login xiaohongshu by mobile"""
utils.logger.info("Begin login xiaohongshu by mobile ...")
utils.logger.info("[XHSLogin.login_by_mobile] Begin login xiaohongshu by mobile ...")
await asyncio.sleep(1)
try:
# After landing on the xiaohongshu homepage, the login dialog may not pop up automatically, so click the login button manually
@ -77,7 +77,7 @@ class XHSLogin(AbstractLogin):
)
await element.click()
except Exception as e:
utils.logger.info("have not found mobile button icon and keep going ...")
utils.logger.info("[XHSLogin.login_by_mobile] have not found mobile button icon and keep going ...")
await asyncio.sleep(1)
login_container_ele = await self.context_page.wait_for_selector("div.login-container")
@ -93,7 +93,7 @@ class XHSLogin(AbstractLogin):
max_get_sms_code_time = 60 * 2  # wait at most 2 minutes for the SMS code
no_logged_in_session = ""
while max_get_sms_code_time > 0:
utils.logger.info(f"get sms code from redis remaining time {max_get_sms_code_time}s ...")
utils.logger.info(f"[XHSLogin.login_by_mobile] get sms code from redis remaining time {max_get_sms_code_time}s ...")
await asyncio.sleep(1)
sms_code_key = f"xhs_{self.login_phone}"
sms_code_value = redis_obj.get(sms_code_key)
@ -119,16 +119,16 @@ class XHSLogin(AbstractLogin):
try:
await self.check_login_state(no_logged_in_session)
except RetryError:
utils.logger.info("Login xiaohongshu failed by mobile login method ...")
utils.logger.info("[XHSLogin.login_by_mobile] Login xiaohongshu failed by mobile login method ...")
sys.exit()
wait_redirect_seconds = 5
utils.logger.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
utils.logger.info(f"[XHSLogin.login_by_mobile] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
await asyncio.sleep(wait_redirect_seconds)
async def login_by_qrcode(self):
"""login xiaohongshu website and keep webdriver login state"""
utils.logger.info("Begin login xiaohongshu by qrcode ...")
utils.logger.info("[XHSLogin.login_by_qrcode] Begin login xiaohongshu by qrcode ...")
# login_selector = "div.login-container > div.left > div.qrcode > img"
qrcode_img_selector = "xpath=//img[@class='qrcode-img']"
# find login qrcode
@ -137,7 +137,7 @@ class XHSLogin(AbstractLogin):
selector=qrcode_img_selector
)
if not base64_qrcode_img:
utils.logger.info("login failed , have not found qrcode please check ....")
utils.logger.info("[XHSLogin.login_by_qrcode] login failed , have not found qrcode please check ....")
# if this website does not automatically popup login dialog box, we will manual click login button
await asyncio.sleep(0.5)
login_button_ele = self.context_page.locator("xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button")
@ -161,20 +161,20 @@ class XHSLogin(AbstractLogin):
partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)
utils.logger.info(f"waiting for scan code login, remaining time is 120s")
utils.logger.info(f"[XHSLogin.login_by_qrcode] waiting for scan code login, remaining time is 120s")
try:
await self.check_login_state(no_logged_in_session)
except RetryError:
utils.logger.info("Login xiaohongshu failed by qrcode login method ...")
utils.logger.info("[XHSLogin.login_by_qrcode] Login xiaohongshu failed by qrcode login method ...")
sys.exit()
wait_redirect_seconds = 5
utils.logger.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
utils.logger.info(f"[XHSLogin.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
await asyncio.sleep(wait_redirect_seconds)
async def login_by_cookies(self):
"""login xiaohongshu website by cookies"""
utils.logger.info("Begin login xiaohongshu by cookie ...")
utils.logger.info("[XHSLogin.login_by_cookies] Begin login xiaohongshu by cookie ...")
for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
if key != "web_session": # only set web_session cookie attr
continue

View File

@ -85,7 +85,7 @@ async def update_bilibili_video(video_item: Dict):
"video_url": f"https://www.bilibili.com/video/av{video_id}",
"video_cover_url": video_item_view.get("pic", ""),
}
utils.logger.info(f"bilibili video id:{video_id}, title:{local_db_item.get('title')}")
utils.logger.info(f"[models.bilibili.update_bilibili_video] bilibili video id:{video_id}, title:{local_db_item.get('title')}")
if config.IS_SAVED_DATABASED:
if not await BilibiliVideo.filter(video_id=video_id).exists():
local_db_item["add_ts"] = utils.get_current_timestamp()
@ -131,7 +131,7 @@ async def update_bilibili_video_comment(video_id: str, comment_item: Dict):
"sub_comment_count": str(comment_item.get("rcount", 0)),
"last_modify_ts": utils.get_current_timestamp(),
}
utils.logger.info(f"Bilibili video comment: {comment_id}, content: {local_db_item.get('content')}")
utils.logger.info(f"[models.bilibili.update_bilibili_video_comment] Bilibili video comment: {comment_id}, content: {local_db_item.get('content')}")
if config.IS_SAVED_DATABASED:
if not await BilibiliComment.filter(comment_id=comment_id).exists():
local_db_item["add_ts"] = utils.get_current_timestamp()

View File

@ -88,7 +88,7 @@ async def update_douyin_aweme(aweme_item: Dict):
"last_modify_ts": utils.get_current_timestamp(),
"aweme_url": f"https://www.douyin.com/video/{aweme_id}"
}
print(f"douyin aweme id:{aweme_id}, title:{local_db_item.get('title')}")
utils.logger.info(f"[models.douyin.update_douyin_aweme] douyin aweme id:{aweme_id}, title:{local_db_item.get('title')}")
if config.IS_SAVED_DATABASED:
if not await DouyinAweme.filter(aweme_id=aweme_id).exists():
local_db_item["add_ts"] = utils.get_current_timestamp()
@ -123,7 +123,7 @@ async def batch_update_dy_aweme_comments(aweme_id: str, comments: List[Dict]):
async def update_dy_aweme_comment(aweme_id: str, comment_item: Dict):
comment_aweme_id = comment_item.get("aweme_id")
if aweme_id != comment_aweme_id:
print(f"comment_aweme_id: {comment_aweme_id} != aweme_id: {aweme_id}")
utils.logger.error(f"[models.douyin.update_dy_aweme_comment] comment_aweme_id: {comment_aweme_id} != aweme_id: {aweme_id}")
return
user_info = comment_item.get("user", {})
comment_id = comment_item.get("cid")
@ -145,7 +145,7 @@ async def update_dy_aweme_comment(aweme_id: str, comment_item: Dict):
"sub_comment_count": str(comment_item.get("reply_comment_total", 0)),
"last_modify_ts": utils.get_current_timestamp(),
}
print(f"douyin aweme comment: {comment_id}, content: {local_db_item.get('content')}")
utils.logger.info(f"[models.douyin.update_dy_aweme_comment] douyin aweme comment: {comment_id}, content: {local_db_item.get('content')}")
if config.IS_SAVED_DATABASED:
if not await DouyinAwemeComment.filter(comment_id=comment_id).exists():
local_db_item["add_ts"] = utils.get_current_timestamp()

View File

@ -80,7 +80,7 @@ async def update_kuaishou_video(video_item: Dict):
"video_cover_url": photo_info.get("coverUrl", ""),
"video_play_url": photo_info.get("photoUrl", ""),
}
print(f"Kuaishou video id:{video_id}, title:{local_db_item.get('title')}")
utils.logger.info(f"[models.kuaishou.update_kuaishou_video] Kuaishou video id:{video_id}, title:{local_db_item.get('title')}")
if config.IS_SAVED_DATABASED:
if not await KuaishouVideo.filter(video_id=video_id).exists():
local_db_item["add_ts"] = utils.get_current_timestamp()
@ -106,7 +106,7 @@ async def update_kuaishou_video(video_item: Dict):
async def batch_update_ks_video_comments(video_id: str, comments: List[Dict]):
utils.logger.info(f"[batch_update_ks_video_comments] video_id:{video_id}, comments:{comments}")
utils.logger.info(f"[KuaishouVideoComment.batch_update_ks_video_comments] video_id:{video_id}, comments:{comments}")
if not comments:
return
for comment_item in comments:
@ -126,7 +126,7 @@ async def update_ks_video_comment(video_id: str, comment_item: Dict):
"sub_comment_count": str(comment_item.get("subCommentCount", 0)),
"last_modify_ts": utils.get_current_timestamp(),
}
print(f"Kuaishou video comment: {comment_id}, content: {local_db_item.get('content')}")
utils.logger.info(f"[models.kuaishou.update_ks_video_comment] Kuaishou video comment: {comment_id}, content: {local_db_item.get('content')}")
if config.IS_SAVED_DATABASED:
if not await KuaishouVideoComment.filter(comment_id=comment_id).exists():
local_db_item["add_ts"] = utils.get_current_timestamp()

View File

@ -86,7 +86,7 @@ async def update_xhs_note(note_item: Dict):
"last_modify_ts": utils.get_current_timestamp(),
"note_url": f"https://www.xiaohongshu.com/explore/{note_id}"
}
print("xhs note:", local_db_item)
utils.logger.info(f"[models.xiaohongshu.update_xhs_note] xhs note: {local_db_item}")
if config.IS_SAVED_DATABASED:
if not await XHSNote.filter(note_id=note_id).first():
local_db_item["add_ts"] = utils.get_current_timestamp()
@ -125,7 +125,7 @@ async def update_xhs_note_comment(note_id: str, comment_item: Dict):
"sub_comment_count": comment_item.get("sub_comment_count"),
"last_modify_ts": utils.get_current_timestamp(),
}
print("xhs note comment:", local_db_item)
utils.logger.info(f"[models.xiaohongshu.update_xhs_note_comment] xhs note comment:{local_db_item}")
if config.IS_SAVED_DATABASED:
if not await XHSNoteComment.filter(comment_id=comment_id).first():
local_db_item["add_ts"] = utils.get_current_timestamp()

View File

@ -115,7 +115,7 @@ class JiSuHttpProxy(ProxyProvider):
ip_infos = []
async with httpx.AsyncClient() as client:
url = self.api_path + "/fetchips" + '?' + urlencode(self.params)
utils.logger.info(f"[JiSuHttpProxy] get ip proxy url:{url}")
utils.logger.info(f"[JiSuHttpProxy.get_proxies] get ip proxy url:{url}")
response = await client.get(url, headers={
"User-Agent": "MediaCrawler https://github.com/NanmiCoder/MediaCrawler"})
res_dict: Dict = response.json()