refactor: standardize log output

feat: crawl specified Bilibili videos by ID (bvid)
Relakkes 2023-12-23 01:04:08 +08:00
parent 273c9a316b
commit aba9f14f50
18 changed files with 147 additions and 133 deletions


@@ -34,7 +34,7 @@ MAX_COMMENTS_PER_POST = 10
 # comment keyword filter (only comments containing a keyword are kept; leave empty for no filtering)
 COMMENT_KEYWORDS = [
-    ""
+    # "真棒"
     # ........................
 ]
@@ -56,9 +56,10 @@ DY_SPECIFIED_ID_LIST = [
 # list of Kuaishou IDs to crawl
 KS_SPECIFIED_ID_LIST = []
-# list of Bilibili video IDs to crawl
+# list of Bilibili video bvids to crawl
 BILI_SPECIFIED_ID_LIST = [
-    "416252543",
-    "976148468"
+    "BV1d54y1g7db",
+    "BV1Sz4y1U77N",
+    "BV14Q4y1n7jz",
     # ........................
 ]
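For context: the two removed entries were numeric aids, while the "BV..." strings are bvids, Bilibili's 12-character alphanumeric video IDs. A purely illustrative way to tell the two formats apart (hypothetical helper, not part of this commit):

    def is_bvid(video_id: str) -> bool:
        # bvids look like "BV1d54y1g7db": 12 chars with a "BV" prefix; aids are all digits
        return video_id.startswith("BV") and len(video_id) == 12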

db.py

@@ -16,7 +16,7 @@ async def init_db(create_db: bool = False) -> None:
 async def init():
     await init_db(create_db=True)
     await Tortoise.generate_schemas()
-    utils.logger.info("Init DB Success!")
+    utils.logger.info("[db.init] Init DB Success!")
 if __name__ == '__main__':
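This db.py hunk shows the refactor pattern repeated across all 18 files: every log message gains a [module.function] or [ClassName.method] prefix naming the call site ([db.init] here, [BilibiliClient.pong] and the rest below), so a grep for the prefix locates the emitting code.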


@@ -4,7 +4,7 @@
 # @Desc : bilibili request client
 import asyncio
 import json
-from typing import Any, Callable, Dict, List, Optional, Tuple
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 from urllib.parse import urlencode
 import httpx
@@ -94,16 +94,16 @@ class BilibiliClient:
     async def pong(self) -> bool:
         """get a note to check if login state is ok"""
-        utils.logger.info("Begin pong bilibili...")
+        utils.logger.info("[BilibiliClient.pong] Begin pong bilibili...")
         ping_flag = False
         try:
             check_login_uri = "/x/web-interface/nav"
             response = await self.get(check_login_uri)
             if response.get("isLogin"):
-                utils.logger.info("use cache login state get web interface successfull!")
+                utils.logger.info("[BilibiliClient.pong] Use cache login state get web interface successfull!")
                 ping_flag = True
         except Exception as e:
-            utils.logger.error(f"Pong bilibili failed: {e}, and try to login again...")
+            utils.logger.error(f"[BilibiliClient.pong] Pong bilibili failed: {e}, and try to login again...")
             ping_flag = False
         return ping_flag
@@ -132,16 +132,22 @@ class BilibiliClient:
         }
         return await self.get(uri, post_data)
-    async def get_video_info(self, video_id: str) -> Dict:
+    async def get_video_info(self, aid: Union[int, None] = None, bvid: Union[str, None] = None) -> Dict:
         """
-        Bilibili web video detail api
-        :param video_id:
+        Bilibili web video detail api; pass either aid or bvid
+        :param aid: the video's avid
+        :param bvid: the video's bvid
         :return:
         """
+        if not aid and not bvid:
+            raise ValueError("请提供 aid 或 bvid 中的至少一个参数")
         uri = "/x/web-interface/view/detail"
-        params = {
-            "aid": video_id
-        }
+        params = dict()
+        if aid:
+            params.update({"aid": aid})
+        else:
+            params.update({"bvid": bvid})
         return await self.get(uri, params, enable_params_sign=False)
     async def get_video_comments(self,
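With both lookup keys behind one method, callers pass whichever ID they have. A minimal usage sketch (assuming an initialized BilibiliClient named bili_client; not part of the diff):

    # new path: fetch the detail by bvid; legacy path: by numeric aid
    detail = await bili_client.get_video_info(bvid="BV1d54y1g7db")
    detail = await bili_client.get_video_info(aid=416252543)
    # calling it with neither argument raises ValueError, per the guard above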


@@ -8,7 +8,7 @@ import os
 import random
 import time
 from asyncio import Task
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Optional, Tuple, Union
 from playwright.async_api import (BrowserContext, BrowserType, Page,
                                   async_playwright)
@@ -69,7 +69,7 @@ class BilibiliCrawler(AbstractCrawler):
         if not await self.bili_client.pong():
             login_obj = BilibiliLogin(
                 login_type=self.login_type,
                 login_phone="",  # your phone number
                 browser_context=self.browser_context,
                 context_page=self.context_page,
                 cookie_str=config.COOKIES
@@ -94,10 +94,10 @@ class BilibiliCrawler(AbstractCrawler):
         search bilibili video with keywords
         :return:
         """
-        utils.logger.info("Begin search bilibli keywords")
+        utils.logger.info("[BilibiliCrawler.search] Begin search bilibli keywords")
         bili_limit_count = 20  # bilibili limit page fixed value
         for keyword in config.KEYWORDS.split(","):
-            utils.logger.info(f"Current search keyword: {keyword}")
+            utils.logger.info(f"[BilibiliCrawler.search] Current search keyword: {keyword}")
             page = 1
             while page * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
                 video_id_list: List[str] = []
@@ -111,7 +111,7 @@ class BilibiliCrawler(AbstractCrawler):
                 semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
                 task_list = [
-                    self.get_video_info_task(video_item.get("aid"), semaphore)
+                    self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore)
                     for video_item in video_list
                 ]
                 video_items = await asyncio.gather(*task_list)
@@ -129,7 +129,7 @@ class BilibiliCrawler(AbstractCrawler):
         :param video_id_list:
         :return:
         """
-        utils.logger.info(f"[batch_get_video_comments] video ids:{video_id_list}")
+        utils.logger.info(f"[BilibiliCrawler.batch_get_video_comments] video ids:{video_id_list}")
         semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
         task_list: List[Task] = []
         for video_id in video_id_list:
@@ -146,7 +146,7 @@ class BilibiliCrawler(AbstractCrawler):
         """
         async with semaphore:
             try:
-                utils.logger.info(f"[get_comments] begin get video_id: {video_id} comments ...")
+                utils.logger.info(f"[BilibiliCrawler.get_comments] begin get video_id: {video_id} comments ...")
                 # Read keyword and quantity from config
                 keywords = config.COMMENT_KEYWORDS
                 max_comments = config.MAX_COMMENTS_PER_POST
@@ -174,9 +174,9 @@ class BilibiliCrawler(AbstractCrawler):
                 await bilibili.batch_update_bilibili_video_comments(video_id, filtered_comments)
             except DataFetchError as ex:
-                utils.logger.error(f"[get_comments] get video_id: {video_id} comment error: {ex}")
+                utils.logger.error(f"[BilibiliCrawler.get_comments] get video_id: {video_id} comment error: {ex}")
             except Exception as e:
-                utils.logger.error(f"[get_comments] may be been blocked, err:{e}")
+                utils.logger.error(f"[BilibiliCrawler.get_comments] may be been blocked, err:{e}")
     async def get_specified_videos(self):
         """
@@ -185,35 +185,42 @@ class BilibiliCrawler(AbstractCrawler):
         """
         semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
         task_list = [
-            self.get_video_info_task(video_id=video_id, semaphore=semaphore) for video_id in config.BILI_SPECIFIED_ID_LIST
+            self.get_video_info_task(aid=0, bvid=video_id, semaphore=semaphore) for video_id in
+            config.BILI_SPECIFIED_ID_LIST
         ]
         video_details = await asyncio.gather(*task_list)
+        video_aids_list = []
         for video_detail in video_details:
             if video_detail is not None:
+                video_item_view: Dict = video_detail.get("View")
+                video_aid: str = video_item_view.get("aid")
+                if video_aid:
+                    video_aids_list.append(video_aid)
                 await bilibili.update_bilibili_video(video_detail)
-        await self.batch_get_video_comments(config.BILI_SPECIFIED_ID_LIST)
+        await self.batch_get_video_comments(video_aids_list)
-    async def get_video_info_task(self, video_id: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
+    async def get_video_info_task(self, aid: int, bvid: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
         """
         Get video detail task
-        :param video_id:
+        :param aid:
+        :param bvid:
         :param semaphore:
         :return:
         """
         async with semaphore:
             try:
-                result = await self.bili_client.get_video_info(video_id)
+                result = await self.bili_client.get_video_info(aid=aid, bvid=bvid)
                 return result
             except DataFetchError as ex:
-                utils.logger.error(f"Get video detail error: {ex}")
+                utils.logger.error(f"[BilibiliCrawler.get_video_info_task] Get video detail error: {ex}")
                 return None
             except KeyError as ex:
-                utils.logger.error(f"have not fund note detail video_id:{video_id}, err: {ex}")
+                utils.logger.error(f"[BilibiliCrawler.get_video_info_task] have not fund note detail video_id:{bvid}, err: {ex}")
                 return None
     async def create_bilibili_client(self, httpx_proxy: Optional[str]) -> BilibiliClient:
         """Create xhs client"""
-        utils.logger.info("Begin create xiaohongshu API client ...")
+        utils.logger.info("[BilibiliCrawler.create_bilibili_client] Begin create xiaohongshu API client ...")
         cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
         bilibili_client_obj = BilibiliClient(
             proxies=httpx_proxy,
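The aid bookkeeping added to get_specified_videos exists because comments are still fetched by numeric aid: each configured bvid is resolved through the detail response ("View" -> "aid") before batch_get_video_comments runs. The same resolution step in isolation, as a sketch (bili_client assumed initialized):

    detail = await bili_client.get_video_info(bvid="BV1d54y1g7db")
    aid = (detail.get("View") or {}).get("aid")  # the numeric ID the comment API is keyed on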
@@ -250,7 +257,7 @@ class BilibiliCrawler(AbstractCrawler):
             headless: bool = True
     ) -> BrowserContext:
         """Launch browser and create browser context"""
-        utils.logger.info("Begin create browser context ...")
+        utils.logger.info("[BilibiliCrawler.launch_browser] Begin create browser context ...")
         if config.SAVE_LOGIN_STATE:
             # feat issue #14
             # we will save login state to avoid login every time


@@ -34,7 +34,7 @@ class BilibiliLogin(AbstractLogin):
     async def begin(self):
         """Start login xiaohongshu"""
-        utils.logger.info("Begin login Bilibili ...")
+        utils.logger.info("[BilibiliLogin.begin] Begin login Bilibili ...")
         if self.login_type == "qrcode":
             await self.login_by_qrcode()
         elif self.login_type == "phone":
@@ -42,7 +42,7 @@ class BilibiliLogin(AbstractLogin):
         elif self.login_type == "cookie":
             await self.login_by_cookies()
         else:
-            raise ValueError("Invalid Login Type Currently only supported qrcode or phone or cookie ...")
+            raise ValueError("[BilibiliLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")
     @retry(stop=stop_after_attempt(20), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
     async def check_login_state(self) -> bool:
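Context on the decorator above, unchanged by this commit: tenacity re-invokes check_login_state once per second for up to 20 attempts while it returns False; exhausting all attempts raises RetryError, which the login flows below catch before exiting.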
@@ -59,7 +59,7 @@ class BilibiliLogin(AbstractLogin):
     async def login_by_qrcode(self):
         """login bilibili website and keep webdriver login state"""
-        utils.logger.info("Begin login bilibili by qrcode ...")
+        utils.logger.info("[BilibiliLogin.login_by_qrcode] Begin login bilibili by qrcode ...")
         # click login button
         login_button_ele = self.context_page.locator(
@@ -74,29 +74,29 @@ class BilibiliLogin(AbstractLogin):
             selector=qrcode_img_selector
         )
         if not base64_qrcode_img:
-            utils.logger.info("login failed , have not found qrcode please check ....")
+            utils.logger.info("[BilibiliLogin.login_by_qrcode] login failed , have not found qrcode please check ....")
             sys.exit()
         # show login qrcode
         partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
         asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)
-        utils.logger.info(f"Waiting for scan code login, remaining time is 20s")
+        utils.logger.info(f"[BilibiliLogin.login_by_qrcode] Waiting for scan code login, remaining time is 20s")
         try:
             await self.check_login_state()
         except RetryError:
-            utils.logger.info("Login bilibili failed by qrcode login method ...")
+            utils.logger.info("[BilibiliLogin.login_by_qrcode] Login bilibili failed by qrcode login method ...")
             sys.exit()
         wait_redirect_seconds = 5
-        utils.logger.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
+        utils.logger.info(f"[BilibiliLogin.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
         await asyncio.sleep(wait_redirect_seconds)
     async def login_by_mobile(self):
         pass
     async def login_by_cookies(self):
-        utils.logger.info("Begin login bilibili by cookie ...")
+        utils.logger.info("[BilibiliLogin.login_by_qrcode] Begin login bilibili by cookie ...")
         for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
             await self.browser_context.add_cookies([{
                 'name': key,


@@ -75,12 +75,12 @@ class DouYinCrawler(AbstractCrawler):
             # Get the information and comments of the specified post
             await self.get_specified_awemes()
-        utils.logger.info("Douyin Crawler finished ...")
+        utils.logger.info("[DouYinCrawler.start] Douyin Crawler finished ...")
     async def search(self) -> None:
-        utils.logger.info("Begin search douyin keywords")
+        utils.logger.info("[DouYinCrawler.search] Begin search douyin keywords")
         for keyword in config.KEYWORDS.split(","):
-            utils.logger.info(f"Current keyword: {keyword}")
+            utils.logger.info(f"[DouYinCrawler.search] Current keyword: {keyword}")
             aweme_list: List[str] = []
             dy_limit_count = 10
             page = 0
@@ -89,7 +89,7 @@ class DouYinCrawler(AbstractCrawler):
                     posts_res = await self.dy_client.search_info_by_keyword(keyword=keyword,
                                                                             offset=page * dy_limit_count)
                 except DataFetchError:
-                    utils.logger.error(f"search douyin keyword: {keyword} failed")
+                    utils.logger.error(f"[DouYinCrawler.search] search douyin keyword: {keyword} failed")
                     break
                 page += 1
                 for post_item in posts_res.get("data"):
@@ -100,7 +100,7 @@ class DouYinCrawler(AbstractCrawler):
                         continue
                     aweme_list.append(aweme_info.get("aweme_id", ""))
                     await douyin.update_douyin_aweme(aweme_item=aweme_info)
-            utils.logger.info(f"keyword:{keyword}, aweme_list:{aweme_list}")
+            utils.logger.info(f"[DouYinCrawler.search] keyword:{keyword}, aweme_list:{aweme_list}")
             await self.batch_get_note_comments(aweme_list)
     async def get_specified_awemes(self):
@@ -121,10 +121,10 @@ class DouYinCrawler(AbstractCrawler):
             try:
                 return await self.dy_client.get_video_by_id(aweme_id)
             except DataFetchError as ex:
-                utils.logger.error(f"Get aweme detail error: {ex}")
+                utils.logger.error(f"[DouYinCrawler.get_aweme_detail] Get aweme detail error: {ex}")
                 return None
             except KeyError as ex:
-                utils.logger.error(f"have not fund note detail aweme_id:{aweme_id}, err: {ex}")
+                utils.logger.error(f"[DouYinCrawler.get_aweme_detail] have not fund note detail aweme_id:{aweme_id}, err: {ex}")
                 return None
     async def batch_get_note_comments(self, aweme_list: List[str]) -> None:
@@ -147,9 +147,9 @@ class DouYinCrawler(AbstractCrawler):
                 )
                 # the comments returned here have already been keyword-filtered
                 await douyin.batch_update_dy_aweme_comments(aweme_id, comments)
-                utils.logger.info(f"aweme_id: {aweme_id} comments have all been obtained and filtered ...")
+                utils.logger.info(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} comments have all been obtained and filtered ...")
             except DataFetchError as e:
-                utils.logger.error(f"aweme_id: {aweme_id} get comments failed, error: {e}")
+                utils.logger.error(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} get comments failed, error: {e}")
     @staticmethod
     def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]:
@@ -213,4 +213,4 @@ class DouYinCrawler(AbstractCrawler):
     async def close(self) -> None:
         """Close browser context"""
         await self.browser_context.close()
-        utils.logger.info("Browser context closed ...")
+        utils.logger.info("[DouYinCrawler.close] Browser context closed ...")


@@ -47,7 +47,7 @@ class DouYinLogin(AbstractLogin):
         elif self.login_type == "cookie":
             await self.login_by_cookies()
         else:
-            raise ValueError("Invalid Login Type Currently only supported qrcode or phone or cookie ...")
+            raise ValueError("[DouYinLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")
         # if the page redirects to the slider captcha page, the slider has to be moved again
         await asyncio.sleep(6)
@@ -56,16 +56,16 @@ class DouYinLogin(AbstractLogin):
             await self.check_page_display_slider(move_step=3, slider_level="hard")
         # check login state
-        utils.logger.info(f"login finished then check login state ...")
+        utils.logger.info(f"[DouYinLogin.begin] login finished then check login state ...")
         try:
             await self.check_login_state()
         except RetryError:
-            utils.logger.info("login failed please confirm ...")
+            utils.logger.info("[DouYinLogin.begin] login failed please confirm ...")
             sys.exit()
         # wait for redirect
         wait_redirect_seconds = 5
-        utils.logger.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
+        utils.logger.info(f"[DouYinLogin.begin] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
         await asyncio.sleep(wait_redirect_seconds)
     @retry(stop=stop_after_attempt(20), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
@@ -84,21 +84,21 @@ class DouYinLogin(AbstractLogin):
             # check dialog box is auto popup and wait for 10 seconds
             await self.context_page.wait_for_selector(dialog_selector, timeout=1000 * 10)
         except Exception as e:
-            utils.logger.error(f"login dialog box does not pop up automatically, error: {e}")
-            utils.logger.info("login dialog box does not pop up automatically, we will manually click the login button")
+            utils.logger.error(f"[DouYinLogin.popup_login_dialog] login dialog box does not pop up automatically, error: {e}")
+            utils.logger.info("[DouYinLogin.popup_login_dialog] login dialog box does not pop up automatically, we will manually click the login button")
             login_button_ele = self.context_page.locator("xpath=//p[text() = '登录']")
             await login_button_ele.click()
             await asyncio.sleep(0.5)
     async def login_by_qrcode(self):
-        utils.logger.info("Begin login douyin by qrcode...")
+        utils.logger.info("[DouYinLogin.login_by_qrcode] Begin login douyin by qrcode...")
         qrcode_img_selector = "xpath=//article[@class='web-login']//img"
         base64_qrcode_img = await utils.find_login_qrcode(
             self.context_page,
             selector=qrcode_img_selector
         )
         if not base64_qrcode_img:
-            utils.logger.info("login qrcode not found please confirm ...")
+            utils.logger.info("[DouYinLogin.login_by_qrcode] login qrcode not found please confirm ...")
             sys.exit()
         # show login qrcode
@@ -109,7 +109,7 @@ class DouYinLogin(AbstractLogin):
         await asyncio.sleep(2)
     async def login_by_mobile(self):
-        utils.logger.info("Begin login douyin by mobile ...")
+        utils.logger.info("[DouYinLogin.login_by_mobile] Begin login douyin by mobile ...")
         mobile_tap_ele = self.context_page.locator("xpath=//li[text() = '验证码登录']")
         await mobile_tap_ele.click()
         await self.context_page.wait_for_selector("xpath=//article[@class='web-login-mobile-code']")
@@ -124,7 +124,7 @@ class DouYinLogin(AbstractLogin):
         redis_obj = redis.Redis(host=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD)
         max_get_sms_code_time = 60 * 2  # wait at most 2 minutes for the sms code
         while max_get_sms_code_time > 0:
-            utils.logger.info(f"get douyin sms code from redis remaining time {max_get_sms_code_time}s ...")
+            utils.logger.info(f"[DouYinLogin.login_by_mobile] get douyin sms code from redis remaining time {max_get_sms_code_time}s ...")
             await asyncio.sleep(1)
             sms_code_key = f"dy_{self.login_phone}"
             sms_code_value = redis_obj.get(sms_code_key)
@@ -157,7 +157,7 @@ class DouYinLogin(AbstractLogin):
         slider_verify_success = False
         while not slider_verify_success:
             if max_slider_try_times <= 0:
-                utils.logger.error("slider verify failed ...")
+                utils.logger.error("[DouYinLogin.check_page_display_slider] slider verify failed ...")
                 sys.exit()
             try:
                 await self.move_slider(back_selector, gap_selector, move_step, slider_level)
@@ -166,20 +166,20 @@ class DouYinLogin(AbstractLogin):
                 # if the slider moved too slowly or verification failed, the page complains about slow operation; click the refresh button
                 page_content = await self.context_page.content()
                 if "操作过慢" in page_content or "提示重新操作" in page_content:
-                    utils.logger.info("slider verify failed, retry ...")
+                    utils.logger.info("[DouYinLogin.check_page_display_slider] slider verify failed, retry ...")
                     await self.context_page.click(selector="//a[contains(@class, 'secsdk_captcha_refresh')]")
                     continue
                 # after a successful move, wait for the slider to disappear
                 await self.context_page.wait_for_selector(selector=back_selector, state="hidden", timeout=1000)
                 # if the slider is gone, verification succeeded and the loop exits; otherwise the line above raises, the exception is caught, and the slider verification is retried
-                utils.logger.info("slider verify success ...")
+                utils.logger.info("[DouYinLogin.check_page_display_slider] slider verify success ...")
                 slider_verify_success = True
             except Exception as e:
-                utils.logger.error(f"slider verify failed, error: {e}")
+                utils.logger.error(f"[DouYinLogin.check_page_display_slider] slider verify failed, error: {e}")
                 await asyncio.sleep(1)
                 max_slider_try_times -= 1
-                utils.logger.info(f"remaining slider try times: {max_slider_try_times}")
+                utils.logger.info(f"[DouYinLogin.check_page_display_slider] remaining slider try times: {max_slider_try_times}")
                 continue
     async def move_slider(self, back_selector: str, gap_selector: str, move_step: int = 10, slider_level="easy"):
@@ -236,7 +236,7 @@ class DouYinLogin(AbstractLogin):
         await self.context_page.mouse.up()
     async def login_by_cookies(self):
-        utils.logger.info("Begin login douyin by cookie ...")
+        utils.logger.info("[DouYinLogin.login_by_cookies] Begin login douyin by cookie ...")
         for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
             await self.browser_context.add_cookies([{
                 'name': key,


@@ -59,12 +59,12 @@ class KuaiShouClient:
     @staticmethod
     async def pong() -> bool:
         """get a note to check if login state is ok"""
-        utils.logger.info("Begin pong kuaishou...")
+        utils.logger.info("[KuaiShouClient.pong] Begin pong kuaishou...")
         ping_flag = False
         try:
             pass
         except Exception as e:
-            utils.logger.error(f"Pong kuaishou failed: {e}, and try to login again...")
+            utils.logger.error(f"[KuaiShouClient.pong] Pong kuaishou failed: {e}, and try to login again...")
             ping_flag = False
         return ping_flag


@@ -81,13 +81,13 @@ class KuaishouCrawler(AbstractCrawler):
         else:
             pass
-        utils.logger.info("Kuaishou Crawler finished ...")
+        utils.logger.info("[KuaishouCrawler.start] Kuaishou Crawler finished ...")
     async def search(self):
-        utils.logger.info("Begin search kuaishou keywords")
+        utils.logger.info("[KuaishouCrawler.search] Begin search kuaishou keywords")
         ks_limit_count = 20  # kuaishou limit page fixed value
         for keyword in config.KEYWORDS.split(","):
-            utils.logger.info(f"Current search keyword: {keyword}")
+            utils.logger.info(f"[KuaishouCrawler.search] Current search keyword: {keyword}")
             page = 1
             while page * ks_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
                 video_id_list: List[str] = []
@@ -96,12 +96,12 @@ class KuaishouCrawler(AbstractCrawler):
                     pcursor=str(page),
                 )
                 if not videos_res:
-                    utils.logger.error(f"search info by keyword:{keyword} not found data")
+                    utils.logger.error(f"[KuaishouCrawler.search] search info by keyword:{keyword} not found data")
                     continue
                 vision_search_photo: Dict = videos_res.get("visionSearchPhoto")
                 if vision_search_photo.get("result") != 1:
-                    utils.logger.error(f"search info by keyword:{keyword} not found data ")
+                    utils.logger.error(f"[KuaishouCrawler.search] search info by keyword:{keyword} not found data ")
                     continue
                 for video_detail in vision_search_photo.get("feeds"):
@@ -129,13 +129,13 @@ class KuaishouCrawler(AbstractCrawler):
         async with semaphore:
             try:
                 result = await self.ks_client.get_video_info(video_id)
-                utils.logger.info(f"Get video_id:{video_id} info result: {result} ...")
+                utils.logger.info(f"[KuaishouCrawler.get_video_info_task] Get video_id:{video_id} info result: {result} ...")
                 return result.get("visionVideoDetail")
             except DataFetchError as ex:
-                utils.logger.error(f"Get video detail error: {ex}")
+                utils.logger.error(f"[KuaishouCrawler.get_video_info_task] Get video detail error: {ex}")
                 return None
             except KeyError as ex:
-                utils.logger.error(f"have not fund note detail video_id:{video_id}, err: {ex}")
+                utils.logger.error(f"[KuaishouCrawler.get_video_info_task] have not fund note detail video_id:{video_id}, err: {ex}")
                 return None
     async def batch_get_video_comments(self, video_id_list: List[str]):
@@ -144,7 +144,7 @@ class KuaishouCrawler(AbstractCrawler):
         :param video_id_list:
         :return:
         """
-        utils.logger.info(f"[batch_get_video_comments] video ids:{video_id_list}")
+        utils.logger.info(f"[KuaishouCrawler.batch_get_video_comments] video ids:{video_id_list}")
         semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
         task_list: List[Task] = []
         for video_id in video_id_list:
@@ -163,16 +163,16 @@ class KuaishouCrawler(AbstractCrawler):
         """
         async with semaphore:
             try:
-                utils.logger.info(f"[get_comments] bengin get video_id: {video_id} comments ...")
+                utils.logger.info(f"[KuaishouCrawler.get_comments] begin get video_id: {video_id} comments ...")
                 await self.ks_client.get_video_all_comments(
                     photo_id=video_id,
                     crawl_interval=random.random(),
                     callback=kuaishou.batch_update_ks_video_comments
                 )
             except DataFetchError as ex:
-                utils.logger.error(f"[get_comments] get video_id: {video_id} comment error: {ex}")
+                utils.logger.error(f"[KuaishouCrawler.get_comments] get video_id: {video_id} comment error: {ex}")
             except Exception as e:
-                utils.logger.error(f"[get_comments] may be been blocked, err:{e}")
+                utils.logger.error(f"[KuaishouCrawler.get_comments] may be been blocked, err:{e}")
                 # use time.sleep to block the main coroutine instead of asyncio.sleep, and cancel the running comment tasks
                 # maybe kuaishou blocked our request, so take a nap and refresh the cookie
                 current_running_tasks = comment_tasks_var.get()
@@ -197,7 +197,7 @@ class KuaishouCrawler(AbstractCrawler):
     async def create_ks_client(self, httpx_proxy: Optional[str]) -> KuaiShouClient:
         """Create xhs client"""
-        utils.logger.info("Begin create kuaishou API client ...")
+        utils.logger.info("[KuaishouCrawler.create_ks_client] Begin create kuaishou API client ...")
         cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
         xhs_client_obj = KuaiShouClient(
             proxies=httpx_proxy,
@@ -221,7 +221,7 @@ class KuaishouCrawler(AbstractCrawler):
             headless: bool = True
     ) -> BrowserContext:
         """Launch browser and create browser context"""
-        utils.logger.info("Begin create browser context ...")
+        utils.logger.info("[KuaishouCrawler.launch_browser] Begin create browser context ...")
         if config.SAVE_LOGIN_STATE:
             user_data_dir = os.path.join(os.getcwd(), "browser_data",
                                          config.USER_DATA_DIR % self.platform)  # type: ignore
@@ -245,4 +245,4 @@ class KuaishouCrawler(AbstractCrawler):
     async def close(self):
         """Close browser context"""
         await self.browser_context.close()
-        utils.logger.info("Browser context closed ...")
+        utils.logger.info("[KuaishouCrawler.close] Browser context closed ...")


@@ -29,7 +29,7 @@ class KuaishouLogin(AbstractLogin):
     async def begin(self):
         """Start login xiaohongshu"""
-        utils.logger.info("Begin login kuaishou ...")
+        utils.logger.info("[KuaishouLogin.begin] Begin login kuaishou ...")
         if self.login_type == "qrcode":
             await self.login_by_qrcode()
         elif self.login_type == "phone":
@@ -37,7 +37,7 @@ class KuaishouLogin(AbstractLogin):
         elif self.login_type == "cookie":
             await self.login_by_cookies()
         else:
-            raise ValueError("Invalid Login Type Currently only supported qrcode or phone or cookie ...")
+            raise ValueError("[KuaishouLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...")
     @retry(stop=stop_after_attempt(20), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
     async def check_login_state(self) -> bool:
@@ -55,7 +55,7 @@ class KuaishouLogin(AbstractLogin):
     async def login_by_qrcode(self):
         """login kuaishou website and keep webdriver login state"""
-        utils.logger.info("Begin login kuaishou by qrcode ...")
+        utils.logger.info("[KuaishouLogin.login_by_qrcode] Begin login kuaishou by qrcode ...")
         # click login button
         login_button_ele = self.context_page.locator(
@@ -70,7 +70,7 @@ class KuaishouLogin(AbstractLogin):
             selector=qrcode_img_selector
         )
         if not base64_qrcode_img:
-            utils.logger.info("login failed , have not found qrcode please check ....")
+            utils.logger.info("[KuaishouLogin.login_by_qrcode] login failed , have not found qrcode please check ....")
             sys.exit()
@@ -78,22 +78,22 @@ class KuaishouLogin(AbstractLogin):
         partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
         asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)
-        utils.logger.info(f"waiting for scan code login, remaining time is 20s")
+        utils.logger.info(f"[KuaishouLogin.login_by_qrcode] waiting for scan code login, remaining time is 20s")
         try:
             await self.check_login_state()
         except RetryError:
-            utils.logger.info("Login kuaishou failed by qrcode login method ...")
+            utils.logger.info("[KuaishouLogin.login_by_qrcode] Login kuaishou failed by qrcode login method ...")
             sys.exit()
         wait_redirect_seconds = 5
-        utils.logger.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
+        utils.logger.info(f"[KuaishouLogin.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
         await asyncio.sleep(wait_redirect_seconds)
     async def login_by_mobile(self):
         pass
     async def login_by_cookies(self):
-        utils.logger.info("Begin login kuaishou by cookie ...")
+        utils.logger.info("[KuaishouLogin.login_by_cookies] Begin login kuaishou by cookie ...")
         for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
             await self.browser_context.add_cookies([{
                 'name': key,


@@ -83,14 +83,14 @@ class XHSClient:
     async def pong(self) -> bool:
         """get a note to check if login state is ok"""
-        utils.logger.info("Begin to pong xhs...")
+        utils.logger.info("[XHSClient.pong] Begin to pong xhs...")
         ping_flag = False
         try:
             note_card: Dict = await self.get_note_by_keyword(keyword="小红书")
             if note_card.get("items"):
                 ping_flag = True
         except Exception as e:
-            utils.logger.error(f"Ping xhs failed: {e}, and try to login again...")
+            utils.logger.error(f"[XHSClient.pong] Ping xhs failed: {e}, and try to login again...")
             ping_flag = False
         return ping_flag
@@ -136,7 +136,7 @@ class XHSClient:
         if res and res.get("items"):
             res_dict: Dict = res["items"][0]["note_card"]
             return res_dict
-        utils.logger.error(f"[xhs.client.get_note_by_id] get note empty and res:{res}")
+        utils.logger.error(f"[XHSClient.get_note_by_id] get note empty and res:{res}")
         return dict()
     async def get_note_comments(self, note_id: str, cursor: str = "") -> Dict:
@@ -195,7 +195,7 @@ class XHSClient:
                 # Handle the absence of 'comments' key appropriately
                 # For example, log an error message, break from the loop, etc.
                 # This is just an example:
-                print(f"No 'comments' key found in response: {comments_res}")
+                utils.logger.info(f"[XHSClient.get_note_all_comments] No 'comments' key found in response: {comments_res}")
                 break
             comments = comments_res["comments"]
             if not is_fetch_sub_comments:


@@ -87,14 +87,14 @@ class XiaoHongShuCrawler(AbstractCrawler):
         else:
             pass
-        utils.logger.info("Xhs Crawler finished ...")
+        utils.logger.info("[XiaoHongShuCrawler.start] Xhs Crawler finished ...")
     async def search(self) -> None:
         """Search for notes and retrieve their comment information."""
-        utils.logger.info("Begin search xiaohongshu keywords")
+        utils.logger.info("[XiaoHongShuCrawler.search] Begin search xiaohongshu keywords")
         xhs_limit_count = 20  # xhs limit page fixed value
         for keyword in config.KEYWORDS.split(","):
-            utils.logger.info(f"Current search keyword: {keyword}")
+            utils.logger.info(f"[XiaoHongShuCrawler.search] Current search keyword: {keyword}")
             page = 1
             while page * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
                 note_id_list: List[str] = []
@@ -102,7 +102,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
                     keyword=keyword,
                     page=page,
                 )
-                utils.logger.info(f"Search notes res:{notes_res}")
+                utils.logger.info(f"[XiaoHongShuCrawler.search] Search notes res:{notes_res}")
                 semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
                 task_list = [
                     self.get_note_detail(post_item.get("id"), semaphore)
@@ -115,7 +115,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
                         await xhs_model.update_xhs_note(note_detail)
                         note_id_list.append(note_detail.get("note_id"))
                 page += 1
-                utils.logger.info(f"Note details: {note_details}")
+                utils.logger.info(f"[XiaoHongShuCrawler.search] Note details: {note_details}")
                 await self.batch_get_note_comments(note_id_list)
     async def get_specified_notes(self):
@@ -136,15 +136,15 @@ class XiaoHongShuCrawler(AbstractCrawler):
             try:
                 return await self.xhs_client.get_note_by_id(note_id)
             except DataFetchError as ex:
-                utils.logger.error(f"Get note detail error: {ex}")
+                utils.logger.error(f"[XiaoHongShuCrawler.get_note_detail] Get note detail error: {ex}")
                 return None
             except KeyError as ex:
-                utils.logger.error(f"have not fund note detail note_id:{note_id}, err: {ex}")
+                utils.logger.error(f"[XiaoHongShuCrawler.get_note_detail] have not fund note detail note_id:{note_id}, err: {ex}")
                 return None
     async def batch_get_note_comments(self, note_list: List[str]):
         """Batch get note comments"""
-        utils.logger.info(f"Begin batch get note comments, note list: {note_list}")
+        utils.logger.info(f"[XiaoHongShuCrawler.batch_get_note_comments] Begin batch get note comments, note list: {note_list}")
         semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
         task_list: List[Task] = []
         for note_id in note_list:
@@ -155,7 +155,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
     async def get_comments(self, note_id: str, semaphore: asyncio.Semaphore):
         """Get note comments with keyword filtering and quantity limitation"""
         async with semaphore:
-            utils.logger.info(f"Begin get note id comments {note_id}")
+            utils.logger.info(f"[XiaoHongShuCrawler.get_comments] Begin get note id comments {note_id}")
             all_comments = await self.xhs_client.get_note_all_comments(note_id=note_id, crawl_interval=random.random())
             # read the keywords and quantity limit from the config file
@@ -191,7 +191,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
     async def create_xhs_client(self, httpx_proxy: Optional[str]) -> XHSClient:
         """Create xhs client"""
-        utils.logger.info("Begin create xiaohongshu API client ...")
+        utils.logger.info("[XiaoHongShuCrawler.create_xhs_client] Begin create xiaohongshu API client ...")
         cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
         xhs_client_obj = XHSClient(
             proxies=httpx_proxy,
@@ -215,7 +215,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
             headless: bool = True
     ) -> BrowserContext:
         """Launch browser and create browser context"""
-        utils.logger.info("Begin create browser context ...")
+        utils.logger.info("[XiaoHongShuCrawler.launch_browser] Begin create browser context ...")
         if config.SAVE_LOGIN_STATE:
             # feat issue #14
             # we will save login state to avoid login every time
@@ -241,4 +241,4 @@ class XiaoHongShuCrawler(AbstractCrawler):
     async def close(self):
         """Close browser context"""
         await self.browser_context.close()
-        utils.logger.info("Browser context closed ...")
+        utils.logger.info("[XiaoHongShuCrawler.close] Browser context closed ...")


@@ -37,7 +37,7 @@ class XHSLogin(AbstractLogin):
         """
         if "请通过验证" in await self.context_page.content():
-            utils.logger.info("登录过程中出现验证码,请手动验证")
+            utils.logger.info("[XHSLogin.check_login_state] 登录过程中出现验证码,请手动验证")
         current_cookie = await self.browser_context.cookies()
         _, cookie_dict = utils.convert_cookies(current_cookie)
@@ -48,7 +48,7 @@ class XHSLogin(AbstractLogin):
     async def begin(self):
         """Start login xiaohongshu"""
-        utils.logger.info("Begin login xiaohongshu ...")
+        utils.logger.info("[XHSLogin.begin] Begin login xiaohongshu ...")
         if self.login_type == "qrcode":
             await self.login_by_qrcode()
         elif self.login_type == "phone":
@@ -56,11 +56,11 @@ class XHSLogin(AbstractLogin):
         elif self.login_type == "cookie":
             await self.login_by_cookies()
         else:
-            raise ValueError("Invalid Login Type Currently only supported qrcode or phone or cookies ...")
+            raise ValueError("[XHSLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookies ...")
     async def login_by_mobile(self):
         """Login xiaohongshu by mobile"""
-        utils.logger.info("Begin login xiaohongshu by mobile ...")
+        utils.logger.info("[XHSLogin.login_by_mobile] Begin login xiaohongshu by mobile ...")
         await asyncio.sleep(1)
         try:
             # the login dialog may not pop up automatically after landing on the xhs home page, so click the login button manually
@@ -77,7 +77,7 @@ class XHSLogin(AbstractLogin):
             )
             await element.click()
         except Exception as e:
-            utils.logger.info("have not found mobile button icon and keep going ...")
+            utils.logger.info("[XHSLogin.login_by_mobile] have not found mobile button icon and keep going ...")
         await asyncio.sleep(1)
         login_container_ele = await self.context_page.wait_for_selector("div.login-container")
@@ -93,7 +93,7 @@ class XHSLogin(AbstractLogin):
         max_get_sms_code_time = 60 * 2  # wait at most 2 minutes for the sms code
         no_logged_in_session = ""
         while max_get_sms_code_time > 0:
-            utils.logger.info(f"get sms code from redis remaining time {max_get_sms_code_time}s ...")
+            utils.logger.info(f"[XHSLogin.login_by_mobile] get sms code from redis remaining time {max_get_sms_code_time}s ...")
             await asyncio.sleep(1)
             sms_code_key = f"xhs_{self.login_phone}"
             sms_code_value = redis_obj.get(sms_code_key)
@@ -119,16 +119,16 @@ class XHSLogin(AbstractLogin):
         try:
             await self.check_login_state(no_logged_in_session)
         except RetryError:
-            utils.logger.info("Login xiaohongshu failed by mobile login method ...")
+            utils.logger.info("[XHSLogin.login_by_mobile] Login xiaohongshu failed by mobile login method ...")
             sys.exit()
         wait_redirect_seconds = 5
-        utils.logger.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
+        utils.logger.info(f"[XHSLogin.login_by_mobile] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
         await asyncio.sleep(wait_redirect_seconds)
     async def login_by_qrcode(self):
         """login xiaohongshu website and keep webdriver login state"""
-        utils.logger.info("Begin login xiaohongshu by qrcode ...")
+        utils.logger.info("[XHSLogin.login_by_qrcode] Begin login xiaohongshu by qrcode ...")
         # login_selector = "div.login-container > div.left > div.qrcode > img"
         qrcode_img_selector = "xpath=//img[@class='qrcode-img']"
         # find login qrcode
@@ -137,7 +137,7 @@ class XHSLogin(AbstractLogin):
             selector=qrcode_img_selector
         )
         if not base64_qrcode_img:
-            utils.logger.info("login failed , have not found qrcode please check ....")
+            utils.logger.info("[XHSLogin.login_by_qrcode] login failed , have not found qrcode please check ....")
             # if this website does not automatically popup login dialog box, we will manual click login button
             await asyncio.sleep(0.5)
             login_button_ele = self.context_page.locator("xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button")
@@ -161,20 +161,20 @@ class XHSLogin(AbstractLogin):
         partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
         asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)
-        utils.logger.info(f"waiting for scan code login, remaining time is 120s")
+        utils.logger.info(f"[XHSLogin.login_by_qrcode] waiting for scan code login, remaining time is 120s")
         try:
             await self.check_login_state(no_logged_in_session)
         except RetryError:
-            utils.logger.info("Login xiaohongshu failed by qrcode login method ...")
+            utils.logger.info("[XHSLogin.login_by_qrcode] Login xiaohongshu failed by qrcode login method ...")
             sys.exit()
         wait_redirect_seconds = 5
-        utils.logger.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
+        utils.logger.info(f"[XHSLogin.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
         await asyncio.sleep(wait_redirect_seconds)
     async def login_by_cookies(self):
         """login xiaohongshu website by cookies"""
-        utils.logger.info("Begin login xiaohongshu by cookie ...")
+        utils.logger.info("[XHSLogin.login_by_cookies] Begin login xiaohongshu by cookie ...")
         for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
             if key != "web_session":  # only set web_session cookie attr
                 continue


@@ -85,7 +85,7 @@ async def update_bilibili_video(video_item: Dict):
         "video_url": f"https://www.bilibili.com/video/av{video_id}",
         "video_cover_url": video_item_view.get("pic", ""),
     }
-    utils.logger.info(f"bilibili video id:{video_id}, title:{local_db_item.get('title')}")
+    utils.logger.info(f"[models.bilibili.update_bilibili_video] bilibili video id:{video_id}, title:{local_db_item.get('title')}")
     if config.IS_SAVED_DATABASED:
         if not await BilibiliVideo.filter(video_id=video_id).exists():
             local_db_item["add_ts"] = utils.get_current_timestamp()
@@ -131,7 +131,7 @@ async def update_bilibili_video_comment(video_id: str, comment_item: Dict):
         "sub_comment_count": str(comment_item.get("rcount", 0)),
         "last_modify_ts": utils.get_current_timestamp(),
     }
-    utils.logger.info(f"Bilibili video comment: {comment_id}, content: {local_db_item.get('content')}")
+    utils.logger.info(f"[models.bilibili.update_bilibili_video_comment] Bilibili video comment: {comment_id}, content: {local_db_item.get('content')}")
     if config.IS_SAVED_DATABASED:
         if not await BilibiliComment.filter(comment_id=comment_id).exists():
             local_db_item["add_ts"] = utils.get_current_timestamp()


@@ -88,7 +88,7 @@ async def update_douyin_aweme(aweme_item: Dict):
         "last_modify_ts": utils.get_current_timestamp(),
         "aweme_url": f"https://www.douyin.com/video/{aweme_id}"
     }
-    print(f"douyin aweme id:{aweme_id}, title:{local_db_item.get('title')}")
+    utils.logger.info(f"[models.douyin.update_douyin_aweme] douyin aweme id:{aweme_id}, title:{local_db_item.get('title')}")
     if config.IS_SAVED_DATABASED:
         if not await DouyinAweme.filter(aweme_id=aweme_id).exists():
             local_db_item["add_ts"] = utils.get_current_timestamp()
@@ -123,7 +123,7 @@ async def batch_update_dy_aweme_comments(aweme_id: str, comments: List[Dict]):
 async def update_dy_aweme_comment(aweme_id: str, comment_item: Dict):
     comment_aweme_id = comment_item.get("aweme_id")
     if aweme_id != comment_aweme_id:
-        print(f"comment_aweme_id: {comment_aweme_id} != aweme_id: {aweme_id}")
+        utils.logger.error(f"[models.douyin.update_dy_aweme_comment] comment_aweme_id: {comment_aweme_id} != aweme_id: {aweme_id}")
         return
     user_info = comment_item.get("user", {})
     comment_id = comment_item.get("cid")
@@ -145,7 +145,7 @@ async def update_dy_aweme_comment(aweme_id: str, comment_item: Dict):
         "sub_comment_count": str(comment_item.get("reply_comment_total", 0)),
         "last_modify_ts": utils.get_current_timestamp(),
     }
-    print(f"douyin aweme comment: {comment_id}, content: {local_db_item.get('content')}")
+    utils.logger.info(f"[models.douyin.update_dy_aweme_comment] douyin aweme comment: {comment_id}, content: {local_db_item.get('content')}")
     if config.IS_SAVED_DATABASED:
         if not await DouyinAwemeComment.filter(comment_id=comment_id).exists():
             local_db_item["add_ts"] = utils.get_current_timestamp()


@@ -80,7 +80,7 @@ async def update_kuaishou_video(video_item: Dict):
         "video_cover_url": photo_info.get("coverUrl", ""),
         "video_play_url": photo_info.get("photoUrl", ""),
     }
-    print(f"Kuaishou video id:{video_id}, title:{local_db_item.get('title')}")
+    utils.logger.info(f"[models.kuaishou.update_kuaishou_video] Kuaishou video id:{video_id}, title:{local_db_item.get('title')}")
     if config.IS_SAVED_DATABASED:
         if not await KuaishouVideo.filter(video_id=video_id).exists():
             local_db_item["add_ts"] = utils.get_current_timestamp()
@@ -106,7 +106,7 @@ async def update_kuaishou_video(video_item: Dict):
 async def batch_update_ks_video_comments(video_id: str, comments: List[Dict]):
-    utils.logger.info(f"[batch_update_ks_video_comments] video_id:{video_id}, comments:{comments}")
+    utils.logger.info(f"[KuaishouVideoComment.batch_update_ks_video_comments] video_id:{video_id}, comments:{comments}")
     if not comments:
         return
     for comment_item in comments:
@@ -126,7 +126,7 @@ async def update_ks_video_comment(video_id: str, comment_item: Dict):
         "sub_comment_count": str(comment_item.get("subCommentCount", 0)),
         "last_modify_ts": utils.get_current_timestamp(),
     }
-    print(f"Kuaishou video comment: {comment_id}, content: {local_db_item.get('content')}")
+    utils.logger.info(f"[models.kuaishou.update_ks_video_comment] Kuaishou video comment: {comment_id}, content: {local_db_item.get('content')}")
     if config.IS_SAVED_DATABASED:
         if not await KuaishouVideoComment.filter(comment_id=comment_id).exists():
             local_db_item["add_ts"] = utils.get_current_timestamp()


@@ -86,7 +86,7 @@ async def update_xhs_note(note_item: Dict):
         "last_modify_ts": utils.get_current_timestamp(),
         "note_url": f"https://www.xiaohongshu.com/explore/{note_id}"
     }
-    print("xhs note:", local_db_item)
+    utils.logger.info(f"[models.xiaohongshu.update_xhs_note] xhs note: {local_db_item}")
     if config.IS_SAVED_DATABASED:
         if not await XHSNote.filter(note_id=note_id).first():
             local_db_item["add_ts"] = utils.get_current_timestamp()
@@ -125,7 +125,7 @@ async def update_xhs_note_comment(note_id: str, comment_item: Dict):
         "sub_comment_count": comment_item.get("sub_comment_count"),
         "last_modify_ts": utils.get_current_timestamp(),
     }
-    print("xhs note comment:", local_db_item)
+    utils.logger.info(f"[models.xiaohongshu.update_xhs_note_comment] xhs note comment:{local_db_item}")
     if config.IS_SAVED_DATABASED:
         if not await XHSNoteComment.filter(comment_id=comment_id).first():
             local_db_item["add_ts"] = utils.get_current_timestamp()


@@ -115,7 +115,7 @@ class JiSuHttpProxy(ProxyProvider):
         ip_infos = []
         async with httpx.AsyncClient() as client:
             url = self.api_path + "/fetchips" + '?' + urlencode(self.params)
-            utils.logger.info(f"[JiSuHttpProxy] get ip proxy url:{url}")
+            utils.logger.info(f"[JiSuHttpProxy.get_proxies] get ip proxy url:{url}")
             response = await client.get(url, headers={
                 "User-Agent": "MediaCrawler https://github.com/NanmiCoder/MediaCrawler"})
             res_dict: Dict = response.json()