refactor: Optimize part of the Douyin Crawler code

fix: Fix logging initialization error
Relakkes 2023-07-15 21:30:12 +08:00
parent dad8d56ab5
commit 2398a17e21
10 changed files with 186 additions and 152 deletions

.gitignore

@@ -163,5 +163,4 @@ cython_debug/
*.iml
.idea
/temp_image/
/xhs_user_data_dir/
/dy_user_data_dir/
/browser_data/


@@ -1,3 +1,5 @@
from typing import Tuple, Optional
import config
@@ -8,14 +10,14 @@ class PhonePool:
self.phones = []
self.used_phones = set()
def add_phone(self, phone):
def add_phone(self, phone: str) -> bool:
"""add phone to the pool"""
if phone not in self.phones:
self.phones.append(phone)
return True
return False
def remove_phone(self, phone):
def remove_phone(self, phone: str) -> bool:
"""remove phone from the pool"""
if phone in self.used_phones:
self.phones.remove(phone)
@@ -23,7 +25,7 @@ class PhonePool:
return True
return False
def get_phone(self):
def get_phone(self) -> Optional[str]:
"""get phone and mark as used"""
if self.phones:
left_phone = self.phones.pop(0)
@@ -49,7 +51,7 @@ class IPPool:
return True
return False
def remove_ip(self, ip):
def remove_ip(self, ip: str) -> bool:
"""remove ip"""
if ip in self.used_ips:
self.ips.remove(ip)
@@ -57,7 +59,7 @@ class IPPool:
return True
return False
def get_ip(self):
def get_ip(self) -> Optional[str]:
"""get ip and mark as used"""
if self.ips:
left_ips = self.ips.pop(0)
@@ -78,19 +80,19 @@ class AccountPool:
self.phone_pool = PhonePool()
self.ip_pool = IPPool()
def add_account(self, phone, ip):
def add_account(self, phone: str, ip: str) -> bool:
"""add account to pool with phone and ip"""
if self.phone_pool.add_phone(phone) and self.ip_pool.add_ip(ip):
return True
return False
def remove_account(self, phone, ip):
def remove_account(self, phone: str, ip: str) -> bool:
"""remove account from pool """
if self.phone_pool.remove_phone(phone) and self.ip_pool.remove_ip(ip):
return True
return False
def get_account(self):
def get_account(self) -> Tuple[str, str]:
"""get account if no account, reload account pool"""
phone = self.phone_pool.get_phone()
ip = self.ip_pool.get_ip()
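Note: the pool changes above are additive typing only, but the get_account contract (reload the pool when it runs dry) is easy to misread. A minimal usage sketch, assuming the module path below (it is not shown in this diff) and placeholder phone/proxy values:

from base.proxy_account_pool import AccountPool  # assumed module path

pool = AccountPool()
pool.add_account(phone="13012345671", ip="111.122.11.11:8888")  # placeholder values
phone, ip = pool.get_account()  # Tuple[str, str]; reloads the pool if it is empty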

main.py

@@ -21,7 +21,6 @@ class CrawlerFactory:
async def main():
utils.init_loging_config()
# define command line params ...
parser = argparse.ArgumentParser(description='Media crawler program.')
parser.add_argument('--platform', type=str, help='Media platform select (xhs|dy)...', default=config.PLATFORM)
@@ -38,20 +37,6 @@ async def main():
)
await crawler.start()
"""
# retry when exception ...
while True:
try:
await crawler.start()
except Exception as e:
logging.info(f"crawler start error: {e} ...")
await crawler.close()
# If you encounter an exception
# sleep for a period of time before retrying
# to avoid frequent requests that may result in the account being blocked.
await asyncio.sleep(config.RETRY_INTERVAL)
"""
if __name__ == '__main__':
try:


@@ -6,9 +6,11 @@ import httpx
import execjs
import urllib.parse
from playwright.async_api import Page
from playwright.async_api import BrowserContext
from .field import *
from .exception import *
from tools import utils
class DOUYINClient:
@@ -33,7 +35,6 @@ class DOUYINClient:
headers = headers or self.headers
local_storage: Dict = await self.playwright_page.evaluate("() => window.localStorage")
douyin_js_obj = execjs.compile(open('libs/douyin.js').read())
# douyin_js_obj = execjs.compile(open('libs/X-Bogus.js').read())
common_params = {
"device_platform": "webapp",
"aid": "6383",
@@ -82,6 +83,17 @@ class DOUYINClient:
headers = headers or self.headers
return await self.request(method="POST", url=f"{self._host}{uri}", data=data, headers=headers)
@staticmethod
async def ping(browser_context: BrowserContext) -> bool:
_, cookie_dict = utils.convert_cookies(await browser_context.cookies())
# TODO: call a lightweight API endpoint to verify the login status
return cookie_dict.get("LOGIN_STATUS") == "1"
async def update_cookies(self, browser_context: BrowserContext):
cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies())
self.headers["Cookie"] = cookie_str
self.cookie_dict = cookie_dict
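Note: ping and update_cookies are what let the crawler skip interactive login when a persisted session is still valid. A sketch of the intended call order, mirroring the core.py change further down (dy_client, login_obj and browser_context are assumed to already exist):

# Assumed: dy_client is a DOUYINClient, browser_context a Playwright BrowserContext.
if not await dy_client.ping(browser_context=browser_context):
    await login_obj.begin()  # interactive fallback: qrcode / phone / cookies
    await dy_client.update_cookies(browser_context=browser_context)  # sync headers with the fresh session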
async def search_info_by_keyword(
self,
keyword: str,


@@ -1,13 +1,14 @@
import logging
import os
import asyncio
import logging
from asyncio import Task
from argparse import Namespace
from typing import Optional, List, Dict, Tuple
from playwright.async_api import async_playwright
from playwright.async_api import Page
from playwright.async_api import Cookie
from playwright.async_api import BrowserType
from playwright.async_api import BrowserContext
from playwright.async_api import Page
import config
from tools import utils
@@ -21,12 +22,11 @@ from models import douyin
class DouYinCrawler(AbstractCrawler):
def __init__(self):
self.cookies: Optional[List[Cookie]] = None
self.browser_context: Optional[BrowserContext] = None
self.context_page: Optional[Page] = None
self.proxy: Optional[Dict] = None
self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36" # fixed
self.dy_client: Optional[DOUYINClient] = None
self.index_url = "https://www.douyin.com"
self.command_args: Optional[Namespace] = None
self.account_pool: Optional[AccountPool] = None
@@ -34,94 +34,45 @@ class DouYinCrawler(AbstractCrawler):
for key, value in kwargs.items():
setattr(self, key, value)
def create_proxy_info(self) -> Tuple[str, Dict, str]:
"""Create proxy info for playwright and httpx"""
# phone: 13012345671
# ip_proxy: 111.122.xx.xx1:8888
# The phone number and the IP proxy both come from the account pool and are bound to each other
phone, ip_proxy = self.account_pool.get_account()
playwright_proxy = {
"server": f"{config.IP_PROXY_PROTOCOL}{ip_proxy}",
"username": config.IP_PROXY_USER,
"password": config.IP_PROXY_PASSWORD,
}
httpx_proxy = f"{config.IP_PROXY_PROTOCOL}{config.IP_PROXY_USER}:{config.IP_PROXY_PASSWORD}@{ip_proxy}"
return phone, playwright_proxy, httpx_proxy
async def start(self):
# phone: 1340xxxx, ip_proxy: 47.xxx.xxx.xxx:8888
account_phone, ip_proxy = self.account_pool.get_account()
# Logging in to Douyin through a proxy triggers its risk control, so no proxy is used here
playwright_proxy = None
# playwright_proxy = {
# "server": f"{config.ip_proxy_protocol}{ip_proxy}",
# "username": config.ip_proxy_user,
# "password": config.ip_proxy_password,
# }
httpx_proxy = f"{config.IP_PROXY_PROTOCOL}{config.IP_PROXY_USER}:{config.IP_PROXY_PASSWORD}@{ip_proxy}"
if not config.ENABLE_IP_PROXY:
playwright_proxy = None
httpx_proxy = None
account_phone, playwright_proxy, httpx_proxy = self.create_proxy_info()
async with async_playwright() as playwright:
# Launch a browser context.
chromium = playwright.chromium
browser = await chromium.launch(headless=config.HEADLESS, proxy=playwright_proxy)
self.browser_context = await browser.new_context(
viewport={"width": 1800, "height": 900},
user_agent=self.user_agent,
self.browser_context = await self.launch_browser(
chromium,
playwright_proxy,
self.user_agent,
headless=config.HEADLESS
)
# execute JS to bypass anti automation/crawler detection
# stealth.min.js is a js script to prevent the website from detecting the crawler.
await self.browser_context.add_init_script(path="libs/stealth.min.js")
self.context_page = await self.browser_context.new_page()
await self.context_page.goto("https://www.douyin.com", wait_until="domcontentloaded")
await asyncio.sleep(3)
await self.context_page.goto(self.index_url)
# begin login
login_obj = DouYinLogin(
login_type=self.command_args.lt,
login_phone=account_phone,
browser_context=self.browser_context,
context_page=self.context_page,
cookie_str=config.COOKIES
)
await login_obj.begin()
# update cookies
await self.update_cookies()
# init request client
cookie_str, cookie_dict = utils.convert_cookies(self.cookies)
self.dy_client = DOUYINClient(
proxies=httpx_proxy,
headers={
"User-Agent": self.user_agent,
"Cookie": cookie_str,
"Host": "www.douyin.com",
"Origin": "https://www.douyin.com/",
"Referer": "https://www.douyin.com/",
"Content-Type": "application/json;charset=UTF-8"
},
playwright_page=self.context_page,
cookie_dict=cookie_dict,
)
self.dy_client = await self.create_douyin_client(httpx_proxy)
if not await self.dy_client.ping(browser_context=self.browser_context):
login_obj = DouYinLogin(
login_type=self.command_args.lt,
login_phone=account_phone,
browser_context=self.browser_context,
context_page=self.context_page,
cookie_str=config.COOKIES
)
await login_obj.begin()
await self.dy_client.update_cookies(browser_context=self.browser_context)
# search_posts
await self.search_posts()
# block main crawler coroutine
await asyncio.Event().wait()
async def update_cookies(self):
self.cookies = await self.browser_context.cookies()
utils.logger.info("Douyin Crawler finished ...")
async def search_posts(self):
logging.info("Begin search douyin keywords")
utils.logger.info("Begin search douyin keywords")
for keyword in config.KEYWORDS.split(","):
logging.info(f"Current keyword: {keyword}")
utils.logger.info(f"Current keyword: {keyword}")
aweme_list: List[str] = []
max_note_len = 20
max_note_len = config.MAX_PAGE_NUM
page = 0
while max_note_len > 0:
try:
@@ -139,8 +90,8 @@ class DouYinCrawler(AbstractCrawler):
continue
aweme_list.append(aweme_info.get("aweme_id"))
await douyin.update_douyin_aweme(aweme_item=aweme_info)
print(f"keyword:{keyword}, aweme_list:{aweme_list}")
await self.batch_get_note_comments(aweme_list)
utils.logger.info(f"keyword:{keyword}, aweme_list:{aweme_list}")
# await self.batch_get_note_comments(aweme_list)
async def batch_get_note_comments(self, aweme_list: List[str]):
task_list: List[Task] = []
@@ -155,6 +106,71 @@ class DouYinCrawler(AbstractCrawler):
aweme_id=aweme_id,
callback=douyin.batch_update_dy_aweme_comments
)
print(f"aweme_id: {aweme_id} comments have all been obtained completed ...")
utils.logger.info(f"aweme_id: {aweme_id} comments have all been obtained completed ...")
except DataFetchError as e:
logging.error(f"aweme_id: {aweme_id} get comments failed, error: {e}")
def create_proxy_info(self) -> Tuple[Optional[str], Optional[Dict], Optional[str]]:
"""Create proxy info for playwright and httpx"""
if not config.ENABLE_IP_PROXY:
return None, None, None
# phone: 13012345671 ip_proxy: 111.122.xx.xx1:8888
phone, ip_proxy = self.account_pool.get_account()
playwright_proxy = {
"server": f"{config.IP_PROXY_PROTOCOL}{ip_proxy}",
"username": config.IP_PROXY_USER,
"password": config.IP_PROXY_PASSWORD,
}
httpx_proxy = f"{config.IP_PROXY_PROTOCOL}{config.IP_PROXY_USER}:{config.IP_PROXY_PASSWORD}@{ip_proxy}"
return phone, playwright_proxy, httpx_proxy
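For reference, the two return shapes differ only in packaging; with placeholder values (protocol "http://", user "u", password "p", proxy "1.2.3.4:8888") they would look like:

# Placeholder values, for illustration only.
playwright_proxy = {"server": "http://1.2.3.4:8888", "username": "u", "password": "p"}
httpx_proxy = "http://u:p@1.2.3.4:8888"  # httpx takes the credentials inline in the URL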
async def create_douyin_client(self, httpx_proxy: str) -> DOUYINClient:
"""Create douyin client"""
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
douyin_client = DOUYINClient(
proxies=httpx_proxy,
headers={
"User-Agent": self.user_agent,
"Cookie": cookie_str,
"Host": "www.douyin.com",
"Origin": "https://www.douyin.com/",
"Referer": "https://www.douyin.com/",
"Content-Type": "application/json;charset=UTF-8"
},
playwright_page=self.context_page,
cookie_dict=cookie_dict,
)
return douyin_client
async def launch_browser(
self,
chromium: BrowserType,
playwright_proxy: Optional[Dict],
user_agent: Optional[str],
headless: bool = True
) -> BrowserContext:
"""Launch browser and create browser context"""
if config.SAVE_LOGIN_STATE:
user_data_dir = os.path.join(os.getcwd(), "browser_data", config.USER_DATA_DIR % self.command_args.platform)
browser_context = await chromium.launch_persistent_context(
user_data_dir=user_data_dir,
accept_downloads=True,
headless=headless,
proxy=playwright_proxy,
viewport={"width": 1920, "height": 1080},
user_agent=user_agent
)
return browser_context
else:
browser = await chromium.launch(headless=headless, proxy=playwright_proxy)
browser_context = await browser.new_context(
viewport={"width": 1920, "height": 1080},
user_agent=user_agent
)
return browser_context
async def close(self):
"""Close browser context"""
await self.browser_context.close()
utils.logger.info("Browser context closed ...")


@@ -8,13 +8,14 @@ from tenacity import (
retry,
stop_after_attempt,
wait_fixed,
retry_if_result
retry_if_result,
RetryError
)
from playwright.async_api import Page, TimeoutError as PlaywrightTimeoutError
from playwright.async_api import BrowserContext
import config
from tools import utils, easing
from tools import utils
from base.base_crawler import AbstractLogin
@@ -54,21 +55,22 @@ class DouYinLogin(AbstractLogin):
raise ValueError("Invalid login type; currently only qrcode or phone are supported ...")
# If the page redirects to the slider captcha page, the slider has to be completed again
await asyncio.sleep(3)
await asyncio.sleep(6)
current_page_title = await self.context_page.title()
if "验证码中间页" in current_page_title:
await self.check_page_display_slider(move_step=3, slider_level="hard")
# check login state
logging.info(f"login finished then check login state ...")
login_flag: bool = await self.check_login_state()
if not login_flag:
logging.info("login failed please confirm ...")
utils.logger.info(f"login finished then check login state ...")
try:
await self.check_login_state()
except RetryError:
utils.logger.info("login failed please confirm ...")
sys.exit()
# wait for redirect
wait_redirect_seconds = 5
logging.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
utils.logger.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
await asyncio.sleep(wait_redirect_seconds)
@retry(stop=stop_after_attempt(20), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
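Note: the login check now fails by exception instead of by return value: check_login_state returns False while logged out, retry_if_result retries it (20 attempts, 1 s apart), and exhaustion surfaces as RetryError, which begin() catches above. A minimal self-contained sketch of the same tenacity pattern:

from tenacity import RetryError, retry, retry_if_result, stop_after_attempt, wait_fixed

@retry(stop=stop_after_attempt(3), wait=wait_fixed(1),
       retry=retry_if_result(lambda value: value is False))
def check_state() -> bool:
    return False  # placeholder: simulate a login that never completes

try:
    check_state()
except RetryError:
    print("login state check exhausted all attempts")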
@@ -88,31 +90,31 @@ class DouYinLogin(AbstractLogin):
await self.context_page.wait_for_selector(dialog_selector, timeout=1000 * 10)
except Exception as e:
logging.error(f"login dialog box does not pop up automatically, error: {e}")
logging.info("login dialog box does not pop up automatically, we will manually click the login button")
utils.logger.info("login dialog box does not pop up automatically, we will manually click the login button")
login_button_ele = self.context_page.locator("xpath=//p[text() = '登录']")
await login_button_ele.click()
await asyncio.sleep(0.5)
async def login_by_qrcode(self):
logging.info("Begin login douyin by qrcode...")
utils.logger.info("Begin login douyin by qrcode...")
qrcode_img_selector = "xpath=//article[@class='web-login']//img"
base64_qrcode_img = await utils.find_login_qrcode(
self.context_page,
selector=qrcode_img_selector
)
if not base64_qrcode_img:
logging.info("login qrcode not found please confirm ...")
utils.logger.info("login qrcode not found please confirm ...")
sys.exit()
# show login qrcode
# utils.show_qrcode(base64_qrcode_img)
# utils.show_qrcode(base64_qrcode_img)
partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)
utils.show_qrcode(base64_qrcode_img)
await asyncio.sleep(2)
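Note: the qrcode is now rendered through run_in_executor so the blocking image viewer cannot stall the event loop; the returned future is deliberately not awaited, so the loop keeps polling the login state while the image is on screen. The same pattern in isolation (show_image stands in for utils.show_qrcode):

import asyncio
import functools
import time

def show_image(data: str) -> None:
    time.sleep(1)  # stand-in for a blocking viewer such as utils.show_qrcode

async def main() -> None:
    loop = asyncio.get_running_loop()
    # partial binds the argument up front; run_in_executor only forwards positional args
    loop.run_in_executor(None, functools.partial(show_image, "base64-data..."))
    await asyncio.sleep(2)  # the event loop stays responsive while the image shows

asyncio.run(main())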
async def login_by_mobile(self):
logging.info("Begin login douyin by mobile ...")
utils.logger.info("Begin login douyin by mobile ...")
mobile_tap_ele = self.context_page.locator("xpath=//li[text() = '验证码登录']")
await mobile_tap_ele.click()
await self.context_page.wait_for_selector("xpath=//article[@class='web-login-mobile-code']")
@@ -128,7 +130,7 @@ class DouYinLogin(AbstractLogin):
redis_obj = aioredis.from_url(url=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD, decode_responses=True)
max_get_sms_code_time = 60 * 2  # wait at most 2 minutes for the SMS code
while max_get_sms_code_time > 0:
logging.info(f"get douyin sms code from redis remaining time {max_get_sms_code_time}s ...")
utils.logger.info(f"get douyin sms code from redis remaining time {max_get_sms_code_time}s ...")
await asyncio.sleep(1)
sms_code_key = f"dy_{self.login_phone}"
sms_code_value = await redis_obj.get(sms_code_key)
@@ -170,20 +172,20 @@ class DouYinLogin(AbstractLogin):
# If the slider moved too slowly or verification failed, the page shows a "too slow" hint; click the refresh button here
page_content = await self.context_page.content()
if "操作过慢" in page_content or "提示重新操作" in page_content:
logging.info("slider verify failed, retry ...")
utils.logger.info("slider verify failed, retry ...")
await self.context_page.click(selector="//a[contains(@class, 'secsdk_captcha_refresh')]")
continue
# After a successful slide, wait for the slider to disappear
await self.context_page.wait_for_selector(selector=back_selector, state="hidden", timeout=1000)
# If the slider disappears, verification succeeded and we break out of the loop; if it is still visible, verification failed, the wait above raises, and the loop retries the slider
logging.info("slider verify success ...")
utils.logger.info("slider verify success ...")
slider_verify_success = True
except Exception as e:
logging.error(f"slider verify failed, error: {e}")
await asyncio.sleep(1)
max_slider_try_times -= 1
logging.info(f"remaining slider try times: {max_slider_try_times}")
utils.logger.info(f"remaining slider try times: {max_slider_try_times}")
continue
async def move_slider(self, back_selector: str, gap_selector: str, move_step: int = 10, slider_level="easy"):
@@ -240,7 +242,7 @@ class DouYinLogin(AbstractLogin):
await self.context_page.mouse.up()
async def login_by_cookies(self):
logging.info("Begin login douyin by cookie ...")
utils.logger.info("Begin login douyin by cookie ...")
for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
await self.browser_context.add_cookies([{
'name': key,


@@ -82,7 +82,7 @@ class XHSClient:
async def ping(self) -> bool:
"""get a note to check if login state is ok"""
logging.info("begin to ping xhs...")
utils.logger.info("begin to ping xhs...")
note_id = "5e5cb38a000000000100185e"
try:
note_card: Dict = await self.get_note_by_id(note_id)


@@ -1,3 +1,4 @@
import os
import random
import asyncio
import logging
@@ -8,6 +9,7 @@ from argparse import Namespace
from playwright.async_api import Page
from playwright.async_api import BrowserContext
from playwright.async_api import async_playwright
from playwright.async_api import BrowserType
import config
from tools import utils
@@ -31,8 +33,8 @@ class XiaoHongShuCrawler(AbstractCrawler):
self.account_pool: Optional[AccountPool] = None
def init_config(self, **kwargs):
for key in kwargs.keys():
setattr(self, key, kwargs[key])
for key, value in kwargs.items():
setattr(self, key, value)
async def start(self):
account_phone, playwright_proxy, httpx_proxy = self.create_proxy_info()
@@ -66,13 +68,13 @@ class XiaoHongShuCrawler(AbstractCrawler):
# Search for notes and retrieve their comment information.
await self.search_posts()
logging.info("Xhs Crawler finished ...")
utils.logger.info("Xhs Crawler finished ...")
async def search_posts(self):
"""Search for notes and retrieve their comment information."""
logging.info("Begin search xiaohongshu keywords")
utils.logger.info("Begin search xiaohongshu keywords")
for keyword in config.KEYWORDS.split(","):
logging.info(f"Current keyword: {keyword}")
utils.logger.info(f"Current keyword: {keyword}")
note_list: List[str] = []
max_note_len = config.MAX_PAGE_NUM
page = 1
@@ -88,13 +90,13 @@ class XiaoHongShuCrawler(AbstractCrawler):
try:
note_detail = await self.xhs_client.get_note_by_id(note_id)
except DataFetchError as ex:
logging.error(f"Get note detail error: {ex}")
utils.logger.error(f"Get note detail error: {ex}")
continue
await xhs_model.update_xhs_note(note_detail)
await asyncio.sleep(0.05)
note_list.append(note_id)
logging.info(f"keyword:{keyword}, note_list:{note_list}")
# await self.batch_get_note_comments(note_list)
utils.logger.info(f"keyword:{keyword}, note_list:{note_list}")
await self.batch_get_note_comments(note_list)
async def batch_get_note_comments(self, note_list: List[str]):
"""Batch get note comments"""
@@ -106,7 +108,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
async def get_comments(self, note_id: str):
"""Get note comments"""
logging.info(f"Begin get note id comments {note_id}")
utils.logger.info(f"Begin get note id comments {note_id}")
all_comments = await self.xhs_client.get_note_all_comments(note_id=note_id, crawl_interval=random.random())
for comment in all_comments:
await xhs_model.update_xhs_note_comment(note_id=note_id, comment_item=comment)
@@ -143,12 +145,21 @@ class XiaoHongShuCrawler(AbstractCrawler):
)
return xhs_client_obj
async def launch_browser(self, chromium, playwright_proxy, user_agent, headless=True) -> BrowserContext:
async def launch_browser(
self,
chromium: BrowserType,
playwright_proxy: Optional[Dict],
user_agent: Optional[str],
headless: bool = True
) -> BrowserContext:
utils.logger.info("Begin create browser context ...")
"""Launch browser and create browser context"""
if config.SAVE_LOGIN_STATE:
# feat issue #14
# we will save login state to avoid login every time
user_data_dir = os.path.join(os.getcwd(), "browser_data", config.USER_DATA_DIR % self.command_args.platform)
browser_context = await chromium.launch_persistent_context(
user_data_dir=config.USER_DATA_DIR % self.command_args.platform,
user_data_dir=user_data_dir,
accept_downloads=True,
headless=headless,
proxy=playwright_proxy,
@@ -167,4 +178,4 @@ class XiaoHongShuCrawler(AbstractCrawler):
async def close(self):
"""Close browser context"""
await self.browser_context.close()
logging.info("Browser context closed ...")
utils.logger.info("Browser context closed ...")


@@ -50,7 +50,7 @@ class XHSLogin(AbstractLogin):
async def begin(self):
"""Start login xiaohongshu"""
logging.info("Begin login xiaohongshu ...")
utils.logger.info("Begin login xiaohongshu ...")
if self.login_type == "qrcode":
await self.login_by_qrcode()
elif self.login_type == "phone":
@@ -62,7 +62,7 @@ class XHSLogin(AbstractLogin):
async def login_by_mobile(self):
"""Login xiaohongshu by mobile"""
logging.info("Begin login xiaohongshu by mobile ...")
utils.logger.info("Begin login xiaohongshu by mobile ...")
await asyncio.sleep(1)
try:
# After reaching the xiaohongshu homepage, the login dialog may not pop up automatically and the login button must be clicked manually
@@ -79,21 +79,24 @@ class XHSLogin(AbstractLogin):
)
await element.click()
except Exception as e:
logging.info("have not found mobile button icon and keep going ...")
utils.logger.info("have not found mobile button icon and keep going ...")
await asyncio.sleep(1)
login_container_ele = await self.context_page.wait_for_selector("div.login-container")
input_ele = await login_container_ele.query_selector("label.phone > input")
await input_ele.fill(self.login_phone)
await asyncio.sleep(0.5)
send_btn_ele = await login_container_ele.query_selector("label.auth-code > span")
await send_btn_ele.click()  # click to send the SMS code
sms_code_input_ele = await login_container_ele.query_selector("label.auth-code > input")
submit_btn_ele = await login_container_ele.query_selector("div.input-container > button")
redis_obj = aioredis.from_url(url=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD, decode_responses=True)
max_get_sms_code_time = 60 * 2  # wait at most 2 minutes for the SMS code
no_logged_in_session = ""
while max_get_sms_code_time > 0:
logging.info(f"get sms code from redis remaining time {max_get_sms_code_time}s ...")
utils.logger.info(f"get sms code from redis remaining time {max_get_sms_code_time}s ...")
await asyncio.sleep(1)
sms_code_key = f"xhs_{self.login_phone}"
sms_code_value = await redis_obj.get(sms_code_key)
@@ -119,17 +122,16 @@ class XHSLogin(AbstractLogin):
try:
await self.check_login_state(no_logged_in_session)
except RetryError:
logging.info("Login xiaohongshu failed by mobile login method ...")
utils.logger.info("Login xiaohongshu failed by mobile login method ...")
sys.exit()
wait_redirect_seconds = 5
logging.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
utils.logger.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
await asyncio.sleep(wait_redirect_seconds)
async def login_by_qrcode(self):
"""login xiaohongshu website and keep webdriver login state"""
logging.info("Begin login xiaohongshu by qrcode ...")
await asyncio.sleep(10)
utils.logger.info("Begin login xiaohongshu by qrcode ...")
# login_selector = "div.login-container > div.left > div.qrcode > img"
qrcode_img_selector = "xpath=//img[@class='qrcode-img']"
# find login qrcode
@@ -138,7 +140,7 @@ class XHSLogin(AbstractLogin):
selector=qrcode_img_selector
)
if not base64_qrcode_img:
logging.info("login failed , have not found qrcode please check ....")
utils.logger.info("login failed , have not found qrcode please check ....")
# if this website does not automatically popup login dialog box, we will manual click login button
await asyncio.sleep(0.5)
login_button_ele = self.context_page.locator("xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button")
@@ -162,20 +164,20 @@ class XHSLogin(AbstractLogin):
partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)
logging.info(f"waiting for scan code login, remaining time is 20s")
utils.logger.info(f"waiting for scan code login, remaining time is 20s")
try:
await self.check_login_state(no_logged_in_session)
except RetryError:
logging.info("Login xiaohongshu failed by qrcode login method ...")
utils.logger.info("Login xiaohongshu failed by qrcode login method ...")
sys.exit()
wait_redirect_seconds = 5
logging.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
utils.logger.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
await asyncio.sleep(wait_redirect_seconds)
async def login_by_cookies(self):
"""login xiaohongshu website by cookies"""
logging.info("Begin login xiaohongshu by cookie ...")
utils.logger.info("Begin login xiaohongshu by cookie ...")
for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
await self.browser_context.add_cookies([{
'name': key,


@@ -108,7 +108,12 @@ def init_loging_config():
format="%(asctime)s %(name)s %(levelname)s %(message)s ",
datefmt='%Y-%m-%d %H:%M:%S'
)
logging.Logger("Media Crawler")
_logger = logging.getLogger("MediaCrawler")
_logger.setLevel(level)
return _logger
logger = init_loging_config()
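Note on the fix: the old code constructed a Logger directly with logging.Logger("Media Crawler") and discarded the result, so nothing was ever registered with the logging hierarchy; logging.getLogger fetches or creates a named logger from the manager, and the new module-level logger hands every module the same instance. A small sketch of the difference:

import logging

detached = logging.Logger("MediaCrawler")    # direct construction: never registered
shared = logging.getLogger("MediaCrawler")   # registered with the logging manager
assert logging.getLogger("MediaCrawler") is shared       # same object on every call
assert logging.getLogger("MediaCrawler") is not detached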
class Slide: