diff --git a/.gitignore b/.gitignore
index 2a4a8da..03f9900 100644
--- a/.gitignore
+++ b/.gitignore
@@ -163,5 +163,4 @@ cython_debug/
 *.iml
 .idea
 /temp_image/
-/xhs_user_data_dir/
-/dy_user_data_dir/
+/browser_data/
diff --git a/base/proxy_account_pool.py b/base/proxy_account_pool.py
index 18b7884..470e0f7 100644
--- a/base/proxy_account_pool.py
+++ b/base/proxy_account_pool.py
@@ -1,3 +1,5 @@
+from typing import Tuple, Optional
+
 import config
 
 
@@ -8,14 +10,14 @@ class PhonePool:
         self.phones = []
         self.used_phones = set()
 
-    def add_phone(self, phone):
+    def add_phone(self, phone: str) -> bool:
         """add phone to the pool"""
         if phone not in self.phones:
             self.phones.append(phone)
             return True
         return False
 
-    def remove_phone(self, phone):
+    def remove_phone(self, phone: str) -> bool:
         """remove phone from the pool"""
         if phone in self.used_phones:
             self.phones.remove(phone)
@@ -23,7 +25,7 @@ class PhonePool:
             return True
         return False
 
-    def get_phone(self):
+    def get_phone(self) -> Optional[str]:
         """get phone and mark as used"""
         if self.phones:
             left_phone = self.phones.pop(0)
@@ -49,7 +51,7 @@ class IPPool:
             return True
         return False
 
-    def remove_ip(self, ip):
+    def remove_ip(self, ip: str) -> bool:
         """remove ip"""
         if ip in self.used_ips:
             self.ips.remove(ip)
@@ -57,7 +59,7 @@ class IPPool:
             return True
         return False
 
-    def get_ip(self):
+    def get_ip(self) -> Optional[str]:
         """get ip and mark as used"""
         if self.ips:
             left_ips = self.ips.pop(0)
@@ -78,19 +80,19 @@ class AccountPool:
         self.phone_pool = PhonePool()
         self.ip_pool = IPPool()
 
-    def add_account(self, phone, ip):
+    def add_account(self, phone: str, ip: str) -> bool:
         """add account to pool with phone and ip"""
         if self.phone_pool.add_phone(phone) and self.ip_pool.add_ip(ip):
             return True
         return False
 
-    def remove_account(self, phone, ip):
+    def remove_account(self, phone: str, ip: str) -> bool:
         """remove account from pool """
         if self.phone_pool.remove_phone(phone) and self.ip_pool.remove_ip(ip):
             return True
         return False
 
-    def get_account(self):
+    def get_account(self) -> Tuple[str, str]:
         """get account if no account, reload account pool"""
         phone = self.phone_pool.get_phone()
         ip = self.ip_pool.get_ip()
diff --git a/main.py b/main.py
index 131b5b4..c682140 100644
--- a/main.py
+++ b/main.py
@@ -21,7 +21,6 @@ class CrawlerFactory:
 
 
 async def main():
-    utils.init_loging_config()
     # define command line params ...
     parser = argparse.ArgumentParser(description='Media crawler program.')
     parser.add_argument('--platform', type=str, help='Media platform select (xhs|dy)...', default=config.PLATFORM)
@@ -38,20 +37,6 @@ async def main():
     )
     await crawler.start()
 
-    """
-    # retry when exception ...
-    while True:
-        try:
-            await crawler.start()
-        except Exception as e:
-            logging.info(f"crawler start error: {e} ...")
-            await crawler.close()
-            # If you encounter an exception
-            # sleep for a period of time before retrying
-            # to avoid frequent requests that may result in the account being blocked.
-            await asyncio.sleep(config.RETRY_INTERVAL)
-    """
-
 
 if __name__ == '__main__':
     try:
diff --git a/media_platform/douyin/client.py b/media_platform/douyin/client.py
index d8c5e7f..75c78a6 100644
--- a/media_platform/douyin/client.py
+++ b/media_platform/douyin/client.py
@@ -6,9 +6,11 @@ import httpx
 import execjs
 import urllib.parse
 from playwright.async_api import Page
+from playwright.async_api import BrowserContext
 
 from .field import *
 from .exception import *
+from tools import utils
 
 
 class DOUYINClient:
@@ -33,7 +35,6 @@ class DOUYINClient:
         headers = headers or self.headers
         local_storage: Dict = await self.playwright_page.evaluate("() => window.localStorage")
         douyin_js_obj = execjs.compile(open('libs/douyin.js').read())
-        # douyin_js_obj = execjs.compile(open('libs/X-Bogus.js').read())
         common_params = {
             "device_platform": "webapp",
             "aid": "6383",
@@ -82,6 +83,17 @@ class DOUYINClient:
         headers = headers or self.headers
         return await self.request(method="POST", url=f"{self._host}{uri}", data=data, headers=headers)
 
+    @staticmethod
+    async def ping(browser_context: BrowserContext) -> bool:
+        _, cookie_dict = utils.convert_cookies(await browser_context.cookies())
+        # todo send some api to test login status
+        return cookie_dict.get("LOGIN_STATUS") == "1"
+
+    async def update_cookies(self, browser_context: BrowserContext):
+        cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies())
+        self.headers["Cookie"] = cookie_str
+        self.cookie_dict = cookie_dict
+
     async def search_info_by_keyword(
             self,
             keyword: str,
diff --git a/media_platform/douyin/core.py b/media_platform/douyin/core.py
index 520628e..4463ca0 100644
--- a/media_platform/douyin/core.py
+++ b/media_platform/douyin/core.py
@@ -1,13 +1,14 @@
-import logging
+import os
 import asyncio
+import logging
 from asyncio import Task
 from argparse import Namespace
 from typing import Optional, List, Dict, Tuple
 
 from playwright.async_api import async_playwright
-from playwright.async_api import Page
-from playwright.async_api import Cookie
+from playwright.async_api import BrowserType
 from playwright.async_api import BrowserContext
+from playwright.async_api import Page
 
 import config
 from tools import utils
@@ -21,12 +22,11 @@ from models import douyin
 
 class DouYinCrawler(AbstractCrawler):
     def __init__(self):
-        self.cookies: Optional[List[Cookie]] = None
         self.browser_context: Optional[BrowserContext] = None
         self.context_page: Optional[Page] = None
-        self.proxy: Optional[Dict] = None
         self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"  # fixed
         self.dy_client: Optional[DOUYINClient] = None
+        self.index_url = "https://www.douyin.com"
         self.command_args: Optional[Namespace] = None
         self.account_pool: Optional[AccountPool] = None
 
@@ -34,94 +34,45 @@ class DouYinCrawler(AbstractCrawler):
         for key, value in kwargs.items():
             setattr(self, key, value)
 
-    def create_proxy_info(self) -> Tuple[str, Dict, str]:
-        """Create proxy info for playwright and httpx"""
-        # phone: 13012345671
-        # ip_proxy: 111.122.xx.xx1:8888
-        # Both the phone number and the IP proxy come from the account pool, and they are bound to each other
-        phone, ip_proxy = self.account_pool.get_account()
-        playwright_proxy = {
-            "server": f"{config.IP_PROXY_PROTOCOL}{ip_proxy}",
-            "username": config.IP_PROXY_USER,
-            "password": config.IP_PROXY_PASSWORD,
-        }
-        httpx_proxy = f"{config.IP_PROXY_PROTOCOL}{config.IP_PROXY_USER}:{config.IP_PROXY_PASSWORD}@{ip_proxy}"
-        return phone, playwright_proxy, httpx_proxy
-
     async def start(self):
-        # phone: 1340xxxx, ip_proxy: 47.xxx.xxx.xxx:8888
-        account_phone, ip_proxy = self.account_pool.get_account()
-
-        # Logging in to Douyin through a proxy triggers risk control, so the proxy is not enabled here
-        playwright_proxy = None
-        # playwright_proxy = {
-        #     "server": f"{config.ip_proxy_protocol}{ip_proxy}",
-        #     "username": config.ip_proxy_user,
-        #     "password": config.ip_proxy_password,
-        # }
-
-        httpx_proxy = f"{config.IP_PROXY_PROTOCOL}{config.IP_PROXY_USER}:{config.IP_PROXY_PASSWORD}@{ip_proxy}"
-        if not config.ENABLE_IP_PROXY:
-            playwright_proxy = None
-            httpx_proxy = None
-
+        account_phone, playwright_proxy, httpx_proxy = self.create_proxy_info()
         async with async_playwright() as playwright:
+            # Launch a browser context.
             chromium = playwright.chromium
-            browser = await chromium.launch(headless=config.HEADLESS, proxy=playwright_proxy)
-            self.browser_context = await browser.new_context(
-                viewport={"width": 1800, "height": 900},
-                user_agent=self.user_agent,
+            self.browser_context = await self.launch_browser(
+                chromium,
+                playwright_proxy,
+                self.user_agent,
+                headless=config.HEADLESS
             )
-            # execute JS to bypass anti automation/crawler detection
+            # stealth.min.js is a js script to prevent the website from detecting the crawler.
             await self.browser_context.add_init_script(path="libs/stealth.min.js")
             self.context_page = await self.browser_context.new_page()
-            await self.context_page.goto("https://www.douyin.com", wait_until="domcontentloaded")
-            await asyncio.sleep(3)
+            await self.context_page.goto(self.index_url)
 
-            # begin login
-            login_obj = DouYinLogin(
-                login_type=self.command_args.lt,
-                login_phone=account_phone,
-                browser_context=self.browser_context,
-                context_page=self.context_page,
-                cookie_str=config.COOKIES
-            )
-            await login_obj.begin()
-
-            # update cookies
-            await self.update_cookies()
-
-            # init request client
-            cookie_str, cookie_dict = utils.convert_cookies(self.cookies)
-            self.dy_client = DOUYINClient(
-                proxies=httpx_proxy,
-                headers={
-                    "User-Agent": self.user_agent,
-                    "Cookie": cookie_str,
-                    "Host": "www.douyin.com",
-                    "Origin": "https://www.douyin.com/",
-                    "Referer": "https://www.douyin.com/",
-                    "Content-Type": "application/json;charset=UTF-8"
-                },
-                playwright_page=self.context_page,
-                cookie_dict=cookie_dict,
-            )
+            self.dy_client = await self.create_douyin_client(httpx_proxy)
+            if not await self.dy_client.ping(browser_context=self.browser_context):
+                login_obj = DouYinLogin(
+                    login_type=self.command_args.lt,
+                    login_phone=account_phone,
+                    browser_context=self.browser_context,
+                    context_page=self.context_page,
+                    cookie_str=config.COOKIES
+                )
+                await login_obj.begin()
+                await self.dy_client.update_cookies(browser_context=self.browser_context)
 
             # search_posts
             await self.search_posts()
 
-            # block main crawler coroutine
-            await asyncio.Event().wait()
-
-    async def update_cookies(self):
-        self.cookies = await self.browser_context.cookies()
+            utils.logger.info("Douyin Crawler finished ...")
 
     async def search_posts(self):
-        logging.info("Begin search douyin keywords")
+        utils.logger.info("Begin search douyin keywords")
         for keyword in config.KEYWORDS.split(","):
-            logging.info(f"Current keyword: {keyword}")
+            utils.logger.info(f"Current keyword: {keyword}")
             aweme_list: List[str] = []
-            max_note_len = 20
+            max_note_len = config.MAX_PAGE_NUM
             page = 0
             while max_note_len > 0:
                 try:
@@ -139,8 +90,8 @@
                     continue
                 aweme_list.append(aweme_info.get("aweme_id"))
                 await douyin.update_douyin_aweme(aweme_item=aweme_info)
-            print(f"keyword:{keyword}, aweme_list:{aweme_list}")
-            await self.batch_get_note_comments(aweme_list)
utils.logger.info(f"keyword:{keyword}, aweme_list:{aweme_list}") + # await self.batch_get_note_comments(aweme_list) async def batch_get_note_comments(self, aweme_list: List[str]): task_list: List[Task] = [] @@ -155,6 +106,71 @@ class DouYinCrawler(AbstractCrawler): aweme_id=aweme_id, callback=douyin.batch_update_dy_aweme_comments ) - print(f"aweme_id: {aweme_id} comments have all been obtained completed ...") + utils.logger.info(f"aweme_id: {aweme_id} comments have all been obtained completed ...") except DataFetchError as e: logging.error(f"aweme_id: {aweme_id} get comments failed, error: {e}") + + def create_proxy_info(self) -> Tuple[Optional[str], Optional[Dict], Optional[str]]: + """Create proxy info for playwright and httpx""" + if not config.ENABLE_IP_PROXY: + return None, None, None + + # phone: 13012345671 ip_proxy: 111.122.xx.xx1:8888 + phone, ip_proxy = self.account_pool.get_account() + playwright_proxy = { + "server": f"{config.IP_PROXY_PROTOCOL}{ip_proxy}", + "username": config.IP_PROXY_USER, + "password": config.IP_PROXY_PASSWORD, + } + httpx_proxy = f"{config.IP_PROXY_PROTOCOL}{config.IP_PROXY_USER}:{config.IP_PROXY_PASSWORD}@{ip_proxy}" + return phone, playwright_proxy, httpx_proxy + + async def create_douyin_client(self, httpx_proxy: str) -> DOUYINClient: + """Create douyin client""" + cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies()) + douyin_client = DOUYINClient( + proxies=httpx_proxy, + headers={ + "User-Agent": self.user_agent, + "Cookie": cookie_str, + "Host": "www.douyin.com", + "Origin": "https://www.douyin.com/", + "Referer": "https://www.douyin.com/", + "Content-Type": "application/json;charset=UTF-8" + }, + playwright_page=self.context_page, + cookie_dict=cookie_dict, + ) + return douyin_client + + async def launch_browser( + self, + chromium: BrowserType, + playwright_proxy: Optional[Dict], + user_agent: Optional[str], + headless: bool = True + ) -> BrowserContext: + """Launch browser and create browser context""" + if config.SAVE_LOGIN_STATE: + user_data_dir = os.path.join(os.getcwd(), "browser_data", config.USER_DATA_DIR % self.command_args.platform) + browser_context = await chromium.launch_persistent_context( + user_data_dir=user_data_dir, + accept_downloads=True, + headless=headless, + proxy=playwright_proxy, + viewport={"width": 1920, "height": 1080}, + user_agent=user_agent + ) + return browser_context + else: + browser = await chromium.launch(headless=headless, proxy=playwright_proxy) + browser_context = await browser.new_context( + viewport={"width": 1920, "height": 1080}, + user_agent=user_agent + ) + return browser_context + + async def close(self): + """Close browser context""" + await self.browser_context.close() + utils.logger.info("Browser context closed ...") diff --git a/media_platform/douyin/login.py b/media_platform/douyin/login.py index 418fbe6..7ee65f8 100644 --- a/media_platform/douyin/login.py +++ b/media_platform/douyin/login.py @@ -8,13 +8,14 @@ from tenacity import ( retry, stop_after_attempt, wait_fixed, - retry_if_result + retry_if_result, + RetryError ) from playwright.async_api import Page, TimeoutError as PlaywrightTimeoutError from playwright.async_api import BrowserContext import config -from tools import utils, easing +from tools import utils from base.base_crawler import AbstractLogin @@ -54,21 +55,22 @@ class DouYinLogin(AbstractLogin): raise ValueError("Invalid Login Type Currently only supported qrcode or phone ...") # 如果页面重定向到滑动验证码页面,需要再次滑动滑块 - await asyncio.sleep(3) + await 
         current_page_title = await self.context_page.title()
         if "验证码中间页" in current_page_title:
             await self.check_page_display_slider(move_step=3, slider_level="hard")
 
         # check login state
-        logging.info(f"login finished then check login state ...")
-        login_flag: bool = await self.check_login_state()
-        if not login_flag:
-            logging.info("login failed please confirm ...")
+        utils.logger.info(f"login finished then check login state ...")
+        try:
+            await self.check_login_state()
+        except RetryError:
+            utils.logger.info("login failed please confirm ...")
             sys.exit()
 
         # wait for redirect
         wait_redirect_seconds = 5
-        logging.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
+        utils.logger.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
         await asyncio.sleep(wait_redirect_seconds)
 
     @retry(stop=stop_after_attempt(20), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
@@ -88,31 +90,31 @@
             await self.context_page.wait_for_selector(dialog_selector, timeout=1000 * 10)
         except Exception as e:
             logging.error(f"login dialog box does not pop up automatically, error: {e}")
-            logging.info("login dialog box does not pop up automatically, we will manually click the login button")
+            utils.logger.info("login dialog box does not pop up automatically, we will manually click the login button")
             login_button_ele = self.context_page.locator("xpath=//p[text() = '登录']")
             await login_button_ele.click()
             await asyncio.sleep(0.5)
 
     async def login_by_qrcode(self):
-        logging.info("Begin login douyin by qrcode...")
+        utils.logger.info("Begin login douyin by qrcode...")
         qrcode_img_selector = "xpath=//article[@class='web-login']//img"
         base64_qrcode_img = await utils.find_login_qrcode(
             self.context_page,
             selector=qrcode_img_selector
         )
         if not base64_qrcode_img:
-            logging.info("login qrcode not found please confirm ...")
+            utils.logger.info("login qrcode not found please confirm ...")
             sys.exit()
 
         # show login qrcode
-        # utils.show_qrcode(base64_qrcode_img) 
+        # utils.show_qrcode(base64_qrcode_img)
         partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
         asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)
         utils.show_qrcode(base64_qrcode_img)
         await asyncio.sleep(2)
 
     async def login_by_mobile(self):
-        logging.info("Begin login douyin by mobile ...")
+        utils.logger.info("Begin login douyin by mobile ...")
         mobile_tap_ele = self.context_page.locator("xpath=//li[text() = '验证码登录']")
         await mobile_tap_ele.click()
         await self.context_page.wait_for_selector("xpath=//article[@class='web-login-mobile-code']")
@@ -128,7 +130,7 @@
         redis_obj = aioredis.from_url(url=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD, decode_responses=True)
         max_get_sms_code_time = 60 * 2  # wait for the sms code for two minutes at most
         while max_get_sms_code_time > 0:
-            logging.info(f"get douyin sms code from redis remaining time {max_get_sms_code_time}s ...")
+            utils.logger.info(f"get douyin sms code from redis remaining time {max_get_sms_code_time}s ...")
             await asyncio.sleep(1)
             sms_code_key = f"dy_{self.login_phone}"
             sms_code_value = await redis_obj.get(sms_code_key)
@@ -170,20 +172,20 @@
                 # If the slide was too slow or verification failed, the page shows an "operation too slow" hint; click the refresh button
                 page_content = await self.context_page.content()
                 if "操作过慢" in page_content or "提示重新操作" in page_content:
-                    logging.info("slider verify failed, retry ...")
+                    utils.logger.info("slider verify failed, retry ...")
                     await self.context_page.click(selector="//a[contains(@class, 'secsdk_captcha_refresh')]")
                     continue
 
                 # After a successful slide, wait for the slider to disappear
                 await self.context_page.wait_for_selector(selector=back_selector, state="hidden", timeout=1000)
                 # If the slider is gone, verification succeeded and we break out of the loop; if not, the line above raises, the exception is caught, and the loop retries the slider captcha
-                logging.info("slider verify success ...")
+                utils.logger.info("slider verify success ...")
                 slider_verify_success = True
             except Exception as e:
                 logging.error(f"slider verify failed, error: {e}")
                 await asyncio.sleep(1)
                 max_slider_try_times -= 1
-                logging.info(f"remaining slider try times: {max_slider_try_times}")
+                utils.logger.info(f"remaining slider try times: {max_slider_try_times}")
                 continue
 
     async def move_slider(self, back_selector: str, gap_selector: str, move_step: int = 10, slider_level="easy"):
@@ -240,7 +242,7 @@
         await self.context_page.mouse.up()
 
     async def login_by_cookies(self):
-        logging.info("Begin login douyin by cookie ...")
+        utils.logger.info("Begin login douyin by cookie ...")
         for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
             await self.browser_context.add_cookies([{
                 'name': key,
diff --git a/media_platform/xhs/client.py b/media_platform/xhs/client.py
index b110825..c3f6baa 100644
--- a/media_platform/xhs/client.py
+++ b/media_platform/xhs/client.py
@@ -82,7 +82,7 @@ class XHSClient:
 
     async def ping(self) -> bool:
         """get a note to check if login state is ok"""
-        logging.info("begin to ping xhs...")
+        utils.logger.info("begin to ping xhs...")
         note_id = "5e5cb38a000000000100185e"
         try:
             note_card: Dict = await self.get_note_by_id(note_id)
diff --git a/media_platform/xhs/core.py b/media_platform/xhs/core.py
index 20a92e3..f354aa1 100644
--- a/media_platform/xhs/core.py
+++ b/media_platform/xhs/core.py
@@ -1,3 +1,4 @@
+import os
 import random
 import asyncio
 import logging
@@ -8,6 +9,7 @@ from argparse import Namespace
 from playwright.async_api import Page
 from playwright.async_api import BrowserContext
 from playwright.async_api import async_playwright
+from playwright.async_api import BrowserType
 
 import config
 from tools import utils
@@ -31,8 +33,8 @@ class XiaoHongShuCrawler(AbstractCrawler):
         self.account_pool: Optional[AccountPool] = None
 
     def init_config(self, **kwargs):
-        for key in kwargs.keys():
-            setattr(self, key, kwargs[key])
+        for key, value in kwargs.items():
+            setattr(self, key, value)
 
     async def start(self):
         account_phone, playwright_proxy, httpx_proxy = self.create_proxy_info()
@@ -66,13 +68,13 @@
 
             # Search for notes and retrieve their comment information.
             await self.search_posts()
-            logging.info("Xhs Crawler finished ...")
+            utils.logger.info("Xhs Crawler finished ...")
 
     async def search_posts(self):
         """Search for notes and retrieve their comment information."""
-        logging.info("Begin search xiaohongshu keywords")
+        utils.logger.info("Begin search xiaohongshu keywords")
         for keyword in config.KEYWORDS.split(","):
-            logging.info(f"Current keyword: {keyword}")
+            utils.logger.info(f"Current keyword: {keyword}")
             note_list: List[str] = []
             max_note_len = config.MAX_PAGE_NUM
             page = 1
@@ -88,13 +90,13 @@
                 try:
                     note_detail = await self.xhs_client.get_note_by_id(note_id)
                 except DataFetchError as ex:
-                    logging.error(f"Get note detail error: {ex}")
+                    utils.logger.error(f"Get note detail error: {ex}")
                     continue
                 await xhs_model.update_xhs_note(note_detail)
                 await asyncio.sleep(0.05)
                 note_list.append(note_id)
-            logging.info(f"keyword:{keyword}, note_list:{note_list}")
-            # await self.batch_get_note_comments(note_list)
+            utils.logger.info(f"keyword:{keyword}, note_list:{note_list}")
+            await self.batch_get_note_comments(note_list)
 
     async def batch_get_note_comments(self, note_list: List[str]):
         """Batch get note comments"""
@@ -106,7 +108,7 @@
 
     async def get_comments(self, note_id: str):
         """Get note comments"""
-        logging.info(f"Begin get note id comments {note_id}")
+        utils.logger.info(f"Begin get note id comments {note_id}")
         all_comments = await self.xhs_client.get_note_all_comments(note_id=note_id, crawl_interval=random.random())
         for comment in all_comments:
             await xhs_model.update_xhs_note_comment(note_id=note_id, comment_item=comment)
@@ -143,12 +145,21 @@
             )
             return xhs_client_obj
 
-    async def launch_browser(self, chromium, playwright_proxy, user_agent, headless=True) -> BrowserContext:
+    async def launch_browser(
+            self,
+            chromium: BrowserType,
+            playwright_proxy: Optional[Dict],
+            user_agent: Optional[str],
+            headless: bool = True
+    ) -> BrowserContext:
         """Launch browser and create browser context"""
+        utils.logger.info("Begin create browser context ...")
         if config.SAVE_LOGIN_STATE:
             # feat issue #14
+            # we will save login state to avoid login every time
+            user_data_dir = os.path.join(os.getcwd(), "browser_data", config.USER_DATA_DIR % self.command_args.platform)
             browser_context = await chromium.launch_persistent_context(
-                user_data_dir=config.USER_DATA_DIR % self.command_args.platform,
+                user_data_dir=user_data_dir,
                 accept_downloads=True,
                 headless=headless,
                 proxy=playwright_proxy,
@@ -167,4 +178,4 @@
     async def close(self):
         """Close browser context"""
         await self.browser_context.close()
-        logging.info("Browser context closed ...")
+        utils.logger.info("Browser context closed ...")
diff --git a/media_platform/xhs/login.py b/media_platform/xhs/login.py
index 523bd1c..404a09d 100644
--- a/media_platform/xhs/login.py
+++ b/media_platform/xhs/login.py
@@ -50,7 +50,7 @@ class XHSLogin(AbstractLogin):
 
     async def begin(self):
         """Start login xiaohongshu"""
-        logging.info("Begin login xiaohongshu ...")
+        utils.logger.info("Begin login xiaohongshu ...")
         if self.login_type == "qrcode":
             await self.login_by_qrcode()
         elif self.login_type == "phone":
@@ -62,7 +62,7 @@
 
     async def login_by_mobile(self):
         """Login xiaohongshu by mobile"""
-        logging.info("Begin login xiaohongshu by mobile ...")
+        utils.logger.info("Begin login xiaohongshu by mobile ...")
         await asyncio.sleep(1)
         try:
             # After entering the xiaohongshu homepage, the login dialog may not pop up automatically and the login button has to be clicked manually
@@ -79,21 +79,24 @@
             )
             await element.click()
         except Exception as e:
-            logging.info("have not found mobile button icon and keep going ...")
+            utils.logger.info("have not found mobile button icon and keep going ...")
+            await asyncio.sleep(1)
 
         login_container_ele = await self.context_page.wait_for_selector("div.login-container")
         input_ele = await login_container_ele.query_selector("label.phone > input")
         await input_ele.fill(self.login_phone)
         await asyncio.sleep(0.5)
+
         send_btn_ele = await login_container_ele.query_selector("label.auth-code > span")
         await send_btn_ele.click()  # click to send the sms code
         sms_code_input_ele = await login_container_ele.query_selector("label.auth-code > input")
         submit_btn_ele = await login_container_ele.query_selector("div.input-container > button")
+
         redis_obj = aioredis.from_url(url=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD, decode_responses=True)
         max_get_sms_code_time = 60 * 2  # wait for the sms code for two minutes at most
         no_logged_in_session = ""
         while max_get_sms_code_time > 0:
-            logging.info(f"get sms code from redis remaining time {max_get_sms_code_time}s ...")
+            utils.logger.info(f"get sms code from redis remaining time {max_get_sms_code_time}s ...")
             await asyncio.sleep(1)
             sms_code_key = f"xhs_{self.login_phone}"
             sms_code_value = await redis_obj.get(sms_code_key)
@@ -119,17 +122,16 @@
         try:
             await self.check_login_state(no_logged_in_session)
         except RetryError:
-            logging.info("Login xiaohongshu failed by mobile login method ...")
+            utils.logger.info("Login xiaohongshu failed by mobile login method ...")
             sys.exit()
 
         wait_redirect_seconds = 5
-        logging.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
+        utils.logger.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
         await asyncio.sleep(wait_redirect_seconds)
 
     async def login_by_qrcode(self):
         """login xiaohongshu website and keep webdriver login state"""
-        logging.info("Begin login xiaohongshu by qrcode ...")
-        await asyncio.sleep(10)
+        utils.logger.info("Begin login xiaohongshu by qrcode ...")
         # login_selector = "div.login-container > div.left > div.qrcode > img"
         qrcode_img_selector = "xpath=//img[@class='qrcode-img']"
         # find login qrcode
@@ -138,7 +140,7 @@
             selector=qrcode_img_selector
         )
         if not base64_qrcode_img:
-            logging.info("login failed , have not found qrcode please check ....")
+            utils.logger.info("login failed , have not found qrcode please check ....")
             # if this website does not automatically popup login dialog box, we will manual click login button
             await asyncio.sleep(0.5)
             login_button_ele = self.context_page.locator("xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button")
@@ -162,20 +164,20 @@
         partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
         asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)
 
-        logging.info(f"waiting for scan code login, remaining time is 20s")
+        utils.logger.info(f"waiting for scan code login, remaining time is 20s")
         try:
             await self.check_login_state(no_logged_in_session)
         except RetryError:
-            logging.info("Login xiaohongshu failed by qrcode login method ...")
+            utils.logger.info("Login xiaohongshu failed by qrcode login method ...")
             sys.exit()
 
         wait_redirect_seconds = 5
-        logging.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
+        utils.logger.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
         await asyncio.sleep(wait_redirect_seconds)
 
     async def login_by_cookies(self):
         """login xiaohongshu website by cookies"""
-        logging.info("Begin login xiaohongshu by cookie ...")
+        utils.logger.info("Begin login xiaohongshu by cookie ...")
         for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
             await self.browser_context.add_cookies([{
                 'name': key,
diff --git a/tools/utils.py b/tools/utils.py
index 4f709f8..f71af4b 100644
--- a/tools/utils.py
+++ b/tools/utils.py
@@ -108,7 +108,12 @@ def init_loging_config():
         format="%(asctime)s %(name)s %(levelname)s %(message)s ",
         datefmt='%Y-%m-%d %H:%M:%S'
     )
-    logging.Logger("Media Crawler")
+    _logger = logging.getLogger("MediaCrawler")
+    _logger.setLevel(level)
+    return _logger
+
+
+logger = init_loging_config()
 
 
 class Slide: