refactor:优化部分代码

feat: 增加IP代理账号池
2023-06-27 23:38:30 +08:00 · 2023-06-27 23:38:30 +08:00 · b8093a2c0f
parent 963d9a16d3
commit b8093a2c0f
19 changed files with 615 additions and 254 deletions
--- a/README.md
+++ b/README.md
@ -13,6 +13,7 @@
 - [x] 小红书 笔记、评论
 - [x] 小红书 二维码扫描登录 | 手机号+验证码自动登录 | cookies登录
 - [x] 爬取抖音视频、评论
 - [x] IP代理池，账号池
 - [ ] To do 抖音滑块
 ## 技术栈
@ -28,7 +29,7 @@
 2. 安装playwright浏览器驱动
   `playwright install`
 3. 运行爬虫程序
-   `python main.py --platform xhs --keywords 健身 --lt qrcode`
+   `python main.py --platform xhs --lt qrcode`
 4. 打开小红书扫二维码登录
 ## 小红书运行截图
@ -46,8 +47,8 @@
 - 转发软件中配置WEBHOOK相关的信息，主要分为 消息模板（请查看本项目中的recv_sms_notification.py）、一个能push短信通知的API地址
 - push的API地址一般是需要绑定一个域名的（当然也可以是内网的IP地址），我用的是内网穿透方式，会有一个免费的域名绑定到内网的web server，内网穿透工具 [ngrok](https://ngrok.com/docs/)
 - 安装redis并设置一个密码 [redis安装](https://www.cnblogs.com/hunanzp/p/12304622.html)
- 执行 `python recv_sms_notification.py` 等待短信转发器发送HTTP通知
+- 执行 `python tools/recv_sms_notification.py` 等待短信转发器发送HTTP通知
- 执行手机号登录的爬虫程序 `python main.py --platform xhs --keywords 健身 --lt phone --phone 13812345678`
+- 执行手机号登录的爬虫程序 `python main.py --platform xhs --lt phone`
 备注：
 - 小红书这边一个手机号一天只能发10条短信（悠着点），目前在发验证码时还未触发滑块验证，估计多了之后也会有~
--- a/base/init.py
+++ b/base/init.py
--- a/base/base_crawler.py
+++ b/base/base_crawler.py
@ -0,0 +1,41 @@
 from abc import ABC, abstractmethod
 class AbstractCrawler(ABC):
    """Contract that each platform crawler fulfils.

    All four methods are abstract, so a subclass must override every one of
    them before it can be instantiated.
    """

    @abstractmethod
    def init_config(self, **kwargs):
        """Accept configuration values passed as keyword arguments."""
        return None

    @abstractmethod
    async def start(self):
        """Coroutine entry point of the crawler."""
        return None

    @abstractmethod
    async def search_posts(self):
        """Coroutine that searches posts on the platform."""
        return None

    @abstractmethod
    async def get_comments(self, item_id: int):
        """Coroutine that fetches the comments of the item ``item_id``."""
        return None
 class AbstractLogin(ABC):
    """Contract that each platform login helper fulfils.

    All five coroutines are abstract, so a subclass must override every one
    of them before it can be instantiated.
    """

    @abstractmethod
    async def begin(self):
        """Coroutine that starts the login procedure."""
        return None

    @abstractmethod
    async def check_login_state(self):
        """Coroutine that reports the current login state."""
        return None

    @abstractmethod
    async def login_by_qrcode(self):
        """Coroutine that logs in through a QR code."""
        return None

    @abstractmethod
    async def login_by_mobile(self):
        """Coroutine that logs in through a mobile phone number."""
        return None

    @abstractmethod
    async def login_by_cookies(self):
        """Coroutine that logs in through cookies."""
        return None
--- a/base/proxy_account_pool.py
+++ b/base/proxy_account_pool.py
@ -0,0 +1,130 @@
 import config
 class PhonePool:
    """Pool of login phone numbers that tracks which ones were handed out.

    ``phones`` lists the numbers still available, in insertion order.
    ``used_phones`` is the set of numbers already returned by ``get_phone``.
    """

    def __init__(self):
        self.phones = []
        self.used_phones = set()

    def add_phone(self, phone):
        """Append ``phone`` to the available numbers.

        Returns:
            True when the number was added, False when it is already waiting
            in the available list.
        """
        if phone in self.phones:
            return False
        self.phones.append(phone)
        return True

    def remove_phone(self, phone):
        """Remove ``phone`` from the pool, whether it is available or in use.

        ``get_phone`` pops a number off ``phones`` before marking it as used,
        so a used number is normally no longer in the available list. Each
        container is therefore checked on its own instead of assuming the
        number is present in both.

        Returns:
            True when the number was removed from at least one container,
            False when the pool did not know it.
        """
        removed = False
        if phone in self.phones:
            self.phones.remove(phone)
            removed = True
        if phone in self.used_phones:
            self.used_phones.discard(phone)
            removed = True
        return removed

    def get_phone(self):
        """Hand out the oldest available number and mark it as used.

        Returns:
            The phone number, or None when no number is available.
        """
        if not self.phones:
            return None
        phone = self.phones.pop(0)
        self.used_phones.add(phone)
        return phone

    def clear(self):
        """Forget every available and every used number."""
        self.phones = []
        self.used_phones = set()
 class IPPool:
    """Pool of proxy addresses that tracks which ones were handed out.

    ``ips`` lists the addresses still available, in insertion order.
    ``used_ips`` is the set of addresses already returned by ``get_ip``.
    """

    def __init__(self):
        self.ips = []
        self.used_ips = set()

    def add_ip(self, ip):
        """Append ``ip`` to the available addresses.

        Returns:
            True when the address was added, False when it is already waiting
            in the available list.
        """
        if ip in self.ips:
            return False
        self.ips.append(ip)
        return True

    def remove_ip(self, ip):
        """Remove ``ip`` from the pool, whether it is available or in use.

        ``get_ip`` pops an address off ``ips`` before marking it as used, so
        a used address is normally no longer in the available list. Each
        container is therefore checked on its own instead of assuming the
        address is present in both.

        Returns:
            True when the address was removed from at least one container,
            False when the pool did not know it.
        """
        removed = False
        if ip in self.ips:
            self.ips.remove(ip)
            removed = True
        if ip in self.used_ips:
            self.used_ips.discard(ip)
            removed = True
        return removed

    def get_ip(self):
        """Hand out the oldest available address and mark it as used.

        Returns:
            The proxy address, or None when no address is available.
        """
        if not self.ips:
            return None
        ip = self.ips.pop(0)
        self.used_ips.add(ip)
        return ip

    def clear(self):
        """Forget every available and every used address."""
        self.ips = []
        self.used_ips = set()
 class AccountPool:
    """Pool of accounts, each account being a phone number plus a proxy address.

    Phones and proxies live in two separate pools that are filled and drained
    in lockstep, so the n-th phone always travels with the n-th proxy.
    """

    def __init__(self):
        self.phone_pool = PhonePool()
        self.ip_pool = IPPool()

    def add_account(self, phone, ip):
        """Add ``phone`` and ``ip`` to the pool as one account.

        The pair is added as a whole or not at all. Adding the phone while
        rejecting a duplicate proxy would shift every later phone onto the
        wrong proxy.

        Returns:
            True when both values were added, False when either of them is
            already waiting in its pool (nothing is added in that case).
        """
        if phone in self.phone_pool.phones or ip in self.ip_pool.ips:
            return False
        self.phone_pool.add_phone(phone)
        self.ip_pool.add_ip(ip)
        return True

    def remove_account(self, phone, ip):
        """Remove an account from the pool.

        The proxy is only removed after the phone was removed.

        Returns:
            True when both the phone and the proxy were removed, else False.
        """
        if not self.phone_pool.remove_phone(phone):
            return False
        return bool(self.ip_pool.remove_ip(ip))

    def get_account(self):
        """Hand out the next ``(phone, ip)`` pair.

        When either pool has run dry the pool is reloaded from the
        configuration once and the lookup is repeated. The reload happens at
        most once per call: an empty configuration would otherwise recurse
        until the interpreter raises RecursionError.

        Returns:
            A ``(phone, ip)`` tuple, or ``(None, None)`` when the
            configuration provides no usable account.
        """
        phone = self.phone_pool.get_phone()
        ip = self.ip_pool.get_ip()
        if phone and ip:
            return phone, ip

        reload_account_pool(self)
        phone = self.phone_pool.get_phone()
        ip = self.ip_pool.get_ip()
        if phone and ip:
            return phone, ip
        return None, None

    def clear_account(self):
        """Empty both the phone pool and the proxy pool."""
        self.phone_pool.clear()
        self.ip_pool.clear()
 def reload_account_pool(apo: AccountPool):
    """Empty ``apo`` and refill it from the configured phone and proxy lists."""
    apo.clear_account()
    # zip() pairs the two lists by position and stops at the shorter one.
    configured_accounts = zip(config.PHONE_LIST, config.IP_PROXY_LIST)
    for account in configured_accounts:
        apo.add_account(*account)
 def create_account_pool() -> AccountPool:
    """Build a new account pool already filled from the configuration."""
    pool = AccountPool()
    reload_account_pool(pool)
    return pool
 if __name__ == '__main__':
    # Manual smoke test: print the accounts handed out by the pool, one per second.
    import time
    ac_pool = create_account_pool()
    p, i = ac_pool.get_account()
    # get_account() reloads the pool from the configuration when it runs dry,
    # so with a non-empty configuration this loop is expected to keep cycling
    # through the accounts until the process is interrupted.
    while p:
        print(f"get phone:{p}, ip proxy:{i} from account pool")
        p, i = ac_pool.get_account()
        time.sleep(1)
--- a/base_crawler.py
+++ b/base_crawler.py
@ -1,23 +0,0 @@
 from abc import ABC, abstractmethod
 class Crawler(ABC):
    """Abstract base class for crawlers; every method below must be overridden."""
    @abstractmethod
    def init_config(self, **kwargs):
        """Accept configuration values passed as keyword arguments."""
        pass
    @abstractmethod
    async def start(self):
        """Coroutine entry point of the crawler."""
        pass
    @abstractmethod
    async def login(self):
        """Coroutine that performs the login."""
        pass
    @abstractmethod
    async def search_posts(self):
        """Coroutine that searches posts."""
        pass
    @abstractmethod
    async def get_comments(self, item_id: int):
        """Coroutine that fetches the comments of the item ``item_id``."""
        pass
--- a/config.py
+++ b/config.py
@ -1,14 +0,0 @@
 # config file
 # Each value below is the default of the matching command line option in main.py.
 # media platform to crawl (--platform, xhs | dy)
 platform = "xhs"
 # search keyword (--keywords)
 keyword = "健身"
 login_type = "cookie"  # qrcode or phone or cookie
 login_phone = ""  # your login phone
 # If it's on the Xiaohongshu platform, only the web_session cookie will be kept.
 # web_session=040069b2acxxxxxxxxxxxxxxxxxxxx;
 cookies = ""
 # redis config
 redis_db_host = "redis://127.0.0.1"
 redis_db_pwd = "123456"  # your redis password
--- a/config/init.py
+++ b/config/init.py
@ -0,0 +1,2 @@
 from .base_config import *
 from .account_config import *
--- a/config/account_config.py
+++ b/config/account_config.py
@ -0,0 +1,27 @@
 # -*- coding: utf-8 -*-
 # account_config.py
 # Login phone numbers. The account pool pairs PHONE_LIST and IP_PROXY_LIST by
 # position (zip), so the n-th phone is bound to the n-th proxy and surplus
 # entries of the longer list are ignored.
 PHONE_LIST = [
    "13012345671",
    "13012345672",
    "13012345673",
    "13012345674",
    "13012345675",
    "13012345676",
    # ...
 ]
 # Proxy addresses in "host:port" form, without scheme and without credentials.
 IP_PROXY_LIST = [
    "111.122.xx.xx1:8888",
    "111.122.xx.xx2:8888",
    "111.122.xx.xx3:8888",
    "111.122.xx.xx4:8888",
    "111.122.xx.xx5:8888",
    "111.122.xx.xx6:8888",
    # ...
 ]
 # Scheme prefix that the crawlers put in front of every proxy address.
 IP_PROXY_PROTOCOL = "http://"
 # Credentials the crawlers send with every proxy in IP_PROXY_LIST.
 IP_PROXY_USER = "xxxx"
 IP_PROXY_PASSWORD = "xxxx"
--- a/config/base_config.py
+++ b/config/base_config.py
@ -0,0 +1,19 @@
 # default of the --platform command line option (xhs | dy)
 PLATFORM = "xhs"
 # comma separated search keywords; the crawlers split this string on ","
 KEYWORDS = "健身,旅游"
 LOGIN_TYPE = "qrcode"  # qrcode or phone or cookies
 # If it's on the Xiaohongshu platform, only the web_session cookie will be kept.
 # xhs cookie format -> web_session=040069b2acxxxxxxxxxxxxxxxxxxxx;
 COOKIES = ""
 # redis config
 REDIS_DB_HOST = "redis://127.0.0.1"  # your redis host
 REDIS_DB_PWD = "123456"  # your redis password
 # enable ip proxy
 # when False the crawlers pass no proxy to the browser or to the HTTP client
 ENABLE_IP_PROXY = False
 # retry_interval
 # seconds to sleep before restarting the crawler after an exception
 RETRY_INTERVAL = 60 * 30  # 30 minutes
 # playwright headless
 HEADLESS = True
--- a/main.py
+++ b/main.py
@ -3,6 +3,8 @@ import asyncio
 import argparse
 import config
 from tools import utils
 from base import proxy_account_pool
 from media_platform.douyin import DouYinCrawler
 from media_platform.xhs import XiaoHongShuCrawler
@ -19,24 +21,37 @@ class CrawlerFactory:
 async def main():
    utils.init_loging_config()
    # define command line params ...
    parser = argparse.ArgumentParser(description='Media crawler program.')
-    parser.add_argument('--platform', type=str, help='Media platform select (xhs|dy)...', default=config.platform)
+    parser.add_argument('--platform', type=str, help='Media platform select (xhs|dy)...', default=config.PLATFORM)
-    parser.add_argument('--keywords', type=str, help='Search note/page keywords...', default=config.keyword)
+    parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)', default=config.LOGIN_TYPE)
-    parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)', default=config.login_type)
+
-    parser.add_argument('--phone', type=str, help='Login phone', default=config.login_phone)
+    # init account pool
-    parser.add_argument('--cookies', type=str, help='cookies to keep log in', default=config.cookies)
+    account_pool = proxy_account_pool.create_account_pool()
    args = parser.parse_args()
    crawler = CrawlerFactory().create_crawler(platform=args.platform)
    crawler.init_config(
-        keywords=args.keywords,
+        command_args=args,
-        login_phone=args.phone,
+        account_pool=account_pool
        login_type=args.lt,
        cookie_str=args.cookies
    )
    await crawler.start()
    """
    # retry when exception ...
    while True:
        try:
            await crawler.start()
        except Exception as e:
            logging.info(f"crawler start error: {e} ...")
            await crawler.close()
            # If you encounter an exception
            # sleep for a period of time before retrying
            # to avoid frequent requests that may result in the account being blocked.
            await asyncio.sleep(config.RETRY_INTERVAL)
    """
 if __name__ == '__main__':
    try:
--- a/media_platform/douyin/core.py
+++ b/media_platform/douyin/core.py
@ -1,42 +1,64 @@
 import logging
 import asyncio
 from asyncio import Task
-from typing import Optional, List, Dict
+from argparse import Namespace
 from typing import Optional, List, Dict, Tuple
 from playwright.async_api import async_playwright
 from playwright.async_api import Page
 from playwright.async_api import Cookie
 from playwright.async_api import BrowserContext
-import utils
+import config
 from tools import utils
 from .client import DOUYINClient
 from .exception import DataFetchError
-from base_crawler import Crawler
+from .login import DouYinLogin
 from base.base_crawler import AbstractCrawler
 from base.proxy_account_pool import AccountPool
 from models import douyin
-class DouYinCrawler(Crawler):
+class DouYinCrawler(AbstractCrawler):
    def __init__(self):
        self.keywords: Optional[str] = None
        self.cookies: Optional[List[Cookie]] = None
        self.browser_context: Optional[BrowserContext] = None
        self.context_page: Optional[Page] = None
        self.proxy: Optional[Dict] = None
        self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"  # fixed
        self.dy_client: Optional[DOUYINClient] = None
        self.command_args: Optional[Namespace] = None
        self.account_pool: Optional[AccountPool] = None
    def init_config(self, **kwargs):
        """Copy every keyword argument onto this crawler as an attribute."""
        for attr_name in kwargs:
            setattr(self, attr_name, kwargs[attr_name])
    def create_proxy_info(self) -> Tuple[str, Dict, str]:
        """Take one account from the pool and build its proxy settings.

        The phone number and the IP proxy both come from the account pool and
        are bound to each other as a fixed pair, for example phone
        ``13012345671`` with proxy ``111.122.xx.xx1:8888``.

        Returns:
            A ``(phone, playwright_proxy, httpx_proxy)`` tuple:
            ``playwright_proxy`` is a dict with the keys ``server``,
            ``username`` and ``password``; ``httpx_proxy`` is a single proxy
            URL with the credentials embedded.
        """
        # Imported here so the block stays self-contained.
        from urllib.parse import quote

        phone, ip_proxy = self.account_pool.get_account()
        playwright_proxy = {
            "server": f"{config.IP_PROXY_PROTOCOL}{ip_proxy}",
            "username": config.IP_PROXY_USER,
            "password": config.IP_PROXY_PASSWORD,
        }
        # Credentials become part of a URL here, so characters such as "@",
        # ":" or "/" must be percent-encoded or the URL is parsed wrongly.
        proxy_user = quote(str(config.IP_PROXY_USER), safe="")
        proxy_password = quote(str(config.IP_PROXY_PASSWORD), safe="")
        httpx_proxy = f"{config.IP_PROXY_PROTOCOL}{proxy_user}:{proxy_password}@{ip_proxy}"
        return phone, playwright_proxy, httpx_proxy
    async def start(self):
        account_phone, playwright_proxy, httpx_proxy = self.create_proxy_info()
        if not config.ENABLE_IP_PROXY:
            playwright_proxy, httpx_proxy = None, None
        async with async_playwright() as playwright:
            chromium = playwright.chromium
-            browser = await chromium.launch(headless=True)
+            browser = await chromium.launch(headless=True, proxy=playwright_proxy)
            self.browser_context = await browser.new_context(
                viewport={"width": 1800, "height": 900},
                user_agent=self.user_agent,
                proxy=self.proxy
            )
            # execute JS to bypass anti automation/crawler detection
            await self.browser_context.add_init_script(path="libs/stealth.min.js")
@ -44,14 +66,23 @@ class DouYinCrawler(Crawler):
            await self.context_page.goto("https://www.douyin.com", wait_until="domcontentloaded")
            await asyncio.sleep(3)
-            # scan qrcode login
+            # begin login
-            # await self.login()
+            login_obj = DouYinLogin(
                login_type=self.command_args.lt,
                login_phone=account_phone,
                browser_context=self.browser_context,
                context_page=self.context_page,
                cookie_str=config.COOKIES
            )
            # await login_obj.begin()
            # update cookies
            await self.update_cookies()
            # init request client
            cookie_str, cookie_dict = utils.convert_cookies(self.cookies)
            self.dy_client = DOUYINClient(
-                proxies=self.proxy,
+                proxies=httpx_proxy,
                headers={
                    "User-Agent": self.user_agent,
                    "Cookie": cookie_str,
@ -73,23 +104,10 @@ class DouYinCrawler(Crawler):
    async def update_cookies(self):
        self.cookies = await self.browser_context.cookies()
    async def login(self):
        """login douyin website and keep webdriver login state"""
        print("Begin login douyin ...")
        # todo ...
    async def check_login_state(self) -> bool:
        """Check if the current login status is successful and return True otherwise return False"""
        current_cookie = await self.browser_context.cookies()
        _, cookie_dict = utils.convert_cookies(current_cookie)
        if cookie_dict.get("LOGIN_STATUS") == "1":
            return True
        return False
    async def search_posts(self):
-        # It is possible to modify the source code to allow for the passing of a batch of keywords.
+        logging.info("Begin search douyin keywords")
-        for keyword in [self.keywords]:
+        for keyword in config.KEYWORDS.split(","):
-            print("Begin search douyin keywords: ", keyword)
+            logging.info(f"Current keyword: {keyword}")
            aweme_list: List[str] = []
            max_note_len = 20
            page = 0
--- a/media_platform/douyin/login.py
+++ b/media_platform/douyin/login.py
@ -0,0 +1,86 @@
 import sys
 import asyncio
 from playwright.async_api import Page
 from playwright.async_api import BrowserContext
 from tools import utils
 from base.base_crawler import AbstractLogin
 class DouYinLogin(AbstractLogin):
    """Log a Playwright browser session in to douyin.

    Only the QR code flow is implemented; the mobile and the cookie flows are
    placeholders that do nothing.
    """

    def __init__(self,
                 login_type: str,
                 browser_context: BrowserContext,
                 context_page: Page,
                 login_phone: str = None,
                 cookie_str: str = None
                 ):
        """Store the login settings and the Playwright handles.

        Args:
            login_type: "qrcode", "phone" or "cookies".
            browser_context: browser context whose cookies show the login state.
            context_page: page on which the login QR code is looked up.
            login_phone: phone number for the mobile flow.
            cookie_str: cookie string for the cookie flow.
        """
        self.login_type = login_type
        self.browser_context = browser_context
        self.context_page = context_page
        self.login_phone = login_phone
        self.cookie_str = cookie_str
        # Seconds to wait for the user to scan the QR code.
        self.scan_qrcode_time = 60

    async def begin(self):
        """Run the login flow selected by ``login_type``.

        Raises:
            ValueError: when ``login_type`` is none of the supported values.
        """
        if self.login_type == "qrcode":
            await self.login_by_qrcode()
        elif self.login_type == "phone":
            await self.login_by_mobile()
        elif self.login_type == "cookies":
            await self.login_by_cookies()
        else:
            raise ValueError("Invalid Login Type Currently only supported qrcode or phone or cookies ...")

    async def check_login_state(self):
        """Return True when the browser holds the cookie LOGIN_STATUS with value "1"."""
        current_cookie = await self.browser_context.cookies()
        _, cookie_dict = utils.convert_cookies(current_cookie)
        return cookie_dict.get("LOGIN_STATUS") == "1"

    async def login_by_qrcode(self):
        """Show the login QR code and wait until it has been scanned.

        Returns without waiting when no QR code is found but the session is
        already logged in. Terminates the program with exit status 1 when no
        QR code is found while logged out, or when the QR code is not scanned
        within ``scan_qrcode_time`` seconds.
        """
        print("Begin login douyin ...")
        # find login qrcode
        base64_qrcode_img = await utils.find_login_qrcode(
            self.context_page,
            selector="xpath=//article[@class='web-login']//img"
        )
        if not base64_qrcode_img:
            if await self.check_login_state():
                return
            # todo ...if this website does not automatically popup login dialog box, we will manual click login button
            print("login failed , have not found qrcode please check ....")
            # A failed login must not look like a successful run to the caller.
            sys.exit(1)

        # show login qrcode
        utils.show_qrcode(base64_qrcode_img)

        # Count down on a local value so a second attempt on the same object
        # gets the full waiting time again.
        for remaining in range(self.scan_qrcode_time - 1, -1, -1):
            await asyncio.sleep(1)
            print(f"waiting for scan code login, remaining time is {remaining} seconds")
            # get login state from browser
            if not await self.check_login_state():
                continue
            # If the QR code login is successful, you need to wait for a moment.
            # Because there will be a second redirection after successful login
            # executing JS during this period may be performed in a Page that has already been destroyed.
            wait_for_seconds = 5
            print(f"Login successful then wait for {wait_for_seconds} seconds redirect ...")
            for seconds_left in range(wait_for_seconds, 0, -1):
                await asyncio.sleep(1)
                print(f"remaining wait {seconds_left} seconds ...")
            return

        # The QR code was not scanned in time.
        sys.exit(1)

    async def login_by_mobile(self):
        """Placeholder: login by mobile phone is not implemented yet."""
        # todo implement login by mobile
        pass

    async def login_by_cookies(self):
        """Placeholder: login by cookies is not implemented yet."""
        pass
--- a/media_platform/xhs/core.py
+++ b/media_platform/xhs/core.py
@ -1,43 +1,35 @@
 import sys
 import random
 import asyncio
 import logging
 from asyncio import Task
-from typing import Optional, List, Dict
+from typing import Optional, List, Dict, Tuple
 from argparse import Namespace
 import aioredis
 from tenacity import (
    retry,
    stop_after_attempt,
    wait_fixed,
    retry_if_result
 )
 from playwright.async_api import Page
 from playwright.async_api import Cookie
 from playwright.async_api import BrowserContext
 from playwright.async_api import async_playwright
 import utils
 import config
-from .client import XHSClient
+from tools import utils
 from base_crawler import Crawler
 from models import xhs as xhs_model
 from .exception import *
 from .login import XHSLogin
 from .client import XHSClient
 from models import xhs as xhs_model
 from base.base_crawler import AbstractCrawler
 from base.proxy_account_pool import AccountPool
-class XiaoHongShuCrawler(Crawler):
+class XiaoHongShuCrawler(AbstractCrawler):
    def __init__(self):
        self.login_phone = None
        self.login_type = None
        self.keywords = None
        self.web_session = None
        self.cookies: Optional[List[Cookie]] = None  # cookies from browser context
        self.cookie_str: Optional[str] = None  # cookie string from config or command line
        self.browser_context: Optional[BrowserContext] = None
        self.context_page: Optional[Page] = None
        self.proxy: Optional[Dict] = None
        self.user_agent = utils.get_user_agent()
        self.xhs_client: Optional[XHSClient] = None
        self.index_url = "https://www.xiaohongshu.com"
        self.command_args: Optional[Namespace] = None
        self.account_pool: Optional[AccountPool] = None
    def init_config(self, **kwargs):
        for key in kwargs.keys():
@ -46,15 +38,32 @@ class XiaoHongShuCrawler(Crawler):
    async def update_cookies(self):
        self.cookies = await self.browser_context.cookies()
    def create_proxy_info(self) -> Tuple[str, Dict, str]:
        """Take one account from the pool and build its proxy settings.

        The phone number and the IP proxy both come from the account pool and
        are bound to each other as a fixed pair, for example phone
        ``13012345671`` with proxy ``111.122.xx.xx1:8888``.

        Returns:
            A ``(phone, playwright_proxy, httpx_proxy)`` tuple:
            ``playwright_proxy`` is a dict with the keys ``server``,
            ``username`` and ``password``; ``httpx_proxy`` is a single proxy
            URL with the credentials embedded.
        """
        # Imported here so the block stays self-contained.
        from urllib.parse import quote

        phone, ip_proxy = self.account_pool.get_account()
        playwright_proxy = {
            "server": f"{config.IP_PROXY_PROTOCOL}{ip_proxy}",
            "username": config.IP_PROXY_USER,
            "password": config.IP_PROXY_PASSWORD,
        }
        # Credentials become part of a URL here, so characters such as "@",
        # ":" or "/" must be percent-encoded or the URL is parsed wrongly.
        proxy_user = quote(str(config.IP_PROXY_USER), safe="")
        proxy_password = quote(str(config.IP_PROXY_PASSWORD), safe="")
        httpx_proxy = f"{config.IP_PROXY_PROTOCOL}{proxy_user}:{proxy_password}@{ip_proxy}"
        return phone, playwright_proxy, httpx_proxy
    async def start(self):
        account_phone, playwright_proxy, httpx_proxy = self.create_proxy_info()
        if not config.ENABLE_IP_PROXY:
            playwright_proxy, httpx_proxy = None, None
        async with async_playwright() as playwright:
            # launch browser and create single browser context
            chromium = playwright.chromium
-            browser = await chromium.launch(headless=True)
+            browser = await chromium.launch(headless=config.HEADLESS, proxy=playwright_proxy)
            self.browser_context = await browser.new_context(
                viewport={"width": 1920, "height": 1080},
-                user_agent=self.user_agent,
+                user_agent=self.user_agent
                proxy=self.proxy
            )
            # execute JS to bypass anti automation/crawler detection
@ -62,14 +71,23 @@ class XiaoHongShuCrawler(Crawler):
            self.context_page = await self.browser_context.new_page()
            await self.context_page.goto(self.index_url)
-            # scan qrcode login
+            # begin login
-            await self.login()
+            login_obj = XHSLogin(
                login_type=self.command_args.lt,
                login_phone=account_phone,
                browser_context=self.browser_context,
                context_page=self.context_page,
                cookie_str=config.COOKIES
            )
            await login_obj.begin()
            # update cookies
            await self.update_cookies()
            # init request client
            cookie_str, cookie_dict = utils.convert_cookies(self.cookies)
            self.xhs_client = XHSClient(
-                proxies=self.proxy,
+                proxies=httpx_proxy,
                headers={
                    "User-Agent": self.user_agent,
                    "Cookie": cookie_str,
@ -87,153 +105,15 @@ class XiaoHongShuCrawler(Crawler):
            # block main crawler coroutine
            await asyncio.Event().wait()
-    async def login(self):
+    async def close(self):
-        """login xiaohongshu website and keep webdriver login state"""
+        await self.browser_context.close()
-        # There are three ways to log in:
+        await self.browser_context.close()
-        # 1. Semi-automatic: Log in by scanning the QR code.
+        logging.info("Browser context closed ...")
        # 2. Fully automatic: Log in using forwarded text message notifications
        # 3. Semi-automatic: Log in using preset cookie
        if self.login_type == "qrcode":
            await self.login_by_qrcode()
        elif self.login_type == "phone":
            await self.login_by_mobile()
        elif self.login_type == "cookie":
            # cookie str convert to cookie dict
            for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
                await self.browser_context.add_cookies([{
                    'name': key,
                    'value': value,
                    'domain': ".xiaohongshu.com",
                    'path': "/"
                }])
        else:
            pass
    async def login_by_mobile(self):
        print("Start executing mobile phone number + verification code login on Xiaohongshu. ...")
        await asyncio.sleep(1)
        try:
            # After entering the main page of Xiaohongshu,
            # the login window may not pop up automatically and you need to manually click the login button.
            login_button_ele = await self.context_page.wait_for_selector(
                selector="xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button",
                timeout=5000
            )
            await login_button_ele.click()
            # There are also two types of login dialog boxes for pop-ups.
            # One type directly shows the phone number and verification code.
            # Another type requires clicking to switch to mobile login.
            element = await self.context_page.wait_for_selector(
                selector='xpath=//div[@class="login-container"]//div[@class="other-method"]/div[1]',
                timeout=5000
            )
            await element.click()
        except:
            print("have not found mobile button icon and keep going ...")
        await asyncio.sleep(1)
        login_container_ele = await self.context_page.wait_for_selector("div.login-container")
        # Fill login phone
        input_ele = await login_container_ele.query_selector("label.phone > input")
        await input_ele.fill(self.login_phone)
        await asyncio.sleep(0.5)
        # Click to send verification code and fill it from redis server.
        send_btn_ele = await login_container_ele.query_selector("label.auth-code > span")
        await send_btn_ele.click()
        sms_code_input_ele = await login_container_ele.query_selector("label.auth-code > input")
        submit_btn_ele = await login_container_ele.query_selector("div.input-container > button")
        redis_obj = aioredis.from_url(url=config.redis_db_host, password=config.redis_db_pwd, decode_responses=True)
        max_get_sms_code_time = 60 * 2
        current_cookie = await self.browser_context.cookies()
        _, cookie_dict = utils.convert_cookies(current_cookie)
        no_logged_in_session = cookie_dict.get("web_session")
        while max_get_sms_code_time > 0:
            print(f"get sms code from redis remaining time {max_get_sms_code_time}s ...")
            await asyncio.sleep(1)
            sms_code_key = f"xhs_{self.login_phone}"
            sms_code_value = await redis_obj.get(sms_code_key)
            if not sms_code_value:
                max_get_sms_code_time -= 1
                continue
            await sms_code_input_ele.fill(value=sms_code_value)  # Enter SMS verification code.
            await asyncio.sleep(0.5)
            agree_privacy_ele = self.context_page.locator("xpath=//div[@class='agreements']//*[local-name()='svg']")
            await agree_privacy_ele.click()  # Click "Agree" to the privacy policy.
            await asyncio.sleep(0.5)
            await submit_btn_ele.click()  # Click login button
            # todo ... It is necessary to check the correctness of the verification code,
            #  as it is possible that the entered verification code is incorrect.
            break
        login_flag: bool = await self.check_login_state(no_logged_in_session)
        if not login_flag:
            print("login failed please confirm sms code ...")
            sys.exit()
        wait_redirect_seconds = 5
        print(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
        await asyncio.sleep(wait_redirect_seconds)
    async def login_by_qrcode(self):
        """login xiaohongshu website and keep webdriver login state"""
        print("Start scanning QR code to log in to Xiaohongshu. ...")
        qrcode_img_selector = "xpath=//img[@class='qrcode-img']"
        # find login qrcode
        base64_qrcode_img = await utils.find_login_qrcode(
            self.context_page,
            selector=qrcode_img_selector
        )
        if not base64_qrcode_img:
            print("have not found qrcode and try again get it ....")
            # if this website does not automatically popup login dialog box, we will manual click login button
            login_button_ele = self.context_page.locator("xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button")
            await login_button_ele.click()
            base64_qrcode_img = await utils.find_login_qrcode(
                self.context_page,
                selector=qrcode_img_selector
            )
            if not base64_qrcode_img:
                print("login failed , program exit ...")
                sys.exit()
        # get not logged session
        current_cookie = await self.browser_context.cookies()
        _, cookie_dict = utils.convert_cookies(current_cookie)
        no_logged_in_session = cookie_dict.get("web_session")
        # show login qrcode
        utils.show_qrcode(base64_qrcode_img)
        print(f"waiting for scan code login, remaining time is 20s")
        login_flag: bool = await self.check_login_state(no_logged_in_session)
        if not login_flag:
            print("login failed please confirm ...")
            sys.exit()
        wait_redirect_seconds = 5
        print(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
        await asyncio.sleep(wait_redirect_seconds)
    @retry(stop=stop_after_attempt(30), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
    async def check_login_state(self, no_logged_in_session: str) -> bool:
        """Check if the current login status is successful and return True otherwise return False"""
        # If login is unsuccessful, a retry exception will be thrown.
        current_cookie = await self.browser_context.cookies()
        _, cookie_dict = utils.convert_cookies(current_cookie)
        current_web_session = cookie_dict.get("web_session")
        if current_web_session != no_logged_in_session:
            return True
        return False
    async def search_posts(self):
-        print("Begin search xiaohongshu keywords")
+        logging.info("Begin search xiaohongshu keywords")
-        # It is possible to modify the source code to allow for the passing of a batch of keywords.
+        for keyword in config.KEYWORDS.split(","):
-        for keyword in [self.keywords]:
+            logging.info(f"Current keyword: {keyword}")
            note_list: List[str] = []
            max_note_len = 10
            page = 1
@ -253,7 +133,7 @@ class XiaoHongShuCrawler(Crawler):
                    await xhs_model.update_xhs_note(note_detail)
                    await asyncio.sleep(0.05)
                    note_list.append(note_id)
-            print(f"keyword:{keyword}, note_list:{note_list}")
+            logging.info(f"keyword:{keyword}, note_list:{note_list}")
            await self.batch_get_note_comments(note_list)
    async def batch_get_note_comments(self, note_list: List[str]):
@ -264,7 +144,7 @@ class XiaoHongShuCrawler(Crawler):
        await asyncio.wait(task_list)
    async def get_comments(self, note_id: str):
-        print("Begin get note id comments ", note_id)
+        logging.info(f"Begin get note id comments {note_id}")
        all_comments = await self.xhs_client.get_note_all_comments(note_id=note_id, crawl_interval=random.random())
        for comment in all_comments:
            await xhs_model.update_xhs_note_comment(note_id=note_id, comment_item=comment)
--- a/media_platform/xhs/login.py
+++ b/media_platform/xhs/login.py
@ -0,0 +1,168 @@
 import sys
 import asyncio
 import logging
 import aioredis
 from tenacity import (
    retry,
    stop_after_attempt,
    wait_fixed,
    retry_if_result
 )
 from playwright.async_api import Page
 from playwright.async_api import BrowserContext
 import config
 from tools import utils
 from base.base_crawler import AbstractLogin
 class XHSLogin(AbstractLogin):
    def __init__(
        self,
        login_type: str,
        browser_context: BrowserContext,
        context_page: Page,
        login_phone: str = None,
        cookie_str: str = None,
    ):
        """Store everything the login flows need.

        :param login_type: which flow ``begin`` runs: "qrcode", "phone" or "cookies".
        :param browser_context: Playwright context whose cookies are read and written.
        :param context_page: page on which the login dialog is driven.
        :param login_phone: phone number typed into the form by the phone flow.
        :param cookie_str: cookie string injected by the cookies flow.
        """
        # Flow selector first, then the browser handles, then per-flow inputs.
        self.login_type = login_type
        self.context_page = context_page
        self.browser_context = browser_context
        self.cookie_str = cookie_str
        self.login_phone = login_phone
    @retry(
        stop=stop_after_attempt(20),
        wait=wait_fixed(1),
        retry=retry_if_result(lambda value: value is False),
        # Without this hook tenacity raises RetryError once the attempts are
        # exhausted, so callers could never observe a False result and their
        # "login failed" handling was unreachable.
        retry_error_callback=lambda retry_state: False,
    )
    async def check_login_state(self, no_logged_in_session: str) -> bool:
        """Poll until the ``web_session`` cookie differs from the logged-out value.

        The check is repeated up to 20 times, one second apart, for as long as
        it keeps returning ``False``.

        :param no_logged_in_session: ``web_session`` value captured before login.
        :return: ``True`` as soon as the cookie value has changed; ``False`` if
            it is still unchanged after the last attempt.
        """
        current_cookie = await self.browser_context.cookies()
        _, cookie_dict = utils.convert_cookies(current_cookie)
        current_web_session = cookie_dict.get("web_session")
        return current_web_session != no_logged_in_session
    async def begin(self):
        """Run the login flow selected by ``self.login_type``.

        :raises ValueError: if ``login_type`` is not "qrcode", "phone" or "cookies".
        """
        # Map each supported login type to the coroutine method implementing it.
        flows = {
            "qrcode": self.login_by_qrcode,
            "phone": self.login_by_mobile,
            "cookies": self.login_by_cookies,
        }
        flow = flows.get(self.login_type)
        if flow is None:
            raise ValueError("Invalid Login Type Currently only supported qrcode or phone or cookies ...")
        await flow()
    async def login_by_mobile(self):
        """Log in with ``self.login_phone`` and an SMS code read from Redis.

        Steps: open the phone-login form, request a verification code, poll
        Redis key ``xhs_<phone>`` for up to two minutes, submit the code, then
        wait for ``check_login_state`` to confirm the session cookie changed.

        Exits the process (``sys.exit``) when no code arrives in time or when
        the login state check reports failure.
        """
        logging.info("Begin login xiaohongshu by mobile ...")
        await asyncio.sleep(1)
        try:
            # The home page does not always pop up the login dialog by itself,
            # so click the login button manually.
            login_button_ele = await self.context_page.wait_for_selector(
                selector="xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button",
                timeout=5000
            )
            await login_button_ele.click()
            # The dialog comes in two forms: one shows the phone/code inputs
            # directly, the other needs a click to switch to phone login.
            # NOTE(review): if the login button lookup above fails, this switch
            # is skipped as well - confirm that is intended.
            element = await self.context_page.wait_for_selector(
                selector='xpath=//div[@class="login-container"]//div[@class="other-method"]/div[1]',
                timeout=5000
            )
            await element.click()
        except Exception:
            # Best effort: the form may already be in the desired state.
            logging.info("have not found mobile button icon and keep going ...")

        await asyncio.sleep(1)
        login_container_ele = await self.context_page.wait_for_selector("div.login-container")
        input_ele = await login_container_ele.query_selector("label.phone > input")
        await input_ele.fill(self.login_phone)
        await asyncio.sleep(0.5)

        send_btn_ele = await login_container_ele.query_selector("label.auth-code > span")
        await send_btn_ele.click()  # request the SMS verification code
        sms_code_input_ele = await login_container_ele.query_selector("label.auth-code > input")
        submit_btn_ele = await login_container_ele.query_selector("div.input-container > button")

        redis_obj = aioredis.from_url(url=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD, decode_responses=True)
        sms_code_key = f"xhs_{self.login_phone}"
        max_get_sms_code_time = 60 * 2  # wait at most two minutes for the code
        no_logged_in_session = ""
        sms_code_submitted = False
        try:
            while max_get_sms_code_time > 0:
                logging.info(f"get sms code from redis remaining time {max_get_sms_code_time}s ...")
                await asyncio.sleep(1)
                sms_code_value = await redis_obj.get(sms_code_key)
                if not sms_code_value:
                    max_get_sms_code_time -= 1
                    continue

                # Remember the logged-out session so the login check can
                # detect the cookie changing after the form is submitted.
                current_cookie = await self.browser_context.cookies()
                _, cookie_dict = utils.convert_cookies(current_cookie)
                no_logged_in_session = cookie_dict.get("web_session")

                await sms_code_input_ele.fill(value=sms_code_value)  # type the SMS code
                await asyncio.sleep(0.5)
                agree_privacy_ele = self.context_page.locator("xpath=//div[@class='agreements']//*[local-name()='svg']")
                await agree_privacy_ele.click()  # accept the privacy agreement
                await asyncio.sleep(0.5)

                await submit_btn_ele.click()  # submit the login form
                # TODO: the submitted code may be wrong; its validity is not checked yet.
                sms_code_submitted = True
                break
        finally:
            # Release the Redis connection whether or not a code arrived.
            await redis_obj.close()

        if not sms_code_submitted:
            # Nothing was submitted, so comparing cookies against the empty
            # placeholder session would wrongly report a successful login.
            logging.info("login failed, have not received sms code from redis ...")
            sys.exit()

        login_flag: bool = await self.check_login_state(no_logged_in_session)
        if not login_flag:
            logging.info("login failed please confirm ...")
            sys.exit()

        wait_redirect_seconds = 5
        logging.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
        await asyncio.sleep(wait_redirect_seconds)
    async def login_by_qrcode(self):
        """Log in by presenting the site's QR code and waiting for it to be scanned.

        Looks for the QR image on the page; if it is missing, clicks the login
        button and looks once more. The image is handed to
        ``utils.show_qrcode`` and ``check_login_state`` is awaited to detect
        the session cookie changing.

        Exits the process (``sys.exit``) when the QR code cannot be found or
        the login state check reports failure.
        """
        logging.info("Begin login xiaohongshu by qrcode ...")
        await asyncio.sleep(10)
        qrcode_img_selector = "xpath=//img[@class='qrcode-img']"

        # First attempt: the login dialog may already be on screen.
        base64_qrcode_img = await utils.find_login_qrcode(
            self.context_page,
            selector=qrcode_img_selector
        )
        if not base64_qrcode_img:
            logging.info("login failed , have not found qrcode please check ....")
            # The site did not pop up the login dialog by itself, so click
            # the login button manually and look again.
            await asyncio.sleep(0.5)
            login_button_ele = self.context_page.locator("xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button")
            await login_button_ele.click()
            base64_qrcode_img = await utils.find_login_qrcode(
                self.context_page,
                selector=qrcode_img_selector
            )
            if not base64_qrcode_img:
                # Say why we are stopping instead of exiting silently.
                logging.error("have not found qrcode after clicking login button, exit ...")
                sys.exit()

        # Capture the logged-out session before the user scans the code.
        current_cookie = await self.browser_context.cookies()
        _, cookie_dict = utils.convert_cookies(current_cookie)
        no_logged_in_session = cookie_dict.get("web_session")

        # Hand the QR code over for display.
        utils.show_qrcode(base64_qrcode_img)
        logging.info("waiting for scan code login, remaining time is 20s")
        login_flag: bool = await self.check_login_state(no_logged_in_session)
        if not login_flag:
            logging.info("login failed please confirm ...")
            sys.exit()

        wait_redirect_seconds = 5
        logging.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
        await asyncio.sleep(wait_redirect_seconds)
    async def login_by_cookies(self):
        """Inject the cookies parsed from ``self.cookie_str`` into the browser context.

        Every cookie is scoped to domain ``.xiaohongshu.com`` and path ``/``.
        """
        logging.info("Begin login xiaohongshu by cookie ...")
        cookies = [
            {
                'name': key,
                'value': value,
                'domain': ".xiaohongshu.com",
                'path': "/"
            }
            for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items()
        ]
        # add_cookies takes a list, so one call replaces a call per cookie;
        # skip it entirely when there is nothing to add.
        if cookies:
            await self.browser_context.add_cookies(cookies)
--- a/models/douyin/m_douyin.py
+++ b/models/douyin/m_douyin.py
@ -1,7 +1,7 @@
 import json
 from typing import Dict, List
-import utils
+from tools import utils
 async def update_douyin_aweme(aweme_item: Dict):
@ -24,7 +24,7 @@ async def update_douyin_aweme(aweme_item: Dict):
        "last_modify_ts": utils.get_current_timestamp(),
    }
    # do something ...
-    print(f"update douyin aweme id:{aweme_id}, title:{local_db_item.get('title')}")
+    print(f"douyin aweme id:{aweme_id}, title:{local_db_item.get('title')}")
 async def batch_update_dy_aweme_comments(aweme_id: str, comments: List[Dict]):
@ -61,4 +61,4 @@ async def update_dy_aweme_comment(aweme_id: str, comment_item: Dict):
        "last_modify_ts": utils.get_current_timestamp(),
    }
    # do something ...
-    print(f"update aweme comment: {comment_id}, content: {local_db_item.get('content')}")
+    print(f"douyin aweme comment: {comment_id}, content: {local_db_item.get('content')}")
--- a/models/xhs/m_xhs.py
+++ b/models/xhs/m_xhs.py
@ -1,6 +1,6 @@
 from typing import Dict
-import utils
+from tools import utils
 async def update_xhs_note(note_item: Dict):
@ -24,7 +24,7 @@ async def update_xhs_note(note_item: Dict):
        "last_modify_ts": utils.get_current_timestamp(),
    }
    # do something ...
-    print("update note:", local_db_item)
+    print("xhs note:", local_db_item)
 async def update_xhs_note_comment(note_id: str, comment_item: Dict):
@ -43,4 +43,4 @@ async def update_xhs_note_comment(note_id: str, comment_item: Dict):
        "last_modify_ts": utils.get_current_timestamp(),
    }
    # do something ...
-    print("update comment:", local_db_item)
+    print("xhs note comment:", local_db_item)
--- a/tools/init.py
+++ b/tools/init.py
--- a/tools/recv_sms_notification.py
+++ b/tools/recv_sms_notification.py
--- a/tools/utils.py
+++ b/tools/utils.py
@ -2,6 +2,7 @@ import re
 import time
 import random
 import base64
 import logging
 from io import BytesIO
 from typing import Optional, Dict, List, Tuple
@ -91,3 +92,13 @@ def match_interact_info_count(count_str: str) -> int:
        return int(number)
    else:
        return 0
 def init_loging_config():
    """Configure root logging at INFO level and return the crawler's logger.

    :return: the logger registered under the name "Media Crawler".
    """
    level = logging.INFO
    logging.basicConfig(
        level=level,
        format="%(asctime)s %(name)s %(levelname)s %(message)s ",
        datefmt='%Y-%m-%d  %H:%M:%S'
    )
    # logging.Logger(...) only built a throwaway, unregistered object;
    # getLogger registers the named logger so it can be looked up again.
    return logging.getLogger("Media Crawler")
		`@ -0,0 +1,2 @@`
							`from .base_config import *`
							`from .account_config import *`