parent
963d9a16d3
commit
b8093a2c0f
|
@ -13,6 +13,7 @@
|
|||
- [x] 小红书 笔记、评论
|
||||
- [x] 小红书 二维码扫描登录 | 手机号+验证码自动登录 | cookies登录
|
||||
- [x] 爬取抖音视频、评论
|
||||
- [x] IP代理池,账号池
|
||||
- [ ] To do 抖音滑块
|
||||
|
||||
## 技术栈
|
||||
|
@ -28,7 +29,7 @@
|
|||
2. 安装playwright浏览器驱动
|
||||
`playwright install`
|
||||
3. 运行爬虫程序
|
||||
`python main.py --platform xhs --keywords 健身 --lt qrcode`
|
||||
`python main.py --platform xhs --lt qrcode`
|
||||
4. 打开小红书扫二维码登录
|
||||
|
||||
## 小红书运行截图
|
||||
|
@ -46,8 +47,8 @@
|
|||
- 转发软件中配置WEBHOOK相关的信息,主要分为 消息模板(请查看本项目中的recv_sms_notification.py)、一个能push短信通知的API地址
|
||||
- push的API地址一般是需要绑定一个域名的(当然也可以是内网的IP地址),我用的是内网穿透方式,会有一个免费的域名绑定到内网的web server,内网穿透工具 [ngrok](https://ngrok.com/docs/)
|
||||
- 安装redis并设置一个密码 [redis安装](https://www.cnblogs.com/hunanzp/p/12304622.html)
|
||||
- 执行 `python recv_sms_notification.py` 等待短信转发器发送HTTP通知
|
||||
- 执行手机号登录的爬虫程序 `python main.py --platform xhs --keywords 健身 --lt phone --phone 13812345678`
|
||||
- 执行 `python tools/recv_sms_notification.py` 等待短信转发器发送HTTP通知
|
||||
- 执行手机号登录的爬虫程序 `python main.py --platform xhs --lt phone`
|
||||
|
||||
备注:
|
||||
- 小红书这边一个手机号一天只能发10条短信(悠着点),目前在发验证码时还未触发滑块验证,估计多了之后也会有~
|
||||
|
|
|
@ -0,0 +1,41 @@
|
|||
from abc import ABC, abstractmethod
|
||||
|
||||
|
||||
class AbstractCrawler(ABC):
    """Interface every media-platform crawler (xhs, douyin, ...) must implement."""

    @abstractmethod
    def init_config(self, **kwargs):
        """Inject runtime configuration (e.g. parsed CLI args, account pool)."""
        pass

    @abstractmethod
    async def start(self):
        """Launch the platform session and begin crawling."""
        pass

    @abstractmethod
    async def search_posts(self):
        """Search posts/notes for the configured keywords."""
        pass

    @abstractmethod
    async def get_comments(self, item_id: int):
        """Fetch the comments of a single post identified by *item_id*."""
        pass
|
||||
|
||||
|
||||
class AbstractLogin(ABC):
    """Interface for a platform login flow (QR code, SMS code or cookies)."""

    @abstractmethod
    async def begin(self):
        """Dispatch to the concrete login method based on the configured type."""
        pass

    @abstractmethod
    async def check_login_state(self):
        """Return True when the browser session is authenticated."""
        pass

    @abstractmethod
    async def login_by_qrcode(self):
        """Log in by displaying a QR code for the user to scan."""
        pass

    @abstractmethod
    async def login_by_mobile(self):
        """Log in with phone number + SMS verification code."""
        pass

    @abstractmethod
    async def login_by_cookies(self):
        """Log in by injecting pre-saved cookies."""
        pass
|
|
@ -0,0 +1,130 @@
|
|||
import config
|
||||
|
||||
|
||||
class PhonePool:
    """Pool of login phone numbers with used/unused bookkeeping."""

    def __init__(self):
        self.phones = []          # numbers still available, FIFO order
        self.used_phones = set()  # numbers handed out by get_phone()

    def add_phone(self, phone):
        """Add a phone to the pool; return True if it was not present yet."""
        if phone not in self.phones:
            self.phones.append(phone)
            return True
        return False

    def remove_phone(self, phone):
        """Remove a phone from the pool, whether available or already used.

        Bug fix: the original only acted when the phone was in
        ``used_phones`` and then called ``self.phones.remove(phone)`` — but a
        used phone has already been popped from ``self.phones`` by
        ``get_phone()``, so that path raised ValueError. Each collection is
        now cleaned up independently.

        Returns True if the phone was found anywhere, False otherwise.
        """
        removed = False
        if phone in self.phones:
            self.phones.remove(phone)
            removed = True
        if phone in self.used_phones:
            self.used_phones.remove(phone)
            removed = True
        return removed

    def get_phone(self):
        """Pop the oldest available phone and mark it used; None when empty."""
        if self.phones:
            left_phone = self.phones.pop(0)
            self.used_phones.add(left_phone)
            return left_phone
        return None

    def clear(self):
        """Drop every phone, available and used."""
        self.phones = []
        self.used_phones = set()
|
||||
|
||||
|
||||
class IPPool:
    """Pool of proxy IP addresses with used/unused bookkeeping."""

    def __init__(self):
        self.ips = []          # proxies still available, FIFO order
        self.used_ips = set()  # proxies handed out by get_ip()

    def add_ip(self, ip):
        """Add an IP to the pool; return True if it was not present yet."""
        if ip not in self.ips:
            self.ips.append(ip)
            return True
        return False

    def remove_ip(self, ip):
        """Remove an IP from the pool, whether available or already used.

        Bug fix: the original only acted when the IP was in ``used_ips`` and
        then called ``self.ips.remove(ip)`` — but a used IP has already been
        popped from ``self.ips`` by ``get_ip()``, so that path raised
        ValueError. Each collection is now cleaned up independently.

        Returns True if the IP was found anywhere, False otherwise.
        """
        removed = False
        if ip in self.ips:
            self.ips.remove(ip)
            removed = True
        if ip in self.used_ips:
            self.used_ips.remove(ip)
            removed = True
        return removed

    def get_ip(self):
        """Pop the oldest available IP and mark it used; None when empty."""
        if self.ips:
            next_ip = self.ips.pop(0)
            self.used_ips.add(next_ip)
            return next_ip
        return None

    def clear(self):
        """Drop every IP, available and used."""
        self.ips = []
        self.used_ips = set()
|
||||
|
||||
|
||||
class AccountPool:
    """Pairs a PhonePool and an IPPool; an account is a fixed (phone, ip) pair."""

    def __init__(self):
        self.phone_pool = PhonePool()
        self.ip_pool = IPPool()

    def add_account(self, phone, ip):
        """Add a (phone, ip) pair; True only when both parts were newly added."""
        if self.phone_pool.add_phone(phone) and self.ip_pool.add_ip(ip):
            return True
        return False

    def remove_account(self, phone, ip):
        """Remove a (phone, ip) pair from both underlying pools.

        Bug fix: the original used ``and`` short-circuiting, so when the
        phone removal failed the IP was never removed and the two pools could
        drift out of sync. Both removals now always run.
        """
        phone_removed = self.phone_pool.remove_phone(phone)
        ip_removed = self.ip_pool.remove_ip(ip)
        return phone_removed and ip_removed

    def get_account(self):
        """Return one (phone, ip) pair, refilling from config when exhausted.

        Bug fix: the original called itself recursively after every reload,
        which recursed forever when ``config.PHONE_LIST``/``IP_PROXY_LIST``
        are empty. We now reload at most once and return (None, None) if the
        pool is still empty afterwards.
        """
        phone = self.phone_pool.get_phone()
        ip = self.ip_pool.get_ip()
        if not phone or not ip:
            reload_account_pool(self)
            phone = self.phone_pool.get_phone()
            ip = self.ip_pool.get_ip()
        return phone, ip

    def clear_account(self):
        """Empty both underlying pools."""
        self.phone_pool.clear()
        self.ip_pool.clear()
|
||||
|
||||
|
||||
def reload_account_pool(apo: AccountPool):
    """Reset *apo* and refill it with the phone/IP pairs declared in config.

    Phones and proxy IPs are paired positionally, so the two config lists
    should have the same length.
    """
    apo.clear_account()
    account_pairs = zip(config.PHONE_LIST, config.IP_PROXY_LIST)
    for phone_number, proxy_ip in account_pairs:
        apo.add_account(phone_number, proxy_ip)
|
||||
|
||||
|
||||
def create_account_pool() -> AccountPool:
    """Build a new AccountPool pre-loaded from the configured account lists."""
    pool = AccountPool()
    reload_account_pool(apo=pool)
    return pool
|
||||
|
||||
|
||||
if __name__ == '__main__':
    import time

    # Smoke test: repeatedly draw (phone, ip) pairs off the pool and print them.
    ac_pool = create_account_pool()
    p, i = ac_pool.get_account()
    while p:
        print(f"get phone:{p}, ip proxy:{i} from account pool")
        p, i = ac_pool.get_account()
        # NOTE(review): get_account() reloads the pool whenever it runs dry,
        # so this loop only terminates if the configured account list is
        # empty — confirm the endless demo loop is intended.
        time.sleep(1)
|
|
@ -1,23 +0,0 @@
|
|||
from abc import ABC, abstractmethod
|
||||
|
||||
|
||||
class Crawler(ABC):
|
||||
@abstractmethod
|
||||
def init_config(self, **kwargs):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def start(self):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def login(self):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def search_posts(self):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def get_comments(self, item_id: int):
|
||||
pass
|
14
config.py
14
config.py
|
@ -1,14 +0,0 @@
|
|||
# config file
|
||||
|
||||
platform = "xhs"
|
||||
keyword = "健身"
|
||||
login_type = "cookie" # qrcode or phone or cookie
|
||||
login_phone = "" # your login phone
|
||||
|
||||
# If it's on the Xiaohongshu platform, only the web_session cookie will be kept.
|
||||
# web_session=040069b2acxxxxxxxxxxxxxxxxxxxx;
|
||||
cookies = ""
|
||||
|
||||
# redis config
|
||||
redis_db_host = "redis://127.0.0.1"
|
||||
redis_db_pwd = "123456" # your redis password
|
|
@ -0,0 +1,2 @@
|
|||
from .base_config import *
|
||||
from .account_config import *
|
|
@ -0,0 +1,27 @@
|
|||
# -*- coding: utf-8 -*-
# account_config.py
# Placeholder account data consumed by base/proxy_account_pool.py.
# Phones and proxies are paired positionally (zipped), so keep the two
# lists the same length.

# Login phone numbers (dummy values — replace with real accounts).
PHONE_LIST = [
    "13012345671",
    "13012345672",
    "13012345673",
    "13012345674",
    "13012345675",
    "13012345676",
    # ...
]

# Proxy "host:port" entries, bound 1:1 to PHONE_LIST by position.
IP_PROXY_LIST = [
    "111.122.xx.xx1:8888",
    "111.122.xx.xx2:8888",
    "111.122.xx.xx3:8888",
    "111.122.xx.xx4:8888",
    "111.122.xx.xx5:8888",
    "111.122.xx.xx6:8888",
    # ...
]

# Scheme and credentials shared by every proxy entry above.
IP_PROXY_PROTOCOL = "http://"
IP_PROXY_USER = "xxxx"
IP_PROXY_PASSWORD = "xxxx"
|
||||
|
|
@ -0,0 +1,19 @@
|
|||
# Platform to crawl: "xhs" (xiaohongshu) or "dy" (douyin).
PLATFORM = "xhs"
# Comma-separated search keywords; the crawlers split this on ",".
KEYWORDS = "健身,旅游"
LOGIN_TYPE = "qrcode"  # qrcode or phone or cookies
# If it's on the Xiaohongshu platform, only the web_session cookie will be kept.
# xhs cookie format -> web_session=040069b2acxxxxxxxxxxxxxxxxxxxx;
COOKIES = ""

# redis config (used by phone login to receive forwarded SMS codes)
REDIS_DB_HOST = "redis://127.0.0.1"  # your redis host
REDIS_DB_PWD = "123456"  # your redis password

# enable ip proxy (when False, proxies built from account_config are ignored)
ENABLE_IP_PROXY = False

# retry_interval: pause between crawler restarts after an exception
RETRY_INTERVAL = 60 * 30  # 30 minutes

# playwright headless browser mode
HEADLESS = True
|
33
main.py
33
main.py
|
@ -3,6 +3,8 @@ import asyncio
|
|||
import argparse
|
||||
|
||||
import config
|
||||
from tools import utils
|
||||
from base import proxy_account_pool
|
||||
from media_platform.douyin import DouYinCrawler
|
||||
from media_platform.xhs import XiaoHongShuCrawler
|
||||
|
||||
|
@ -19,24 +21,37 @@ class CrawlerFactory:
|
|||
|
||||
|
||||
async def main():
|
||||
utils.init_loging_config()
|
||||
# define command line params ...
|
||||
parser = argparse.ArgumentParser(description='Media crawler program.')
|
||||
parser.add_argument('--platform', type=str, help='Media platform select (xhs|dy)...', default=config.platform)
|
||||
parser.add_argument('--keywords', type=str, help='Search note/page keywords...', default=config.keyword)
|
||||
parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)', default=config.login_type)
|
||||
parser.add_argument('--phone', type=str, help='Login phone', default=config.login_phone)
|
||||
parser.add_argument('--cookies', type=str, help='cookies to keep log in', default=config.cookies)
|
||||
parser.add_argument('--platform', type=str, help='Media platform select (xhs|dy)...', default=config.PLATFORM)
|
||||
parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)', default=config.LOGIN_TYPE)
|
||||
|
||||
# init account pool
|
||||
account_pool = proxy_account_pool.create_account_pool()
|
||||
|
||||
args = parser.parse_args()
|
||||
crawler = CrawlerFactory().create_crawler(platform=args.platform)
|
||||
crawler.init_config(
|
||||
keywords=args.keywords,
|
||||
login_phone=args.phone,
|
||||
login_type=args.lt,
|
||||
cookie_str=args.cookies
|
||||
command_args=args,
|
||||
account_pool=account_pool
|
||||
)
|
||||
await crawler.start()
|
||||
|
||||
"""
|
||||
# retry when exception ...
|
||||
while True:
|
||||
try:
|
||||
await crawler.start()
|
||||
except Exception as e:
|
||||
logging.info(f"crawler start error: {e} ...")
|
||||
await crawler.close()
|
||||
# If you encounter an exception
|
||||
# sleep for a period of time before retrying
|
||||
# to avoid frequent requests that may result in the account being blocked.
|
||||
await asyncio.sleep(config.RETRY_INTERVAL)
|
||||
"""
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
try:
|
||||
|
|
|
@ -1,42 +1,64 @@
|
|||
import logging
|
||||
import asyncio
|
||||
from asyncio import Task
|
||||
from typing import Optional, List, Dict
|
||||
from argparse import Namespace
|
||||
from typing import Optional, List, Dict, Tuple
|
||||
|
||||
from playwright.async_api import async_playwright
|
||||
from playwright.async_api import Page
|
||||
from playwright.async_api import Cookie
|
||||
from playwright.async_api import BrowserContext
|
||||
|
||||
import utils
|
||||
import config
|
||||
from tools import utils
|
||||
from .client import DOUYINClient
|
||||
from .exception import DataFetchError
|
||||
from base_crawler import Crawler
|
||||
from .login import DouYinLogin
|
||||
from base.base_crawler import AbstractCrawler
|
||||
from base.proxy_account_pool import AccountPool
|
||||
from models import douyin
|
||||
|
||||
|
||||
class DouYinCrawler(Crawler):
|
||||
class DouYinCrawler(AbstractCrawler):
|
||||
    def __init__(self):
        """Declare all crawler state; real values are injected via init_config()."""
        self.keywords: Optional[str] = None
        self.cookies: Optional[List[Cookie]] = None  # cookies captured from the browser context
        self.browser_context: Optional[BrowserContext] = None
        self.context_page: Optional[Page] = None
        self.proxy: Optional[Dict] = None
        self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"  # fixed
        self.dy_client: Optional[DOUYINClient] = None  # HTTP API client, built in start()
        self.command_args: Optional[Namespace] = None  # parsed CLI args (set via init_config)
        self.account_pool: Optional[AccountPool] = None  # phone/IP account pool (set via init_config)
|
||||
|
||||
def init_config(self, **kwargs):
|
||||
for key, value in kwargs.items():
|
||||
setattr(self, key, value)
|
||||
|
||||
    def create_proxy_info(self) -> Tuple[str, Dict, str]:
        """Create proxy info for playwright and httpx"""
        # phone: 13012345671
        # ip_proxy: 111.122.xx.xx1:8888
        # The phone number and the proxy IP both come from the account pool,
        # where they are bound together as a fixed pair.
        phone, ip_proxy = self.account_pool.get_account()
        # Dict in the shape playwright's launch(proxy=...) expects.
        playwright_proxy = {
            "server": f"{config.IP_PROXY_PROTOCOL}{ip_proxy}",
            "username": config.IP_PROXY_USER,
            "password": config.IP_PROXY_PASSWORD,
        }
        # httpx takes a single proxy URL: scheme://user:password@host:port
        httpx_proxy = f"{config.IP_PROXY_PROTOCOL}{config.IP_PROXY_USER}:{config.IP_PROXY_PASSWORD}@{ip_proxy}"
        return phone, playwright_proxy, httpx_proxy
|
||||
|
||||
async def start(self):
|
||||
account_phone, playwright_proxy, httpx_proxy = self.create_proxy_info()
|
||||
if not config.ENABLE_IP_PROXY:
|
||||
playwright_proxy, httpx_proxy = None, None
|
||||
|
||||
async with async_playwright() as playwright:
|
||||
chromium = playwright.chromium
|
||||
browser = await chromium.launch(headless=True)
|
||||
browser = await chromium.launch(headless=True, proxy=playwright_proxy)
|
||||
self.browser_context = await browser.new_context(
|
||||
viewport={"width": 1800, "height": 900},
|
||||
user_agent=self.user_agent,
|
||||
proxy=self.proxy
|
||||
)
|
||||
# execute JS to bypass anti automation/crawler detection
|
||||
await self.browser_context.add_init_script(path="libs/stealth.min.js")
|
||||
|
@ -44,14 +66,23 @@ class DouYinCrawler(Crawler):
|
|||
await self.context_page.goto("https://www.douyin.com", wait_until="domcontentloaded")
|
||||
await asyncio.sleep(3)
|
||||
|
||||
# scan qrcode login
|
||||
# await self.login()
|
||||
# begin login
|
||||
login_obj = DouYinLogin(
|
||||
login_type=self.command_args.lt,
|
||||
login_phone=account_phone,
|
||||
browser_context=self.browser_context,
|
||||
context_page=self.context_page,
|
||||
cookie_str=config.COOKIES
|
||||
)
|
||||
# await login_obj.begin()
|
||||
|
||||
# update cookies
|
||||
await self.update_cookies()
|
||||
|
||||
# init request client
|
||||
cookie_str, cookie_dict = utils.convert_cookies(self.cookies)
|
||||
self.dy_client = DOUYINClient(
|
||||
proxies=self.proxy,
|
||||
proxies=httpx_proxy,
|
||||
headers={
|
||||
"User-Agent": self.user_agent,
|
||||
"Cookie": cookie_str,
|
||||
|
@ -73,23 +104,10 @@ class DouYinCrawler(Crawler):
|
|||
async def update_cookies(self):
|
||||
self.cookies = await self.browser_context.cookies()
|
||||
|
||||
async def login(self):
|
||||
"""login douyin website and keep webdriver login state"""
|
||||
print("Begin login douyin ...")
|
||||
# todo ...
|
||||
|
||||
async def check_login_state(self) -> bool:
|
||||
"""Check if the current login status is successful and return True otherwise return False"""
|
||||
current_cookie = await self.browser_context.cookies()
|
||||
_, cookie_dict = utils.convert_cookies(current_cookie)
|
||||
if cookie_dict.get("LOGIN_STATUS") == "1":
|
||||
return True
|
||||
return False
|
||||
|
||||
async def search_posts(self):
|
||||
# It is possible to modify the source code to allow for the passing of a batch of keywords.
|
||||
for keyword in [self.keywords]:
|
||||
print("Begin search douyin keywords: ", keyword)
|
||||
logging.info("Begin search douyin keywords")
|
||||
for keyword in config.KEYWORDS.split(","):
|
||||
logging.info(f"Current keyword: {keyword}")
|
||||
aweme_list: List[str] = []
|
||||
max_note_len = 20
|
||||
page = 0
|
||||
|
|
|
@ -0,0 +1,86 @@
|
|||
import sys
|
||||
import asyncio
|
||||
|
||||
from playwright.async_api import Page
|
||||
from playwright.async_api import BrowserContext
|
||||
|
||||
from tools import utils
|
||||
from base.base_crawler import AbstractLogin
|
||||
|
||||
|
||||
class DouYinLogin(AbstractLogin):
    """Douyin login flow: QR-code scan; mobile and cookie login are stubs."""

    async def login_by_cookies(self):
        # Not implemented yet for douyin.
        pass

    def __init__(self,
                 login_type: str,
                 browser_context: BrowserContext,
                 context_page: Page,
                 login_phone: str = None,
                 cookie_str: str = None
                 ):
        # "qrcode" | "phone" | "cookies" — selects the flow in begin()
        self.login_type = login_type
        self.browser_context = browser_context
        self.context_page = context_page
        self.login_phone = login_phone
        self.cookie_str = cookie_str
        # seconds the user has to scan the QR code before we give up
        self.scan_qrcode_time = 60

    async def check_login_state(self):
        """Check if the current login status is successful and return True otherwise return False"""
        current_cookie = await self.browser_context.cookies()
        _, cookie_dict = utils.convert_cookies(current_cookie)
        # Douyin marks an authenticated session with the LOGIN_STATUS=1 cookie.
        if cookie_dict.get("LOGIN_STATUS") == "1":
            return True
        return False

    async def login_by_qrcode(self):
        """login douyin website and keep webdriver login state"""
        print("Begin login douyin ...")
        # find login qrcode
        base64_qrcode_img = await utils.find_login_qrcode(
            self.context_page,
            selector="xpath=//article[@class='web-login']//img"
        )
        if not base64_qrcode_img:
            # No QR code on screen — maybe the session is already logged in.
            if await self.check_login_state():
                return
            # todo ...if this website does not automatically popup login dialog box, we will manual click login button
            print("login failed , have not found qrcode please check ....")
            sys.exit()

        # show login qrcode
        utils.show_qrcode(base64_qrcode_img)

        # Poll once per second until the user scans or the timer runs out.
        while self.scan_qrcode_time > 0:
            await asyncio.sleep(1)
            self.scan_qrcode_time -= 1
            print(f"waiting for scan code login, remaining time is {self.scan_qrcode_time} seconds")
            # get login state from browser
            if await self.check_login_state():
                # If the QR code login is successful, you need to wait for a moment.
                # Because there will be a second redirection after successful login
                # executing JS during this period may be performed in a Page that has already been destroyed.
                wait_for_seconds = 5
                print(f"Login successful then wait for {wait_for_seconds} seconds redirect ...")
                while wait_for_seconds > 0:
                    await asyncio.sleep(1)
                    print(f"remaining wait {wait_for_seconds} seconds ...")
                    wait_for_seconds -= 1
                break
        else:
            # while/else: timer expired without a successful scan — abort.
            sys.exit()

    async def login_by_mobile(self):
        # todo implement login by mobile
        pass

    async def begin(self):
        """Dispatch to the concrete login flow selected by login_type."""
        if self.login_type == "qrcode":
            await self.login_by_qrcode()
        elif self.login_type == "phone":
            await self.login_by_mobile()
        elif self.login_type == "cookies":
            await self.login_by_cookies()
        else:
            raise ValueError("Invalid Login Type Currently only supported qrcode or phone ...")
|
|
@ -1,43 +1,35 @@
|
|||
import sys
|
||||
import random
|
||||
import asyncio
|
||||
import logging
|
||||
from asyncio import Task
|
||||
from typing import Optional, List, Dict
|
||||
from typing import Optional, List, Dict, Tuple
|
||||
from argparse import Namespace
|
||||
|
||||
import aioredis
|
||||
from tenacity import (
|
||||
retry,
|
||||
stop_after_attempt,
|
||||
wait_fixed,
|
||||
retry_if_result
|
||||
)
|
||||
from playwright.async_api import Page
|
||||
from playwright.async_api import Cookie
|
||||
from playwright.async_api import BrowserContext
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
import utils
|
||||
import config
|
||||
from .client import XHSClient
|
||||
from base_crawler import Crawler
|
||||
from models import xhs as xhs_model
|
||||
from tools import utils
|
||||
from .exception import *
|
||||
from .login import XHSLogin
|
||||
from .client import XHSClient
|
||||
from models import xhs as xhs_model
|
||||
from base.base_crawler import AbstractCrawler
|
||||
from base.proxy_account_pool import AccountPool
|
||||
|
||||
|
||||
class XiaoHongShuCrawler(Crawler):
|
||||
class XiaoHongShuCrawler(AbstractCrawler):
|
||||
def __init__(self):
|
||||
self.login_phone = None
|
||||
self.login_type = None
|
||||
self.keywords = None
|
||||
self.web_session = None
|
||||
self.cookies: Optional[List[Cookie]] = None # cookies from browser context
|
||||
self.cookie_str: Optional[str] = None # cookie string from config or command line
|
||||
self.browser_context: Optional[BrowserContext] = None
|
||||
self.context_page: Optional[Page] = None
|
||||
self.proxy: Optional[Dict] = None
|
||||
self.user_agent = utils.get_user_agent()
|
||||
self.xhs_client: Optional[XHSClient] = None
|
||||
self.index_url = "https://www.xiaohongshu.com"
|
||||
self.command_args: Optional[Namespace] = None
|
||||
self.account_pool: Optional[AccountPool] = None
|
||||
|
||||
def init_config(self, **kwargs):
|
||||
for key in kwargs.keys():
|
||||
|
@ -46,15 +38,32 @@ class XiaoHongShuCrawler(Crawler):
|
|||
async def update_cookies(self):
|
||||
self.cookies = await self.browser_context.cookies()
|
||||
|
||||
    def create_proxy_info(self) -> Tuple[str, Dict, str]:
        """Create proxy info for playwright and httpx"""
        # phone: 13012345671
        # ip_proxy: 111.122.xx.xx1:8888
        # The phone number and the proxy IP both come from the account pool,
        # where they are bound together as a fixed pair.
        phone, ip_proxy = self.account_pool.get_account()
        # Dict in the shape playwright's launch(proxy=...) expects.
        playwright_proxy = {
            "server": f"{config.IP_PROXY_PROTOCOL}{ip_proxy}",
            "username": config.IP_PROXY_USER,
            "password": config.IP_PROXY_PASSWORD,
        }
        # httpx takes a single proxy URL: scheme://user:password@host:port
        httpx_proxy = f"{config.IP_PROXY_PROTOCOL}{config.IP_PROXY_USER}:{config.IP_PROXY_PASSWORD}@{ip_proxy}"
        return phone, playwright_proxy, httpx_proxy
|
||||
|
||||
async def start(self):
|
||||
account_phone, playwright_proxy, httpx_proxy = self.create_proxy_info()
|
||||
if not config.ENABLE_IP_PROXY:
|
||||
playwright_proxy, httpx_proxy = None, None
|
||||
|
||||
async with async_playwright() as playwright:
|
||||
# launch browser and create single browser context
|
||||
chromium = playwright.chromium
|
||||
browser = await chromium.launch(headless=True)
|
||||
browser = await chromium.launch(headless=config.HEADLESS, proxy=playwright_proxy)
|
||||
self.browser_context = await browser.new_context(
|
||||
viewport={"width": 1920, "height": 1080},
|
||||
user_agent=self.user_agent,
|
||||
proxy=self.proxy
|
||||
user_agent=self.user_agent
|
||||
)
|
||||
|
||||
# execute JS to bypass anti automation/crawler detection
|
||||
|
@ -62,14 +71,23 @@ class XiaoHongShuCrawler(Crawler):
|
|||
self.context_page = await self.browser_context.new_page()
|
||||
await self.context_page.goto(self.index_url)
|
||||
|
||||
# scan qrcode login
|
||||
await self.login()
|
||||
# begin login
|
||||
login_obj = XHSLogin(
|
||||
login_type=self.command_args.lt,
|
||||
login_phone=account_phone,
|
||||
browser_context=self.browser_context,
|
||||
context_page=self.context_page,
|
||||
cookie_str=config.COOKIES
|
||||
)
|
||||
await login_obj.begin()
|
||||
|
||||
# update cookies
|
||||
await self.update_cookies()
|
||||
|
||||
# init request client
|
||||
cookie_str, cookie_dict = utils.convert_cookies(self.cookies)
|
||||
self.xhs_client = XHSClient(
|
||||
proxies=self.proxy,
|
||||
proxies=httpx_proxy,
|
||||
headers={
|
||||
"User-Agent": self.user_agent,
|
||||
"Cookie": cookie_str,
|
||||
|
@ -87,153 +105,15 @@ class XiaoHongShuCrawler(Crawler):
|
|||
# block main crawler coroutine
|
||||
await asyncio.Event().wait()
|
||||
|
||||
async def login(self):
|
||||
"""login xiaohongshu website and keep webdriver login state"""
|
||||
# There are three ways to log in:
|
||||
# 1. Semi-automatic: Log in by scanning the QR code.
|
||||
# 2. Fully automatic: Log in using forwarded text message notifications
|
||||
# 3. Semi-automatic: Log in using preset cookie
|
||||
if self.login_type == "qrcode":
|
||||
await self.login_by_qrcode()
|
||||
elif self.login_type == "phone":
|
||||
await self.login_by_mobile()
|
||||
elif self.login_type == "cookie":
|
||||
# cookie str convert to cookie dict
|
||||
for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
|
||||
await self.browser_context.add_cookies([{
|
||||
'name': key,
|
||||
'value': value,
|
||||
'domain': ".xiaohongshu.com",
|
||||
'path': "/"
|
||||
}])
|
||||
else:
|
||||
pass
|
||||
|
||||
async def login_by_mobile(self):
|
||||
print("Start executing mobile phone number + verification code login on Xiaohongshu. ...")
|
||||
|
||||
await asyncio.sleep(1)
|
||||
try:
|
||||
# After entering the main page of Xiaohongshu,
|
||||
# the login window may not pop up automatically and you need to manually click the login button.
|
||||
login_button_ele = await self.context_page.wait_for_selector(
|
||||
selector="xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button",
|
||||
timeout=5000
|
||||
)
|
||||
await login_button_ele.click()
|
||||
|
||||
# There are also two types of login dialog boxes for pop-ups.
|
||||
# One type directly shows the phone number and verification code.
|
||||
# Another type requires clicking to switch to mobile login.
|
||||
element = await self.context_page.wait_for_selector(
|
||||
selector='xpath=//div[@class="login-container"]//div[@class="other-method"]/div[1]',
|
||||
timeout=5000
|
||||
)
|
||||
await element.click()
|
||||
except:
|
||||
print("have not found mobile button icon and keep going ...")
|
||||
await asyncio.sleep(1)
|
||||
|
||||
login_container_ele = await self.context_page.wait_for_selector("div.login-container")
|
||||
# Fill login phone
|
||||
input_ele = await login_container_ele.query_selector("label.phone > input")
|
||||
await input_ele.fill(self.login_phone)
|
||||
await asyncio.sleep(0.5)
|
||||
|
||||
# Click to send verification code and fill it from redis server.
|
||||
send_btn_ele = await login_container_ele.query_selector("label.auth-code > span")
|
||||
await send_btn_ele.click()
|
||||
sms_code_input_ele = await login_container_ele.query_selector("label.auth-code > input")
|
||||
submit_btn_ele = await login_container_ele.query_selector("div.input-container > button")
|
||||
redis_obj = aioredis.from_url(url=config.redis_db_host, password=config.redis_db_pwd, decode_responses=True)
|
||||
max_get_sms_code_time = 60 * 2
|
||||
current_cookie = await self.browser_context.cookies()
|
||||
_, cookie_dict = utils.convert_cookies(current_cookie)
|
||||
no_logged_in_session = cookie_dict.get("web_session")
|
||||
while max_get_sms_code_time > 0:
|
||||
print(f"get sms code from redis remaining time {max_get_sms_code_time}s ...")
|
||||
await asyncio.sleep(1)
|
||||
sms_code_key = f"xhs_{self.login_phone}"
|
||||
sms_code_value = await redis_obj.get(sms_code_key)
|
||||
if not sms_code_value:
|
||||
max_get_sms_code_time -= 1
|
||||
continue
|
||||
|
||||
await sms_code_input_ele.fill(value=sms_code_value) # Enter SMS verification code.
|
||||
await asyncio.sleep(0.5)
|
||||
agree_privacy_ele = self.context_page.locator("xpath=//div[@class='agreements']//*[local-name()='svg']")
|
||||
await agree_privacy_ele.click() # Click "Agree" to the privacy policy.
|
||||
await asyncio.sleep(0.5)
|
||||
|
||||
await submit_btn_ele.click() # Click login button
|
||||
# todo ... It is necessary to check the correctness of the verification code,
|
||||
# as it is possible that the entered verification code is incorrect.
|
||||
break
|
||||
|
||||
login_flag: bool = await self.check_login_state(no_logged_in_session)
|
||||
if not login_flag:
|
||||
print("login failed please confirm sms code ...")
|
||||
sys.exit()
|
||||
|
||||
wait_redirect_seconds = 5
|
||||
print(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
|
||||
await asyncio.sleep(wait_redirect_seconds)
|
||||
|
||||
async def login_by_qrcode(self):
|
||||
"""login xiaohongshu website and keep webdriver login state"""
|
||||
print("Start scanning QR code to log in to Xiaohongshu. ...")
|
||||
qrcode_img_selector = "xpath=//img[@class='qrcode-img']"
|
||||
|
||||
# find login qrcode
|
||||
base64_qrcode_img = await utils.find_login_qrcode(
|
||||
self.context_page,
|
||||
selector=qrcode_img_selector
|
||||
)
|
||||
if not base64_qrcode_img:
|
||||
print("have not found qrcode and try again get it ....")
|
||||
# if this website does not automatically popup login dialog box, we will manual click login button
|
||||
login_button_ele = self.context_page.locator("xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button")
|
||||
await login_button_ele.click()
|
||||
base64_qrcode_img = await utils.find_login_qrcode(
|
||||
self.context_page,
|
||||
selector=qrcode_img_selector
|
||||
)
|
||||
if not base64_qrcode_img:
|
||||
print("login failed , program exit ...")
|
||||
sys.exit()
|
||||
|
||||
# get not logged session
|
||||
current_cookie = await self.browser_context.cookies()
|
||||
_, cookie_dict = utils.convert_cookies(current_cookie)
|
||||
no_logged_in_session = cookie_dict.get("web_session")
|
||||
|
||||
# show login qrcode
|
||||
utils.show_qrcode(base64_qrcode_img)
|
||||
print(f"waiting for scan code login, remaining time is 20s")
|
||||
login_flag: bool = await self.check_login_state(no_logged_in_session)
|
||||
if not login_flag:
|
||||
print("login failed please confirm ...")
|
||||
sys.exit()
|
||||
|
||||
wait_redirect_seconds = 5
|
||||
print(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
|
||||
await asyncio.sleep(wait_redirect_seconds)
|
||||
|
||||
@retry(stop=stop_after_attempt(30), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
|
||||
async def check_login_state(self, no_logged_in_session: str) -> bool:
|
||||
"""Check if the current login status is successful and return True otherwise return False"""
|
||||
# If login is unsuccessful, a retry exception will be thrown.
|
||||
current_cookie = await self.browser_context.cookies()
|
||||
_, cookie_dict = utils.convert_cookies(current_cookie)
|
||||
current_web_session = cookie_dict.get("web_session")
|
||||
if current_web_session != no_logged_in_session:
|
||||
return True
|
||||
return False
|
||||
async def close(self):
|
||||
await self.browser_context.close()
|
||||
await self.browser_context.close()
|
||||
logging.info("Browser context closed ...")
|
||||
|
||||
async def search_posts(self):
|
||||
print("Begin search xiaohongshu keywords")
|
||||
# It is possible to modify the source code to allow for the passing of a batch of keywords.
|
||||
for keyword in [self.keywords]:
|
||||
logging.info("Begin search xiaohongshu keywords")
|
||||
for keyword in config.KEYWORDS.split(","):
|
||||
logging.info(f"Current keyword: {keyword}")
|
||||
note_list: List[str] = []
|
||||
max_note_len = 10
|
||||
page = 1
|
||||
|
@ -253,7 +133,7 @@ class XiaoHongShuCrawler(Crawler):
|
|||
await xhs_model.update_xhs_note(note_detail)
|
||||
await asyncio.sleep(0.05)
|
||||
note_list.append(note_id)
|
||||
print(f"keyword:{keyword}, note_list:{note_list}")
|
||||
logging.info(f"keyword:{keyword}, note_list:{note_list}")
|
||||
await self.batch_get_note_comments(note_list)
|
||||
|
||||
async def batch_get_note_comments(self, note_list: List[str]):
|
||||
|
@ -264,7 +144,7 @@ class XiaoHongShuCrawler(Crawler):
|
|||
await asyncio.wait(task_list)
|
||||
|
||||
async def get_comments(self, note_id: str):
|
||||
print("Begin get note id comments ", note_id)
|
||||
logging.info(f"Begin get note id comments {note_id}")
|
||||
all_comments = await self.xhs_client.get_note_all_comments(note_id=note_id, crawl_interval=random.random())
|
||||
for comment in all_comments:
|
||||
await xhs_model.update_xhs_note_comment(note_id=note_id, comment_item=comment)
|
||||
|
|
|
@ -0,0 +1,168 @@
|
|||
import sys
|
||||
import asyncio
|
||||
import logging
|
||||
|
||||
import aioredis
|
||||
from tenacity import (
|
||||
retry,
|
||||
stop_after_attempt,
|
||||
wait_fixed,
|
||||
retry_if_result
|
||||
)
|
||||
from playwright.async_api import Page
|
||||
from playwright.async_api import BrowserContext
|
||||
|
||||
import config
|
||||
from tools import utils
|
||||
from base.base_crawler import AbstractLogin
|
||||
|
||||
|
||||
class XHSLogin(AbstractLogin):
|
||||
|
||||
    def __init__(self,
                 login_type: str,
                 browser_context: BrowserContext,
                 context_page: Page,
                 login_phone: str = None,
                 cookie_str: str = None
                 ):
        # "qrcode" | "phone" | "cookies" — selects the flow in begin()
        self.login_type = login_type
        self.browser_context = browser_context
        self.context_page = context_page
        self.login_phone = login_phone    # used only by the phone/SMS flow
        self.cookie_str = cookie_str      # used only by the cookie flow
|
||||
|
||||
@retry(stop=stop_after_attempt(20), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
|
||||
async def check_login_state(self, no_logged_in_session: str) -> bool:
|
||||
"""Check if the current login status is successful and return True otherwise return False"""
|
||||
current_cookie = await self.browser_context.cookies()
|
||||
_, cookie_dict = utils.convert_cookies(current_cookie)
|
||||
current_web_session = cookie_dict.get("web_session")
|
||||
if current_web_session != no_logged_in_session:
|
||||
return True
|
||||
return False
|
||||
|
||||
async def begin(self):
|
||||
if self.login_type == "qrcode":
|
||||
await self.login_by_qrcode()
|
||||
elif self.login_type == "phone":
|
||||
await self.login_by_mobile()
|
||||
elif self.login_type == "cookies":
|
||||
await self.login_by_cookies()
|
||||
else:
|
||||
raise ValueError("Invalid Login Type Currently only supported qrcode or phone or cookies ...")
|
||||
|
||||
async def login_by_mobile(self):
|
||||
logging.info("Begin login xiaohongshu by mobile ...")
|
||||
await asyncio.sleep(1)
|
||||
try:
|
||||
# 小红书进入首页后,有可能不会自动弹出登录框,需要手动点击登录按钮
|
||||
login_button_ele = await self.context_page.wait_for_selector(
|
||||
selector="xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button",
|
||||
timeout=5000
|
||||
)
|
||||
await login_button_ele.click()
|
||||
# 弹窗的登录对话框也有两种形态,一种是直接可以看到手机号和验证码的
|
||||
# 另一种是需要点击切换到手机登录的
|
||||
element = await self.context_page.wait_for_selector(
|
||||
selector='xpath=//div[@class="login-container"]//div[@class="other-method"]/div[1]',
|
||||
timeout=5000
|
||||
)
|
||||
await element.click()
|
||||
except Exception as e:
|
||||
logging.info("have not found mobile button icon and keep going ...")
|
||||
await asyncio.sleep(1)
|
||||
login_container_ele = await self.context_page.wait_for_selector("div.login-container")
|
||||
input_ele = await login_container_ele.query_selector("label.phone > input")
|
||||
await input_ele.fill(self.login_phone)
|
||||
await asyncio.sleep(0.5)
|
||||
send_btn_ele = await login_container_ele.query_selector("label.auth-code > span")
|
||||
await send_btn_ele.click() # 点击发送验证码
|
||||
sms_code_input_ele = await login_container_ele.query_selector("label.auth-code > input")
|
||||
submit_btn_ele = await login_container_ele.query_selector("div.input-container > button")
|
||||
redis_obj = aioredis.from_url(url=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD, decode_responses=True)
|
||||
max_get_sms_code_time = 60 * 2 # 最长获取验证码的时间为2分钟
|
||||
no_logged_in_session = ""
|
||||
while max_get_sms_code_time > 0:
|
||||
logging.info(f"get sms code from redis remaining time {max_get_sms_code_time}s ...")
|
||||
await asyncio.sleep(1)
|
||||
sms_code_key = f"xhs_{self.login_phone}"
|
||||
sms_code_value = await redis_obj.get(sms_code_key)
|
||||
if not sms_code_value:
|
||||
max_get_sms_code_time -= 1
|
||||
continue
|
||||
|
||||
current_cookie = await self.browser_context.cookies()
|
||||
_, cookie_dict = utils.convert_cookies(current_cookie)
|
||||
no_logged_in_session = cookie_dict.get("web_session")
|
||||
|
||||
await sms_code_input_ele.fill(value=sms_code_value) # 输入短信验证码
|
||||
await asyncio.sleep(0.5)
|
||||
agree_privacy_ele = self.context_page.locator("xpath=//div[@class='agreements']//*[local-name()='svg']")
|
||||
await agree_privacy_ele.click() # 点击同意隐私协议
|
||||
await asyncio.sleep(0.5)
|
||||
|
||||
await submit_btn_ele.click() # 点击登录
|
||||
|
||||
# todo ... 应该还需要检查验证码的正确性有可能输入的验证码不正确
|
||||
break
|
||||
|
||||
login_flag: bool = await self.check_login_state(no_logged_in_session)
|
||||
if not login_flag:
|
||||
logging.info("login failed please confirm ...")
|
||||
sys.exit()
|
||||
|
||||
wait_redirect_seconds = 5
|
||||
logging.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
|
||||
await asyncio.sleep(wait_redirect_seconds)
|
||||
|
||||
async def login_by_qrcode(self):
|
||||
"""login xiaohongshu website and keep webdriver login state"""
|
||||
logging.info("Begin login xiaohongshu by qrcode ...")
|
||||
await asyncio.sleep(10)
|
||||
# login_selector = "div.login-container > div.left > div.qrcode > img"
|
||||
qrcode_img_selector = "xpath=//img[@class='qrcode-img']"
|
||||
# find login qrcode
|
||||
base64_qrcode_img = await utils.find_login_qrcode(
|
||||
self.context_page,
|
||||
selector=qrcode_img_selector
|
||||
)
|
||||
if not base64_qrcode_img:
|
||||
logging.info("login failed , have not found qrcode please check ....")
|
||||
# if this website does not automatically popup login dialog box, we will manual click login button
|
||||
await asyncio.sleep(0.5)
|
||||
login_button_ele = self.context_page.locator("xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button")
|
||||
await login_button_ele.click()
|
||||
base64_qrcode_img = await utils.find_login_qrcode(
|
||||
self.context_page,
|
||||
selector=qrcode_img_selector
|
||||
)
|
||||
if not base64_qrcode_img:
|
||||
sys.exit()
|
||||
|
||||
# get not logged session
|
||||
current_cookie = await self.browser_context.cookies()
|
||||
_, cookie_dict = utils.convert_cookies(current_cookie)
|
||||
no_logged_in_session = cookie_dict.get("web_session")
|
||||
|
||||
# show login qrcode
|
||||
utils.show_qrcode(base64_qrcode_img)
|
||||
logging.info(f"waiting for scan code login, remaining time is 20s")
|
||||
login_flag: bool = await self.check_login_state(no_logged_in_session)
|
||||
if not login_flag:
|
||||
logging.info("login failed please confirm ...")
|
||||
sys.exit()
|
||||
|
||||
wait_redirect_seconds = 5
|
||||
logging.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
|
||||
await asyncio.sleep(wait_redirect_seconds)
|
||||
|
||||
async def login_by_cookies(self):
|
||||
logging.info("Begin login xiaohongshu by cookie ...")
|
||||
for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
|
||||
await self.browser_context.add_cookies([{
|
||||
'name': key,
|
||||
'value': value,
|
||||
'domain': ".xiaohongshu.com",
|
||||
'path': "/"
|
||||
}])
|
|
@ -1,7 +1,7 @@
|
|||
import json
|
||||
from typing import Dict, List
|
||||
|
||||
import utils
|
||||
from tools import utils
|
||||
|
||||
|
||||
async def update_douyin_aweme(aweme_item: Dict):
|
||||
|
@ -24,7 +24,7 @@ async def update_douyin_aweme(aweme_item: Dict):
|
|||
"last_modify_ts": utils.get_current_timestamp(),
|
||||
}
|
||||
# do something ...
|
||||
print(f"update douyin aweme id:{aweme_id}, title:{local_db_item.get('title')}")
|
||||
print(f"douyin aweme id:{aweme_id}, title:{local_db_item.get('title')}")
|
||||
|
||||
|
||||
async def batch_update_dy_aweme_comments(aweme_id: str, comments: List[Dict]):
|
||||
|
@ -61,4 +61,4 @@ async def update_dy_aweme_comment(aweme_id: str, comment_item: Dict):
|
|||
"last_modify_ts": utils.get_current_timestamp(),
|
||||
}
|
||||
# do something ...
|
||||
print(f"update aweme comment: {comment_id}, content: {local_db_item.get('content')}")
|
||||
print(f"douyin aweme comment: {comment_id}, content: {local_db_item.get('content')}")
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
from typing import Dict
|
||||
|
||||
import utils
|
||||
from tools import utils
|
||||
|
||||
|
||||
async def update_xhs_note(note_item: Dict):
|
||||
|
@ -24,7 +24,7 @@ async def update_xhs_note(note_item: Dict):
|
|||
"last_modify_ts": utils.get_current_timestamp(),
|
||||
}
|
||||
# do something ...
|
||||
print("update note:", local_db_item)
|
||||
print("xhs note:", local_db_item)
|
||||
|
||||
|
||||
async def update_xhs_note_comment(note_id: str, comment_item: Dict):
|
||||
|
@ -43,4 +43,4 @@ async def update_xhs_note_comment(note_id: str, comment_item: Dict):
|
|||
"last_modify_ts": utils.get_current_timestamp(),
|
||||
}
|
||||
# do something ...
|
||||
print("update comment:", local_db_item)
|
||||
print("xhs note comment:", local_db_item)
|
||||
|
|
|
@ -2,6 +2,7 @@ import re
|
|||
import time
|
||||
import random
|
||||
import base64
|
||||
import logging
|
||||
from io import BytesIO
|
||||
from typing import Optional, Dict, List, Tuple
|
||||
|
||||
|
@ -91,3 +92,13 @@ def match_interact_info_count(count_str: str) -> int:
|
|||
return int(number)
|
||||
else:
|
||||
return 0
|
||||
|
||||
|
||||
def init_loging_config():
    """Configure process-wide logging and return the crawler's named logger.

    Function name is kept as-is (``loging`` typo) for backward compatibility
    with existing callers.

    :return: the ``logging.Logger`` named "Media Crawler".
    """
    level = logging.INFO
    logging.basicConfig(
        level=level,
        format="%(asctime)s %(name)s %(levelname)s %(message)s ",
        datefmt='%Y-%m-%d %H:%M:%S'
    )
    # The original called ``logging.Logger("Media Crawler")``, which constructs
    # an unregistered Logger and discards it — a no-op. Use getLogger to
    # register/fetch the named logger and return it; callers that ignored the
    # previous None return are unaffected.
    return logging.getLogger("Media Crawler")
|
Loading…
Reference in New Issue