From b8093a2c0f58ca57f1f6cb5390563f7220ed26b3 Mon Sep 17 00:00:00 2001
From: Relakkes
Date: Tue, 27 Jun 2023 23:38:30 +0800
Subject: [PATCH] refactor: tidy up parts of the code; feat: add IP proxy and
 account pool
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md                      |   7 +-
 base/__init__.py               |   0
 base/base_crawler.py           |  41 ++++
 base/proxy_account_pool.py     | 130 ++++++++++
 base_crawler.py                |  23 --
 config.py                      |  14 --
 config/__init__.py             |   2 +
 config/account_config.py       |  27 +++
 config/base_config.py          |  19 ++
 main.py                        |  33 ++-
 media_platform/douyin/core.py  |  70 ++++--
 media_platform/douyin/login.py |  86 +++++++
 media_platform/xhs/core.py     | 226 ++++--------------
 media_platform/xhs/login.py    | 168 +++++++++++++
 models/douyin/m_douyin.py      |   6 +-
 models/xhs/m_xhs.py            |   6 +-
 tools/__init__.py              |   0
 .../recv_sms_notification.py   |   0
 utils.py => tools/utils.py     |  11 +
 19 files changed, 615 insertions(+), 254 deletions(-)
 create mode 100644 base/__init__.py
 create mode 100644 base/base_crawler.py
 create mode 100644 base/proxy_account_pool.py
 delete mode 100644 base_crawler.py
 delete mode 100644 config.py
 create mode 100644 config/__init__.py
 create mode 100644 config/account_config.py
 create mode 100644 config/base_config.py
 create mode 100644 media_platform/douyin/login.py
 create mode 100644 media_platform/xhs/login.py
 create mode 100644 tools/__init__.py
 rename recv_sms_notification.py => tools/recv_sms_notification.py (100%)
 rename utils.py => tools/utils.py (92%)

diff --git a/README.md b/README.md
index 9be1120..47878a2 100644
--- a/README.md
+++ b/README.md
@@ -13,6 +13,7 @@
 - [x] Xiaohongshu: notes and comments
 - [x] Xiaohongshu: QR-code login | phone number + SMS-code login | cookie login
 - [x] Douyin: video and comment crawling
+- [x] IP proxy pool and account pool
 - [ ] To do: Douyin slider captcha

 ## Tech stack
@@ -28,7 +29,7 @@
 2. Install the Playwright browser driver
    `playwright install`
 3. Run the crawler
-   `python main.py --platform xhs --keywords 健身 --lt qrcode`
+   `python main.py --platform xhs --lt qrcode`
 4. Open Xiaohongshu and scan the QR code to log in

 ## Xiaohongshu screenshots
@@ -46,8 +47,8 @@
 - In the SMS-forwarding app, configure the webhook settings: a message template (see recv_sms_notification.py in this project) and an API endpoint that can receive the pushed SMS notification
 - The push API endpoint usually needs a domain name bound to it (an intranet IP also works); I use an intranet tunnel, which binds a free domain to the local web server; tunneling tool: [ngrok](https://ngrok.com/docs/)
 - Install redis and set a password: [redis installation](https://www.cnblogs.com/hunanzp/p/12304622.html)
-- Run `python recv_sms_notification.py` and wait for the SMS forwarder to send HTTP notifications
-- Run the crawler with phone-number login: `python main.py --platform xhs --keywords 健身 --lt phone --phone 13812345678`
+- Run `python tools/recv_sms_notification.py` and wait for the SMS forwarder to send HTTP notifications
+- Run the crawler with phone-number login: `python main.py --platform xhs --lt phone`

 Notes:
 - Xiaohongshu allows only about 10 SMS messages per phone number per day (go easy); sending verification codes has not triggered the slider captcha so far, but it probably will at higher volume
diff --git a/base/__init__.py b/base/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/base/base_crawler.py b/base/base_crawler.py
new file mode 100644
index 0000000..5ea12fe
--- /dev/null
+++ b/base/base_crawler.py
@@ -0,0 +1,41 @@
+from abc import ABC, abstractmethod
+
+
+class AbstractCrawler(ABC):
+    @abstractmethod
+    def init_config(self, **kwargs):
+        pass
+
+    @abstractmethod
+    async def start(self):
+        pass
+
+    @abstractmethod
+    async def search_posts(self):
+        pass
+
+    @abstractmethod
+    async def get_comments(self, item_id: int):
+        pass
+
+
+class AbstractLogin(ABC):
+    @abstractmethod
+    async def begin(self):
+        pass
+
+    @abstractmethod
+    async def check_login_state(self):
+        pass
+
+    @abstractmethod
+    async def login_by_qrcode(self):
+        pass
+
+    @abstractmethod
+    async def login_by_mobile(self):
+        pass
+
+    @abstractmethod
+    async def login_by_cookies(self):
+        pass
diff --git a/base/proxy_account_pool.py b/base/proxy_account_pool.py
new file mode 100644
index 0000000..18b7884
--- /dev/null
+++ b/base/proxy_account_pool.py
@@ -0,0 +1,130 @@
+import config
+
+
+class PhonePool:
+    """phone pool class"""
+
+    def __init__(self):
+        self.phones = []
+        self.used_phones = set()
+
+    def add_phone(self, phone):
+        """add a phone to the pool"""
+        if phone not in self.phones:
+            self.phones.append(phone)
+            return True
+        return False
+
+    def remove_phone(self, phone):
+        """remove a used phone from the pool"""
+        if phone in self.used_phones:
+            self.used_phones.remove(phone)
+            if phone in self.phones:
+                self.phones.remove(phone)
+            return True
+        return False
+
+    def get_phone(self):
+        """get a phone and mark it as used"""
+        if self.phones:
+            left_phone = self.phones.pop(0)
+            self.used_phones.add(left_phone)
+            return left_phone
+        return None
+
+    def clear(self):
+        """clear the phone pool"""
+        self.phones = []
+        self.used_phones = set()
+
+
+class IPPool:
+    def __init__(self):
+        self.ips = []
+        self.used_ips = set()
+
+    def add_ip(self, ip):
+        """add an ip to the pool"""
+        if ip not in self.ips:
+            self.ips.append(ip)
+            return True
+        return False
+
+    def remove_ip(self, ip):
+        """remove a used ip from the pool"""
+        if ip in self.used_ips:
+            self.used_ips.remove(ip)
+            if ip in self.ips:
+                self.ips.remove(ip)
+            return True
+        return False
+
+    def get_ip(self):
+        """get an ip and mark it as used"""
+        if self.ips:
+            left_ips = self.ips.pop(0)
+            self.used_ips.add(left_ips)
+            return left_ips
+        return None
+
+    def clear(self):
+        """clear the ip pool"""
+        self.ips = []
+        self.used_ips = set()
+
+
+class AccountPool:
+    """account pool class"""
+
+    def __init__(self):
+        self.phone_pool = PhonePool()
+        self.ip_pool = IPPool()
+
+    def add_account(self, phone, ip):
+        """add an account (phone + ip) to the pool"""
+        if self.phone_pool.add_phone(phone) and self.ip_pool.add_ip(ip):
+            return True
+        return False
+
+    def remove_account(self, phone, ip):
+        """remove an account from the pool"""
+        if self.phone_pool.remove_phone(phone) and self.ip_pool.remove_ip(ip):
+            return True
+        return False
+
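+    # get_account() hands out one (phone, ip) pair per call; when either pool
+    # runs dry, reload_account_pool() refills both from config and the call
+    # recurses, so the pool cycles endlessly over the configured accounts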
+    def get_account(self):
+        """get an account; if the pool is empty, reload it from config"""
+        phone = self.phone_pool.get_phone()
+        ip = self.ip_pool.get_ip()
+        if not phone or not ip:
+            reload_account_pool(self)
+            return self.get_account()
+        return phone, ip
+
+    def clear_account(self):
+        """clear the account pool"""
+        self.phone_pool.clear()
+        self.ip_pool.clear()
+
+
+def reload_account_pool(apo: AccountPool):
+    """reload the account pool from config"""
+    apo.clear_account()
+    for phone, ip in zip(config.PHONE_LIST, config.IP_PROXY_LIST):
+        apo.add_account(phone, ip)
+
+
+def create_account_pool() -> AccountPool:
+    """create an account pool"""
+    apo = AccountPool()
+    reload_account_pool(apo=apo)
+    return apo
+
+
+if __name__ == '__main__':
+    import time
+
+    ac_pool = create_account_pool()
+    p, i = ac_pool.get_account()
+    while p:
+        print(f"get phone:{p}, ip proxy:{i} from account pool")
+        p, i = ac_pool.get_account()
+        time.sleep(1)
diff --git a/base_crawler.py b/base_crawler.py
deleted file mode 100644
index 5c5f204..0000000
--- a/base_crawler.py
+++ /dev/null
@@ -1,23 +0,0 @@
-from abc import ABC, abstractmethod
-
-
-class Crawler(ABC):
-    @abstractmethod
-    def init_config(self, **kwargs):
-        pass
-
-    @abstractmethod
-    async def start(self):
-        pass
-
-    @abstractmethod
-    async def login(self):
-        pass
-
-    @abstractmethod
-    async def search_posts(self):
-        pass
-
-    @abstractmethod
-    async def get_comments(self, item_id: int):
-        pass
diff --git a/config.py b/config.py
deleted file mode 100644
index dad4eb2..0000000
--- a/config.py
+++ /dev/null
@@ -1,14 +0,0 @@
-# config file
-
-platform = "xhs"
-keyword = "健身"
-login_type = "cookie"  # qrcode or phone or cookie
-login_phone = ""  # your login phone
-
-# If it's on the Xiaohongshu platform, only the web_session cookie will be kept.
-# web_session=040069b2acxxxxxxxxxxxxxxxxxxxx;
-cookies = ""
-
-# redis config
-redis_db_host = "redis://127.0.0.1"
-redis_db_pwd = "123456"  # your redis password
diff --git a/config/__init__.py b/config/__init__.py
new file mode 100644
index 0000000..d77edcc
--- /dev/null
+++ b/config/__init__.py
@@ -0,0 +1,2 @@
+from .base_config import *
+from .account_config import *
diff --git a/config/account_config.py b/config/account_config.py
new file mode 100644
index 0000000..a0f54b2
--- /dev/null
+++ b/config/account_config.py
@@ -0,0 +1,27 @@
+# -*- coding: utf-8 -*-
+# account_config.py
+
+# PHONE_LIST and IP_PROXY_LIST are paired positionally (via zip), so they
+# should have the same length: phone N always uses proxy N.
+PHONE_LIST = [
+    "13012345671",
+    "13012345672",
+    "13012345673",
+    "13012345674",
+    "13012345675",
+    "13012345676",
+    # ...
+]
+
+IP_PROXY_LIST = [
+    "111.122.xx.xx1:8888",
+    "111.122.xx.xx2:8888",
+    "111.122.xx.xx3:8888",
+    "111.122.xx.xx4:8888",
+    "111.122.xx.xx5:8888",
+    "111.122.xx.xx6:8888",
+    # ...
+]
+
+IP_PROXY_PROTOCOL = "http://"
+IP_PROXY_USER = "xxxx"
+IP_PROXY_PASSWORD = "xxxx"
+
diff --git a/config/base_config.py b/config/base_config.py
new file mode 100644
index 0000000..4c1c4b1
--- /dev/null
+++ b/config/base_config.py
@@ -0,0 +1,19 @@
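+# base_config.py: crawler runtime defaults; PLATFORM and LOGIN_TYPE can be
+# overridden from the command line via --platform and --lt in main.py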
+PLATFORM = "xhs"
+KEYWORDS = "健身,旅游"
+LOGIN_TYPE = "qrcode"  # qrcode or phone or cookies
+# On the Xiaohongshu platform, only the web_session cookie needs to be kept.
+# xhs cookie format -> web_session=040069b2acxxxxxxxxxxxxxxxxxxxx;
+COOKIES = ""
+
+# redis config
+REDIS_DB_HOST = "redis://127.0.0.1"  # your redis host
+REDIS_DB_PWD = "123456"  # your redis password
+
+# enable ip proxy
+ENABLE_IP_PROXY = False
+
+# retry interval (seconds)
+RETRY_INTERVAL = 60 * 30  # 30 minutes
+
+# playwright headless
+HEADLESS = True
diff --git a/main.py b/main.py
index 7cad9bd..131b5b4 100644
--- a/main.py
+++ b/main.py
@@ -3,6 +3,8 @@ import asyncio
 import argparse

 import config
+from tools import utils
+from base import proxy_account_pool
 from media_platform.douyin import DouYinCrawler
 from media_platform.xhs import XiaoHongShuCrawler

@@ -19,24 +21,37 @@ class CrawlerFactory:

 async def main():
+    utils.init_logging_config()
     # define command line params ...
     parser = argparse.ArgumentParser(description='Media crawler program.')
-    parser.add_argument('--platform', type=str, help='Media platform select (xhs|dy)...', default=config.platform)
-    parser.add_argument('--keywords', type=str, help='Search note/page keywords...', default=config.keyword)
-    parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)', default=config.login_type)
-    parser.add_argument('--phone', type=str, help='Login phone', default=config.login_phone)
-    parser.add_argument('--cookies', type=str, help='cookies to keep log in', default=config.cookies)
+    parser.add_argument('--platform', type=str, help='Media platform select (xhs|dy)...', default=config.PLATFORM)
+    parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookies)', default=config.LOGIN_TYPE)
+
+    # init account pool
+    account_pool = proxy_account_pool.create_account_pool()

     args = parser.parse_args()
     crawler = CrawlerFactory().create_crawler(platform=args.platform)
     crawler.init_config(
-        keywords=args.keywords,
-        login_phone=args.phone,
-        login_type=args.lt,
-        cookie_str=args.cookies
+        command_args=args,
+        account_pool=account_pool
     )
     await crawler.start()
+    """
+    # retry on exception ...
+    while True:
+        try:
+            await crawler.start()
+        except Exception as e:
+            logging.info(f"crawler start error: {e} ...")
+            await crawler.close()
+            # If an exception occurs,
+            # sleep for a period of time before retrying
+            # to avoid frequent requests that may get the account blocked.
+            await asyncio.sleep(config.RETRY_INTERVAL)
+    """

 if __name__ == '__main__':
     try:
diff --git a/media_platform/douyin/core.py b/media_platform/douyin/core.py
index b8a20c1..126331b 100644
--- a/media_platform/douyin/core.py
+++ b/media_platform/douyin/core.py
@@ -1,42 +1,64 @@
 import logging
 import asyncio
 from asyncio import Task
-from typing import Optional, List, Dict
+from argparse import Namespace
+from typing import Optional, List, Dict, Tuple

 from playwright.async_api import async_playwright
 from playwright.async_api import Page
 from playwright.async_api import Cookie
 from playwright.async_api import BrowserContext

-import utils
+import config
+from tools import utils
 from .client import DOUYINClient
 from .exception import DataFetchError
-from base_crawler import Crawler
+from .login import DouYinLogin
+from base.base_crawler import AbstractCrawler
+from base.proxy_account_pool import AccountPool
 from models import douyin


-class DouYinCrawler(Crawler):
+class DouYinCrawler(AbstractCrawler):
     def __init__(self):
-        self.keywords: Optional[str] = None
         self.cookies: Optional[List[Cookie]] = None
         self.browser_context: Optional[BrowserContext] = None
         self.context_page: Optional[Page] = None
         self.proxy: Optional[Dict] = None
         self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"  # fixed
         self.dy_client: Optional[DOUYINClient] = None
+        self.command_args: Optional[Namespace] = None
+        self.account_pool: Optional[AccountPool] = None

     def init_config(self, **kwargs):
         for key, value in kwargs.items():
             setattr(self, key, value)

+    def create_proxy_info(self) -> Tuple[str, Dict, str]:
+        """Create proxy info for playwright and httpx"""
+        # phone: 13012345671
+        # ip_proxy: 111.122.xx.xx1:8888
+        # the phone number and the IP proxy both come from the account pool and are bound to each other
+        phone, ip_proxy = self.account_pool.get_account()
+        playwright_proxy = {
+            "server": f"{config.IP_PROXY_PROTOCOL}{ip_proxy}",
+            "username": config.IP_PROXY_USER,
+            "password": config.IP_PROXY_PASSWORD,
+        }
+        httpx_proxy = f"{config.IP_PROXY_PROTOCOL}{config.IP_PROXY_USER}:{config.IP_PROXY_PASSWORD}@{ip_proxy}"
+        return phone, playwright_proxy, httpx_proxy
+
     async def start(self):
+        account_phone, playwright_proxy, httpx_proxy = self.create_proxy_info()
+        if not config.ENABLE_IP_PROXY:
+            playwright_proxy, httpx_proxy = None, None
+
         async with async_playwright() as playwright:
             chromium = playwright.chromium
-            browser = await chromium.launch(headless=True)
+            browser = await chromium.launch(headless=True, proxy=playwright_proxy)
             self.browser_context = await browser.new_context(
                 viewport={"width": 1800, "height": 900},
                 user_agent=self.user_agent,
-                proxy=self.proxy
             )
             # execute JS to bypass anti automation/crawler detection
             await self.browser_context.add_init_script(path="libs/stealth.min.js")
@@ -44,14 +66,23 @@ class DouYinCrawler(Crawler):
             await self.context_page.goto("https://www.douyin.com", wait_until="domcontentloaded")
             await asyncio.sleep(3)

-            # scan qrcode login
-            # await self.login()
+            # begin login
+            login_obj = DouYinLogin(
+                login_type=self.command_args.lt,
+                login_phone=account_phone,
+                browser_context=self.browser_context,
+                context_page=self.context_page,
+                cookie_str=config.COOKIES
+            )
+            # await login_obj.begin()
+
+            # update cookies
             await self.update_cookies()

             # init request client
             cookie_str, cookie_dict = utils.convert_cookies(self.cookies)
             self.dy_client = DOUYINClient(
-                proxies=self.proxy,
+                proxies=httpx_proxy,
                 headers={
                     "User-Agent": self.user_agent,
                     "Cookie": cookie_str,
@@ -73,23 +104,10 @@ class DouYinCrawler(Crawler):
     async def update_cookies(self):
         self.cookies = await self.browser_context.cookies()

-    async def login(self):
-        """login douyin website and keep webdriver login state"""
-        print("Begin login douyin ...")
-        # todo ...
-
-    async def check_login_state(self) -> bool:
-        """Check if the current login status is successful and return True otherwise return False"""
-        current_cookie = await self.browser_context.cookies()
-        _, cookie_dict = utils.convert_cookies(current_cookie)
-        if cookie_dict.get("LOGIN_STATUS") == "1":
-            return True
-        return False
-
     async def search_posts(self):
-        # It is possible to modify the source code to allow for the passing of a batch of keywords.
-        for keyword in [self.keywords]:
-            print("Begin search douyin keywords: ", keyword)
+        logging.info("Begin search douyin keywords")
+        for keyword in config.KEYWORDS.split(","):
+            logging.info(f"Current keyword: {keyword}")
             aweme_list: List[str] = []
             max_note_len = 20
             page = 0
diff --git a/media_platform/douyin/login.py b/media_platform/douyin/login.py
new file mode 100644
index 0000000..ac9f83e
--- /dev/null
+++ b/media_platform/douyin/login.py
@@ -0,0 +1,86 @@
+import sys
+import asyncio
+
+from playwright.async_api import Page
+from playwright.async_api import BrowserContext
+
+from tools import utils
+from base.base_crawler import AbstractLogin
+
+
+class DouYinLogin(AbstractLogin):
+    def __init__(self,
+                 login_type: str,
+                 browser_context: BrowserContext,
+                 context_page: Page,
+                 login_phone: str = None,
+                 cookie_str: str = None
+                 ):
+        self.login_type = login_type
+        self.browser_context = browser_context
+        self.context_page = context_page
+        self.login_phone = login_phone
+        self.cookie_str = cookie_str
+        self.scan_qrcode_time = 60
+
+    async def check_login_state(self):
+        """Check whether the current login succeeded: True if logged in, otherwise False"""
+        current_cookie = await self.browser_context.cookies()
+        _, cookie_dict = utils.convert_cookies(current_cookie)
+        if cookie_dict.get("LOGIN_STATUS") == "1":
+            return True
+        return False
+
+    async def login_by_qrcode(self):
+        """login douyin website and keep webdriver login state"""
+        print("Begin login douyin ...")
+        # find login qrcode
+        base64_qrcode_img = await utils.find_login_qrcode(
+            self.context_page,
+            selector="xpath=//article[@class='web-login']//img"
+        )
+        if not base64_qrcode_img:
+            if await self.check_login_state():
+                return
+            # todo: if the site does not pop up the login dialog automatically, click the login button manually
+            print("login failed: qrcode not found, please check ...")
+            sys.exit()
+
+        # show login qrcode
+        utils.show_qrcode(base64_qrcode_img)
+
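+        # poll the browser cookies once per second until LOGIN_STATUS == "1"
+        # appears or the 60-second scan window runs out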
+        while self.scan_qrcode_time > 0:
+            await asyncio.sleep(1)
+            self.scan_qrcode_time -= 1
+            print(f"waiting for scan code login, remaining time is {self.scan_qrcode_time} seconds")
+            # get login state from browser
+            if await self.check_login_state():
+                # If the QR-code login succeeded, wait a moment: there is a second
+                # redirection after a successful login, and executing JS during this
+                # period may run in a Page that has already been destroyed.
+                wait_for_seconds = 5
+                print(f"Login successful then wait for {wait_for_seconds} seconds redirect ...")
+                while wait_for_seconds > 0:
+                    await asyncio.sleep(1)
+                    print(f"remaining wait {wait_for_seconds} seconds ...")
+                    wait_for_seconds -= 1
+                break
+        else:
+            sys.exit()
+
+    async def login_by_mobile(self):
+        # todo: implement login by mobile
+        pass
+
+    async def login_by_cookies(self):
+        # todo: implement cookie login for douyin
+        pass
+
+    async def begin(self):
+        if self.login_type == "qrcode":
+            await self.login_by_qrcode()
+        elif self.login_type == "phone":
+            await self.login_by_mobile()
+        elif self.login_type == "cookies":
+            await self.login_by_cookies()
+        else:
+            raise ValueError("invalid login type; only qrcode, phone, and cookies are supported ...")
diff --git a/media_platform/xhs/core.py b/media_platform/xhs/core.py
index b549ef4..0a46c89 100644
--- a/media_platform/xhs/core.py
+++ b/media_platform/xhs/core.py
@@ -1,43 +1,35 @@
-import sys
 import random
 import asyncio
+import logging
 from asyncio import Task
-from typing import Optional, List, Dict
+from typing import Optional, List, Dict, Tuple
+from argparse import Namespace

-import aioredis
-from tenacity import (
-    retry,
-    stop_after_attempt,
-    wait_fixed,
-    retry_if_result
-)
 from playwright.async_api import Page
 from playwright.async_api import Cookie
 from playwright.async_api import BrowserContext
 from playwright.async_api import async_playwright

-import utils
 import config
-from .client import XHSClient
-from base_crawler import Crawler
-from models import xhs as xhs_model
+from tools import utils
 from .exception import *
+from .login import XHSLogin
+from .client import XHSClient
+from models import xhs as xhs_model
+from base.base_crawler import AbstractCrawler
+from base.proxy_account_pool import AccountPool


-class XiaoHongShuCrawler(Crawler):
+class XiaoHongShuCrawler(AbstractCrawler):
     def __init__(self):
-        self.login_phone = None
-        self.login_type = None
-        self.keywords = None
-        self.web_session = None
         self.cookies: Optional[List[Cookie]] = None  # cookies from browser context
-        self.cookie_str: Optional[str] = None  # cookie string from config or command line
         self.browser_context: Optional[BrowserContext] = None
         self.context_page: Optional[Page] = None
-        self.proxy: Optional[Dict] = None
         self.user_agent = utils.get_user_agent()
         self.xhs_client: Optional[XHSClient] = None
         self.index_url = "https://www.xiaohongshu.com"
+        self.command_args: Optional[Namespace] = None
+        self.account_pool: Optional[AccountPool] = None

     def init_config(self, **kwargs):
         for key in kwargs.keys():
@@ -46,15 +38,32 @@ class XiaoHongShuCrawler(Crawler):
     async def update_cookies(self):
         self.cookies = await self.browser_context.cookies()

+    def create_proxy_info(self) -> Tuple[str, Dict, str]:
+        """Create proxy info for playwright and httpx"""
+        # phone: 13012345671
+        # ip_proxy: 111.122.xx.xx1:8888
+        # the phone number and the IP proxy both come from the account pool and are bound to each other
+        phone, ip_proxy = self.account_pool.get_account()
+        playwright_proxy = {
+            "server": f"{config.IP_PROXY_PROTOCOL}{ip_proxy}",
+            "username": config.IP_PROXY_USER,
+            "password": config.IP_PROXY_PASSWORD,
+        }
+        httpx_proxy = f"{config.IP_PROXY_PROTOCOL}{config.IP_PROXY_USER}:{config.IP_PROXY_PASSWORD}@{ip_proxy}"
+        return phone, playwright_proxy, httpx_proxy
+
     async def start(self):
+        account_phone, playwright_proxy, httpx_proxy = self.create_proxy_info()
+        if not config.ENABLE_IP_PROXY:
+            playwright_proxy, httpx_proxy = None, None
+
         async with async_playwright() as playwright:
             # launch browser and create single browser context
             chromium = playwright.chromium
-            browser = await chromium.launch(headless=True)
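+            # headless mode and the optional proxy are now driven by config
+            # (config.HEADLESS, config.ENABLE_IP_PROXY)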
+            browser = await chromium.launch(headless=config.HEADLESS, proxy=playwright_proxy)
             self.browser_context = await browser.new_context(
                 viewport={"width": 1920, "height": 1080},
-                user_agent=self.user_agent,
-                proxy=self.proxy
+                user_agent=self.user_agent
             )

             # execute JS to bypass anti automation/crawler detection
@@ -62,14 +71,23 @@
             self.context_page = await self.browser_context.new_page()
             await self.context_page.goto(self.index_url)

-            # scan qrcode login
-            await self.login()
+            # begin login
+            login_obj = XHSLogin(
+                login_type=self.command_args.lt,
+                login_phone=account_phone,
+                browser_context=self.browser_context,
+                context_page=self.context_page,
+                cookie_str=config.COOKIES
+            )
+            await login_obj.begin()
+
+            # update cookies
             await self.update_cookies()

             # init request client
             cookie_str, cookie_dict = utils.convert_cookies(self.cookies)
             self.xhs_client = XHSClient(
-                proxies=self.proxy,
+                proxies=httpx_proxy,
                 headers={
                     "User-Agent": self.user_agent,
                     "Cookie": cookie_str,
@@ -87,153 +105,15 @@
             # block main crawler coroutine
             await asyncio.Event().wait()

-    async def login(self):
-        """login xiaohongshu website and keep webdriver login state"""
-        # There are three ways to log in:
-        # 1. Semi-automatic: Log in by scanning the QR code.
-        # 2. Fully automatic: Log in using forwarded text message notifications
-        # 3. Semi-automatic: Log in using preset cookie
-        if self.login_type == "qrcode":
-            await self.login_by_qrcode()
-        elif self.login_type == "phone":
-            await self.login_by_mobile()
-        elif self.login_type == "cookie":
-            # cookie str convert to cookie dict
-            for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
-                await self.browser_context.add_cookies([{
-                    'name': key,
-                    'value': value,
-                    'domain': ".xiaohongshu.com",
-                    'path': "/"
-                }])
-        else:
-            pass
-
-    async def login_by_mobile(self):
-        print("Start executing mobile phone number + verification code login on Xiaohongshu. ...")
-
-        await asyncio.sleep(1)
-        try:
-            # After entering the main page of Xiaohongshu,
-            # the login window may not pop up automatically and you need to manually click the login button.
-            login_button_ele = await self.context_page.wait_for_selector(
-                selector="xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button",
-                timeout=5000
-            )
-            await login_button_ele.click()
-
-            # There are also two types of login dialog boxes for pop-ups.
-            # One type directly shows the phone number and verification code.
-            # Another type requires clicking to switch to mobile login.
-            element = await self.context_page.wait_for_selector(
-                selector='xpath=//div[@class="login-container"]//div[@class="other-method"]/div[1]',
-                timeout=5000
-            )
-            await element.click()
-        except:
-            print("have not found mobile button icon and keep going ...")
-        await asyncio.sleep(1)
-
-        login_container_ele = await self.context_page.wait_for_selector("div.login-container")
-        # Fill login phone
-        input_ele = await login_container_ele.query_selector("label.phone > input")
-        await input_ele.fill(self.login_phone)
-        await asyncio.sleep(0.5)
-
-        # Click to send verification code and fill it from redis server.
-        send_btn_ele = await login_container_ele.query_selector("label.auth-code > span")
-        await send_btn_ele.click()
-        sms_code_input_ele = await login_container_ele.query_selector("label.auth-code > input")
-        submit_btn_ele = await login_container_ele.query_selector("div.input-container > button")
-        redis_obj = aioredis.from_url(url=config.redis_db_host, password=config.redis_db_pwd, decode_responses=True)
-        max_get_sms_code_time = 60 * 2
-        current_cookie = await self.browser_context.cookies()
-        _, cookie_dict = utils.convert_cookies(current_cookie)
-        no_logged_in_session = cookie_dict.get("web_session")
-        while max_get_sms_code_time > 0:
-            print(f"get sms code from redis remaining time {max_get_sms_code_time}s ...")
-            await asyncio.sleep(1)
-            sms_code_key = f"xhs_{self.login_phone}"
-            sms_code_value = await redis_obj.get(sms_code_key)
-            if not sms_code_value:
-                max_get_sms_code_time -= 1
-                continue
-
-            await sms_code_input_ele.fill(value=sms_code_value)  # Enter SMS verification code.
-            await asyncio.sleep(0.5)
-            agree_privacy_ele = self.context_page.locator("xpath=//div[@class='agreements']//*[local-name()='svg']")
-            await agree_privacy_ele.click()  # Click "Agree" to the privacy policy.
-            await asyncio.sleep(0.5)
-
-            await submit_btn_ele.click()  # Click login button
-            # todo ... It is necessary to check the correctness of the verification code,
-            # as it is possible that the entered verification code is incorrect.
-            break
-
-        login_flag: bool = await self.check_login_state(no_logged_in_session)
-        if not login_flag:
-            print("login failed please confirm sms code ...")
-            sys.exit()
-
-        wait_redirect_seconds = 5
-        print(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
-        await asyncio.sleep(wait_redirect_seconds)
-
-    async def login_by_qrcode(self):
-        """login xiaohongshu website and keep webdriver login state"""
-        print("Start scanning QR code to log in to Xiaohongshu. ...")
...") - qrcode_img_selector = "xpath=//img[@class='qrcode-img']" - - # find login qrcode - base64_qrcode_img = await utils.find_login_qrcode( - self.context_page, - selector=qrcode_img_selector - ) - if not base64_qrcode_img: - print("have not found qrcode and try again get it ....") - # if this website does not automatically popup login dialog box, we will manual click login button - login_button_ele = self.context_page.locator("xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button") - await login_button_ele.click() - base64_qrcode_img = await utils.find_login_qrcode( - self.context_page, - selector=qrcode_img_selector - ) - if not base64_qrcode_img: - print("login failed , program exit ...") - sys.exit() - - # get not logged session - current_cookie = await self.browser_context.cookies() - _, cookie_dict = utils.convert_cookies(current_cookie) - no_logged_in_session = cookie_dict.get("web_session") - - # show login qrcode - utils.show_qrcode(base64_qrcode_img) - print(f"waiting for scan code login, remaining time is 20s") - login_flag: bool = await self.check_login_state(no_logged_in_session) - if not login_flag: - print("login failed please confirm ...") - sys.exit() - - wait_redirect_seconds = 5 - print(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...") - await asyncio.sleep(wait_redirect_seconds) - - @retry(stop=stop_after_attempt(30), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False)) - async def check_login_state(self, no_logged_in_session: str) -> bool: - """Check if the current login status is successful and return True otherwise return False""" - # If login is unsuccessful, a retry exception will be thrown. - current_cookie = await self.browser_context.cookies() - _, cookie_dict = utils.convert_cookies(current_cookie) - current_web_session = cookie_dict.get("web_session") - if current_web_session != no_logged_in_session: - return True - return False + async def close(self): + await self.browser_context.close() + await self.browser_context.close() + logging.info("Browser context closed ...") async def search_posts(self): - print("Begin search xiaohongshu keywords") - # It is possible to modify the source code to allow for the passing of a batch of keywords. 
-        for keyword in [self.keywords]:
+        logging.info("Begin search xiaohongshu keywords")
+        for keyword in config.KEYWORDS.split(","):
+            logging.info(f"Current keyword: {keyword}")
             note_list: List[str] = []
             max_note_len = 10
             page = 1
@@ -253,7 +133,7 @@ class XiaoHongShuCrawler(Crawler):
                 await xhs_model.update_xhs_note(note_detail)
                 await asyncio.sleep(0.05)
                 note_list.append(note_id)
-            print(f"keyword:{keyword}, note_list:{note_list}")
+            logging.info(f"keyword:{keyword}, note_list:{note_list}")
         await self.batch_get_note_comments(note_list)

     async def batch_get_note_comments(self, note_list: List[str]):
@@ -264,7 +144,7 @@ class XiaoHongShuCrawler(Crawler):
         await asyncio.wait(task_list)

     async def get_comments(self, note_id: str):
-        print("Begin get note id comments ", note_id)
+        logging.info(f"Begin get note id comments {note_id}")
         all_comments = await self.xhs_client.get_note_all_comments(note_id=note_id, crawl_interval=random.random())
         for comment in all_comments:
             await xhs_model.update_xhs_note_comment(note_id=note_id, comment_item=comment)
diff --git a/media_platform/xhs/login.py b/media_platform/xhs/login.py
new file mode 100644
index 0000000..2377916
--- /dev/null
+++ b/media_platform/xhs/login.py
@@ -0,0 +1,168 @@
+import sys
+import asyncio
+import logging
+
+import aioredis
+from tenacity import (
+    retry,
+    stop_after_attempt,
+    wait_fixed,
+    retry_if_result
+)
+from playwright.async_api import Page
+from playwright.async_api import BrowserContext
+
+import config
+from tools import utils
+from base.base_crawler import AbstractLogin
+
+
+class XHSLogin(AbstractLogin):
+
+    def __init__(self,
+                 login_type: str,
+                 browser_context: BrowserContext,
+                 context_page: Page,
+                 login_phone: str = None,
+                 cookie_str: str = None
+                 ):
+        self.login_type = login_type
+        self.browser_context = browser_context
+        self.context_page = context_page
+        self.login_phone = login_phone
+        self.cookie_str = cookie_str
+
+    @retry(stop=stop_after_attempt(20), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
+    async def check_login_state(self, no_logged_in_session: str) -> bool:
+        """Check whether login succeeded: True once web_session changes, otherwise False (which triggers a retry)"""
+        current_cookie = await self.browser_context.cookies()
+        _, cookie_dict = utils.convert_cookies(current_cookie)
+        current_web_session = cookie_dict.get("web_session")
+        if current_web_session != no_logged_in_session:
+            return True
+        return False
+
+    async def begin(self):
+        if self.login_type == "qrcode":
+            await self.login_by_qrcode()
+        elif self.login_type == "phone":
+            await self.login_by_mobile()
+        elif self.login_type == "cookies":
+            await self.login_by_cookies()
+        else:
+            raise ValueError("invalid login type; only qrcode, phone, and cookies are supported ...")
+
+    async def login_by_mobile(self):
+        logging.info("Begin login xiaohongshu by mobile ...")
+        await asyncio.sleep(1)
+        try:
+            # the login dialog may not pop up automatically after landing on the home page,
+            # so the login button has to be clicked manually
+            login_button_ele = await self.context_page.wait_for_selector(
+                selector="xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button",
+                timeout=5000
+            )
+            await login_button_ele.click()
+            # the login dialog itself comes in two forms: one shows the phone and code
+            # inputs directly, the other needs a click to switch to mobile login
+            element = await self.context_page.wait_for_selector(
+                selector='xpath=//div[@class="login-container"]//div[@class="other-method"]/div[1]',
+                timeout=5000
+            )
+            await element.click()
+        except Exception:
+            logging.info("mobile login button not found, continuing ...")
+        await asyncio.sleep(1)
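+        # tools/recv_sms_notification.py is expected to have written the forwarded
+        # SMS verification code into redis under the key f"xhs_{login_phone}";
+        # the polling loop below waits up to two minutes for it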
+        login_container_ele = await self.context_page.wait_for_selector("div.login-container")
+        input_ele = await login_container_ele.query_selector("label.phone > input")
+        await input_ele.fill(self.login_phone)
+        await asyncio.sleep(0.5)
+        send_btn_ele = await login_container_ele.query_selector("label.auth-code > span")
+        await send_btn_ele.click()  # click to send the verification code
+        sms_code_input_ele = await login_container_ele.query_selector("label.auth-code > input")
+        submit_btn_ele = await login_container_ele.query_selector("div.input-container > button")
+        redis_obj = aioredis.from_url(url=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD, decode_responses=True)
+        max_get_sms_code_time = 60 * 2  # wait at most two minutes for the verification code
+        no_logged_in_session = ""
+        while max_get_sms_code_time > 0:
+            logging.info(f"get sms code from redis remaining time {max_get_sms_code_time}s ...")
+            await asyncio.sleep(1)
+            sms_code_key = f"xhs_{self.login_phone}"
+            sms_code_value = await redis_obj.get(sms_code_key)
+            if not sms_code_value:
+                max_get_sms_code_time -= 1
+                continue
+
+            current_cookie = await self.browser_context.cookies()
+            _, cookie_dict = utils.convert_cookies(current_cookie)
+            no_logged_in_session = cookie_dict.get("web_session")
+
+            await sms_code_input_ele.fill(value=sms_code_value)  # enter the SMS verification code
+            await asyncio.sleep(0.5)
+            agree_privacy_ele = self.context_page.locator("xpath=//div[@class='agreements']//*[local-name()='svg']")
+            await agree_privacy_ele.click()  # agree to the privacy policy
+            await asyncio.sleep(0.5)
+
+            await submit_btn_ele.click()  # click the login button
+
+            # todo: the verification code should also be validated; the entered code may be wrong
+            break
+
+        login_flag: bool = await self.check_login_state(no_logged_in_session)
+        if not login_flag:
+            logging.info("login failed, please confirm the sms code ...")
+            sys.exit()
+
+        wait_redirect_seconds = 5
+        logging.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
+        await asyncio.sleep(wait_redirect_seconds)
+
+    async def login_by_qrcode(self):
+        """login xiaohongshu website and keep webdriver login state"""
+        logging.info("Begin login xiaohongshu by qrcode ...")
+        await asyncio.sleep(10)
+        # login_selector = "div.login-container > div.left > div.qrcode > img"
+        qrcode_img_selector = "xpath=//img[@class='qrcode-img']"
+        # find login qrcode
+        base64_qrcode_img = await utils.find_login_qrcode(
+            self.context_page,
+            selector=qrcode_img_selector
+        )
+        if not base64_qrcode_img:
+            logging.info("login failed: qrcode not found, please check ...")
+            # if the site does not pop up the login dialog automatically, click the login button manually
+            await asyncio.sleep(0.5)
+            login_button_ele = self.context_page.locator("xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button")
+            await login_button_ele.click()
+            base64_qrcode_img = await utils.find_login_qrcode(
+                self.context_page,
+                selector=qrcode_img_selector
+            )
+            if not base64_qrcode_img:
+                sys.exit()
+
+        # get the not-logged-in session
+        current_cookie = await self.browser_context.cookies()
+        _, cookie_dict = utils.convert_cookies(current_cookie)
+        no_logged_in_session = cookie_dict.get("web_session")
+
+        # show login qrcode
+        utils.show_qrcode(base64_qrcode_img)
+        logging.info("waiting for scan code login, remaining time is 20s")
+        login_flag: bool = await self.check_login_state(no_logged_in_session)
+        if not login_flag:
+            logging.info("login failed, please confirm ...")
+            sys.exit()
+
+        wait_redirect_seconds = 5
+        logging.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
+        await asyncio.sleep(wait_redirect_seconds)
+
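+    # login_by_cookies() below replays a saved web_session cookie into the browser
+    # context; see COOKIES in config/base_config.py for the expected format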
+    async def login_by_cookies(self):
+        logging.info("Begin login xiaohongshu by cookie ...")
+        for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
+            await self.browser_context.add_cookies([{
+                'name': key,
+                'value': value,
+                'domain': ".xiaohongshu.com",
+                'path': "/"
+            }])
diff --git a/models/douyin/m_douyin.py b/models/douyin/m_douyin.py
index fd818d9..08c5b36 100644
--- a/models/douyin/m_douyin.py
+++ b/models/douyin/m_douyin.py
@@ -1,7 +1,7 @@
 import json
 from typing import Dict, List

-import utils
+from tools import utils


 async def update_douyin_aweme(aweme_item: Dict):
@@ -24,7 +24,7 @@ async def update_douyin_aweme(aweme_item: Dict):
         "last_modify_ts": utils.get_current_timestamp(),
     }
     # do something ...
-    print(f"update douyin aweme id:{aweme_id}, title:{local_db_item.get('title')}")
+    print(f"douyin aweme id:{aweme_id}, title:{local_db_item.get('title')}")


 async def batch_update_dy_aweme_comments(aweme_id: str, comments: List[Dict]):
@@ -61,4 +61,4 @@ async def update_dy_aweme_comment(aweme_id: str, comment_item: Dict):
         "last_modify_ts": utils.get_current_timestamp(),
     }
     # do something ...
-    print(f"update aweme comment: {comment_id}, content: {local_db_item.get('content')}")
+    print(f"douyin aweme comment: {comment_id}, content: {local_db_item.get('content')}")
diff --git a/models/xhs/m_xhs.py b/models/xhs/m_xhs.py
index 4b564a1..3604aae 100644
--- a/models/xhs/m_xhs.py
+++ b/models/xhs/m_xhs.py
@@ -1,6 +1,6 @@
 from typing import Dict

-import utils
+from tools import utils


 async def update_xhs_note(note_item: Dict):
@@ -24,7 +24,7 @@ async def update_xhs_note(note_item: Dict):
         "last_modify_ts": utils.get_current_timestamp(),
     }
     # do something ...
-    print("update note:", local_db_item)
+    print("xhs note:", local_db_item)


 async def update_xhs_note_comment(note_id: str, comment_item: Dict):
@@ -43,4 +43,4 @@ async def update_xhs_note_comment(note_id: str, comment_item: Dict):
         "last_modify_ts": utils.get_current_timestamp(),
     }
     # do something ...
-    print("update comment:", local_db_item)
+    print("xhs note comment:", local_db_item)
diff --git a/tools/__init__.py b/tools/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/recv_sms_notification.py b/tools/recv_sms_notification.py
similarity index 100%
rename from recv_sms_notification.py
rename to tools/recv_sms_notification.py
diff --git a/utils.py b/tools/utils.py
similarity index 92%
rename from utils.py
rename to tools/utils.py
index 7cddea4..650edd7 100644
--- a/utils.py
+++ b/tools/utils.py
@@ -2,6 +2,7 @@ import re
 import time
 import random
 import base64
+import logging
 from io import BytesIO
 from typing import Optional, Dict, List, Tuple

@@ -91,3 +92,13 @@ def match_interact_info_count(count_str: str) -> int:
         return int(number)
     else:
         return 0
+
+
+def init_logging_config():
+    level = logging.INFO
+    logging.basicConfig(
+        level=level,
+        format="%(asctime)s %(name)s %(levelname)s %(message)s",
+        datefmt='%Y-%m-%d %H:%M:%S'
+    )
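
--
Usage sketch, mirroring create_proxy_info() in media_platform/douyin/core.py
and media_platform/xhs/core.py: the account pool pairs phones and proxies
positionally from config, and both proxy values are dropped when
config.ENABLE_IP_PROXY is off.

    import config
    from base import proxy_account_pool

    pool = proxy_account_pool.create_account_pool()
    phone, ip_proxy = pool.get_account()

    # playwright takes a structured dict, httpx takes a single proxy URL
    playwright_proxy = {
        "server": f"{config.IP_PROXY_PROTOCOL}{ip_proxy}",
        "username": config.IP_PROXY_USER,
        "password": config.IP_PROXY_PASSWORD,
    }
    httpx_proxy = f"{config.IP_PROXY_PROTOCOL}{config.IP_PROXY_USER}:{config.IP_PROXY_PASSWORD}@{ip_proxy}"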