refactor:优化部分代码

feat: 增加IP代理账号池
This commit is contained in:
Relakkes 2023-06-27 23:38:30 +08:00
parent 963d9a16d3
commit b8093a2c0f
19 changed files with 615 additions and 254 deletions

View File

@ -13,6 +13,7 @@
- [x] 小红书 笔记、评论 - [x] 小红书 笔记、评论
- [x] 小红书 二维码扫描登录 | 手机号+验证码自动登录 | cookies登录 - [x] 小红书 二维码扫描登录 | 手机号+验证码自动登录 | cookies登录
- [x] 爬取抖音视频、评论 - [x] 爬取抖音视频、评论
- [x] IP代理账号池
- [ ] To do 抖音滑块 - [ ] To do 抖音滑块
## 技术栈 ## 技术栈
@ -28,7 +29,7 @@
2. 安装playwright浏览器驱动 2. 安装playwright浏览器驱动
`playwright install` `playwright install`
3. 运行爬虫程序 3. 运行爬虫程序
`python main.py --platform xhs --keywords 健身 --lt qrcode` `python main.py --platform xhs --lt qrcode`
4. 打开小红书扫二维码登录 4. 打开小红书扫二维码登录
## 小红书运行截图 ## 小红书运行截图
@ -46,8 +47,8 @@
- 转发软件中配置WEBHOOK相关的信息主要分为 消息模板请查看本项目中的recv_sms_notification.py、一个能push短信通知的API地址 - 转发软件中配置WEBHOOK相关的信息主要分为 消息模板请查看本项目中的recv_sms_notification.py、一个能push短信通知的API地址
- push的API地址一般是需要绑定一个域名的当然也可以是内网的IP地址我用的是内网穿透方式会有一个免费的域名绑定到内网的web server内网穿透工具 [ngrok](https://ngrok.com/docs/) - push的API地址一般是需要绑定一个域名的当然也可以是内网的IP地址我用的是内网穿透方式会有一个免费的域名绑定到内网的web server内网穿透工具 [ngrok](https://ngrok.com/docs/)
- 安装redis并设置一个密码 [redis安装](https://www.cnblogs.com/hunanzp/p/12304622.html) - 安装redis并设置一个密码 [redis安装](https://www.cnblogs.com/hunanzp/p/12304622.html)
- 执行 `python recv_sms_notification.py` 等待短信转发器发送HTTP通知 - 执行 `python tools/recv_sms_notification.py` 等待短信转发器发送HTTP通知
- 执行手机号登录的爬虫程序 `python main.py --platform xhs --keywords 健身 --lt phone --phone 13812345678` - 执行手机号登录的爬虫程序 `python main.py --platform xhs --lt phone`
备注: 备注:
- 小红书这边一个手机号一天只能发10条短信悠着点目前在发验证码时还未触发滑块验证估计多了之后也会有~ - 小红书这边一个手机号一天只能发10条短信悠着点目前在发验证码时还未触发滑块验证估计多了之后也会有~

0
base/__init__.py Normal file
View File

41
base/base_crawler.py Normal file
View File

@ -0,0 +1,41 @@
from abc import ABC, abstractmethod
class AbstractCrawler(ABC):
    """Interface that every media-platform crawler must implement."""

    @abstractmethod
    def init_config(self, **kwargs):
        """Inject runtime configuration (command-line args, account pool, ...)."""

    @abstractmethod
    async def start(self):
        """Launch the crawler: browser setup, login, then crawling."""

    @abstractmethod
    async def search_posts(self):
        """Search posts/notes for the configured keywords."""

    @abstractmethod
    async def get_comments(self, item_id: int):
        """Fetch the comments of a single post identified by *item_id*."""
class AbstractLogin(ABC):
    """Interface for platform login flows (qrcode / mobile / cookies)."""

    @abstractmethod
    async def begin(self):
        """Entry point: dispatch to the configured login method."""

    @abstractmethod
    async def check_login_state(self):
        """Report whether the browser session is currently logged in."""

    @abstractmethod
    async def login_by_qrcode(self):
        """Log in by showing a QR code for the user to scan."""

    @abstractmethod
    async def login_by_mobile(self):
        """Log in with phone number + SMS verification code."""

    @abstractmethod
    async def login_by_cookies(self):
        """Log in by injecting pre-saved cookies."""

130
base/proxy_account_pool.py Normal file
View File

@ -0,0 +1,130 @@
import config
class PhonePool:
    """Pool of login phone numbers with used/unused tracking.

    ``phones`` holds numbers waiting to be handed out (FIFO);
    ``used_phones`` holds numbers already returned by :meth:`get_phone`.
    """

    def __init__(self):
        self.phones = []          # phones not yet handed out (FIFO order)
        self.used_phones = set()  # phones already handed out

    def add_phone(self, phone):
        """Add *phone* to the pool; return True if it was newly added."""
        if phone not in self.phones:
            self.phones.append(phone)
            return True
        return False

    def remove_phone(self, phone):
        """Remove a used phone from the pool entirely.

        Returns True when *phone* was marked used and is now forgotten.
        Bug fix: ``get_phone`` already popped the phone out of
        ``self.phones``, so the old unconditional ``self.phones.remove``
        raised ValueError; remove from each container only if present.
        """
        if phone in self.used_phones:
            self.used_phones.remove(phone)
            if phone in self.phones:
                self.phones.remove(phone)
            return True
        return False

    def get_phone(self):
        """Pop the oldest unused phone and mark it used; None when empty."""
        if self.phones:
            left_phone = self.phones.pop(0)
            self.used_phones.add(left_phone)
            return left_phone
        return None

    def clear(self):
        """Drop all phones, used and unused."""
        self.phones = []
        self.used_phones = set()
class IPPool:
    """Pool of proxy IPs ("host:port") with used/unused tracking."""

    def __init__(self):
        self.ips = []          # proxies not yet handed out (FIFO order)
        self.used_ips = set()  # proxies already handed out

    def add_ip(self, ip):
        """Add *ip* to the pool; return True if it was newly added."""
        if ip not in self.ips:
            self.ips.append(ip)
            return True
        return False

    def remove_ip(self, ip):
        """Remove a used IP from the pool entirely.

        Returns True when *ip* was marked used and is now forgotten.
        Bug fix: ``get_ip`` already popped the IP out of ``self.ips``,
        so the old unconditional ``self.ips.remove`` raised ValueError;
        remove from each container only if present.
        """
        if ip in self.used_ips:
            self.used_ips.remove(ip)
            if ip in self.ips:
                self.ips.remove(ip)
            return True
        return False

    def get_ip(self):
        """Pop the oldest unused IP and mark it used; None when empty."""
        if self.ips:
            left_ip = self.ips.pop(0)
            self.used_ips.add(left_ip)
            return left_ip
        return None

    def clear(self):
        """Drop all IPs, used and unused."""
        self.ips = []
        self.used_ips = set()
class AccountPool:
    """Pairs a PhonePool with an IPPool so each phone stays bound to one proxy IP.

    Phones and IPs are added and consumed in lockstep, which keeps the
    positional phone/proxy binding from the config lists intact.
    """

    def __init__(self):
        self.phone_pool = PhonePool()
        self.ip_pool = IPPool()

    def add_account(self, phone, ip):
        """Add a (phone, ip) pair; True only when the phone was newly added
        and (short-circuit) the ip was newly added too."""
        return self.phone_pool.add_phone(phone) and self.ip_pool.add_ip(ip)

    def remove_account(self, phone, ip):
        """Remove a (phone, ip) pair; True only when the phone removal
        succeeded and (short-circuit) the ip removal succeeded too."""
        return self.phone_pool.remove_phone(phone) and self.ip_pool.remove_ip(ip)

    def get_account(self):
        """Return one (phone, ip) pair, refilling from config when either
        pool runs dry (delegates to module-level reload_account_pool)."""
        phone = self.phone_pool.get_phone()
        ip = self.ip_pool.get_ip()
        if phone and ip:
            return phone, ip
        # Pool exhausted: reload everything from config and try again.
        reload_account_pool(self)
        return self.get_account()

    def clear_account(self):
        """Empty both underlying pools."""
        self.phone_pool.clear()
        self.ip_pool.clear()
def reload_account_pool(apo: AccountPool):
    """Reset *apo* and repopulate it from the configured phone/proxy lists.

    Phones and proxies are zipped positionally, so entry N of
    config.PHONE_LIST is bound to entry N of config.IP_PROXY_LIST.
    """
    apo.clear_account()
    for phone_number, proxy_ip in zip(config.PHONE_LIST, config.IP_PROXY_LIST):
        apo.add_account(phone_number, proxy_ip)
def create_account_pool() -> AccountPool:
    """Build a fresh AccountPool pre-loaded from the config lists."""
    pool = AccountPool()
    reload_account_pool(apo=pool)
    return pool
if __name__ == '__main__':
    import time

    # Smoke test: repeatedly pull accounts from the pool and print them.
    # NOTE(review): get_account() reloads the pool whenever it runs dry,
    # so `p` never becomes falsy and this loop runs forever — acceptable
    # for a manual demo (stop with Ctrl-C), not for production use.
    ac_pool = create_account_pool()
    p, i = ac_pool.get_account()
    while p:
        print(f"get phone:{p}, ip proxy:{i} from account pool")
        p, i = ac_pool.get_account()
        time.sleep(1)

View File

@ -1,23 +0,0 @@
from abc import ABC, abstractmethod
class Crawler(ABC):
    """Legacy crawler interface (superseded by AbstractCrawler)."""

    @abstractmethod
    def init_config(self, **kwargs):
        """Inject runtime configuration into the crawler."""

    @abstractmethod
    async def start(self):
        """Launch the crawler."""

    @abstractmethod
    async def login(self):
        """Perform the platform login flow."""

    @abstractmethod
    async def search_posts(self):
        """Search posts/notes for the configured keywords."""

    @abstractmethod
    async def get_comments(self, item_id: int):
        """Fetch the comments of a single post identified by *item_id*."""

View File

@ -1,14 +0,0 @@
# config file
platform = "xhs"  # media platform to crawl: "xhs" or "dy"
keyword = "健身"  # default search keyword
login_type = "cookie"  # qrcode or phone or cookie
login_phone = ""  # your login phone
# If it's on the Xiaohongshu platform, only the web_session cookie will be kept.
# web_session=040069b2acxxxxxxxxxxxxxxxxxxxx;
cookies = ""
# redis config (used to receive forwarded SMS verification codes)
redis_db_host = "redis://127.0.0.1"  # redis connection URL
redis_db_pwd = "123456"  # your redis password

2
config/__init__.py Normal file
View File

@ -0,0 +1,2 @@
from .base_config import *
from .account_config import *

27
config/account_config.py Normal file
View File

@ -0,0 +1,27 @@
# -*- coding: utf-8 -*-
# account_config.py
# Login phone numbers for the account pool. Entry N is bound positionally
# to entry N of IP_PROXY_LIST (they are zipped together when the pool is
# loaded), so keep the two lists the same length. Values are placeholders.
PHONE_LIST = [
    "13012345671",
    "13012345672",
    "13012345673",
    "13012345674",
    "13012345675",
    "13012345676",
    # ...
]
# Proxy endpoints in "host:port" form; placeholders — replace with real proxies.
IP_PROXY_LIST = [
    "111.122.xx.xx1:8888",
    "111.122.xx.xx2:8888",
    "111.122.xx.xx3:8888",
    "111.122.xx.xx4:8888",
    "111.122.xx.xx5:8888",
    "111.122.xx.xx6:8888",
    # ...
]
IP_PROXY_PROTOCOL = "http://"  # scheme prefixed to every proxy endpoint
IP_PROXY_USER = "xxxx"  # proxy auth username (placeholder)
IP_PROXY_PASSWORD = "xxxx"  # proxy auth password (placeholder)

19
config/base_config.py Normal file
View File

@ -0,0 +1,19 @@
# Base crawler configuration (defaults; some are overridable on the CLI).
PLATFORM = "xhs"  # target platform: "xhs" (xiaohongshu) or "dy" (douyin)
KEYWORDS = "健身,旅游"  # comma-separated search keywords
LOGIN_TYPE = "qrcode"  # qrcode or phone or cookies
# If it's on the Xiaohongshu platform, only the web_session cookie will be kept.
# xhs cookie format -> web_session=040069b2acxxxxxxxxxxxxxxxxxxxx;
COOKIES = ""
# redis config (used to receive forwarded SMS verification codes)
REDIS_DB_HOST = "redis://127.0.0.1"  # your redis host
REDIS_DB_PWD = "123456"  # your redis password
# enable ip proxy (when False, the crawlers run without any proxy)
ENABLE_IP_PROXY = False
# retry_interval
RETRY_INTERVAL = 60 * 30  # 30 minutes
# playwright headless
HEADLESS = True

33
main.py
View File

@ -3,6 +3,8 @@ import asyncio
import argparse import argparse
import config import config
from tools import utils
from base import proxy_account_pool
from media_platform.douyin import DouYinCrawler from media_platform.douyin import DouYinCrawler
from media_platform.xhs import XiaoHongShuCrawler from media_platform.xhs import XiaoHongShuCrawler
@ -19,24 +21,37 @@ class CrawlerFactory:
async def main(): async def main():
utils.init_loging_config()
# define command line params ... # define command line params ...
parser = argparse.ArgumentParser(description='Media crawler program.') parser = argparse.ArgumentParser(description='Media crawler program.')
parser.add_argument('--platform', type=str, help='Media platform select (xhs|dy)...', default=config.platform) parser.add_argument('--platform', type=str, help='Media platform select (xhs|dy)...', default=config.PLATFORM)
parser.add_argument('--keywords', type=str, help='Search note/page keywords...', default=config.keyword) parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)', default=config.LOGIN_TYPE)
parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)', default=config.login_type)
parser.add_argument('--phone', type=str, help='Login phone', default=config.login_phone) # init account pool
parser.add_argument('--cookies', type=str, help='cookies to keep log in', default=config.cookies) account_pool = proxy_account_pool.create_account_pool()
args = parser.parse_args() args = parser.parse_args()
crawler = CrawlerFactory().create_crawler(platform=args.platform) crawler = CrawlerFactory().create_crawler(platform=args.platform)
crawler.init_config( crawler.init_config(
keywords=args.keywords, command_args=args,
login_phone=args.phone, account_pool=account_pool
login_type=args.lt,
cookie_str=args.cookies
) )
await crawler.start() await crawler.start()
"""
# retry when exception ...
while True:
try:
await crawler.start()
except Exception as e:
logging.info(f"crawler start error: {e} ...")
await crawler.close()
# If you encounter an exception
# sleep for a period of time before retrying
# to avoid frequent requests that may result in the account being blocked.
await asyncio.sleep(config.RETRY_INTERVAL)
"""
if __name__ == '__main__': if __name__ == '__main__':
try: try:

View File

@ -1,42 +1,64 @@
import logging import logging
import asyncio import asyncio
from asyncio import Task from asyncio import Task
from typing import Optional, List, Dict from argparse import Namespace
from typing import Optional, List, Dict, Tuple
from playwright.async_api import async_playwright from playwright.async_api import async_playwright
from playwright.async_api import Page from playwright.async_api import Page
from playwright.async_api import Cookie from playwright.async_api import Cookie
from playwright.async_api import BrowserContext from playwright.async_api import BrowserContext
import utils import config
from tools import utils
from .client import DOUYINClient from .client import DOUYINClient
from .exception import DataFetchError from .exception import DataFetchError
from base_crawler import Crawler from .login import DouYinLogin
from base.base_crawler import AbstractCrawler
from base.proxy_account_pool import AccountPool
from models import douyin from models import douyin
class DouYinCrawler(Crawler): class DouYinCrawler(AbstractCrawler):
def __init__(self): def __init__(self):
self.keywords: Optional[str] = None
self.cookies: Optional[List[Cookie]] = None self.cookies: Optional[List[Cookie]] = None
self.browser_context: Optional[BrowserContext] = None self.browser_context: Optional[BrowserContext] = None
self.context_page: Optional[Page] = None self.context_page: Optional[Page] = None
self.proxy: Optional[Dict] = None self.proxy: Optional[Dict] = None
self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36" # fixed self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36" # fixed
self.dy_client: Optional[DOUYINClient] = None self.dy_client: Optional[DOUYINClient] = None
self.command_args: Optional[Namespace] = None
self.account_pool: Optional[AccountPool] = None
def init_config(self, **kwargs): def init_config(self, **kwargs):
for key, value in kwargs.items(): for key, value in kwargs.items():
setattr(self, key, value) setattr(self, key, value)
def create_proxy_info(self) -> Tuple[str, Dict, str]:
"""Create proxy info for playwright and httpx"""
# phone: 13012345671
# ip_proxy: 111.122.xx.xx1:8888
# 手机号和IP代理都是从账号池中获取的并且它们是固定绑定的
phone, ip_proxy = self.account_pool.get_account()
playwright_proxy = {
"server": f"{config.IP_PROXY_PROTOCOL}{ip_proxy}",
"username": config.IP_PROXY_USER,
"password": config.IP_PROXY_PASSWORD,
}
httpx_proxy = f"{config.IP_PROXY_PROTOCOL}{config.IP_PROXY_USER}:{config.IP_PROXY_PASSWORD}@{ip_proxy}"
return phone, playwright_proxy, httpx_proxy
async def start(self): async def start(self):
account_phone, playwright_proxy, httpx_proxy = self.create_proxy_info()
if not config.ENABLE_IP_PROXY:
playwright_proxy, httpx_proxy = None, None
async with async_playwright() as playwright: async with async_playwright() as playwright:
chromium = playwright.chromium chromium = playwright.chromium
browser = await chromium.launch(headless=True) browser = await chromium.launch(headless=True, proxy=playwright_proxy)
self.browser_context = await browser.new_context( self.browser_context = await browser.new_context(
viewport={"width": 1800, "height": 900}, viewport={"width": 1800, "height": 900},
user_agent=self.user_agent, user_agent=self.user_agent,
proxy=self.proxy
) )
# execute JS to bypass anti automation/crawler detection # execute JS to bypass anti automation/crawler detection
await self.browser_context.add_init_script(path="libs/stealth.min.js") await self.browser_context.add_init_script(path="libs/stealth.min.js")
@ -44,14 +66,23 @@ class DouYinCrawler(Crawler):
await self.context_page.goto("https://www.douyin.com", wait_until="domcontentloaded") await self.context_page.goto("https://www.douyin.com", wait_until="domcontentloaded")
await asyncio.sleep(3) await asyncio.sleep(3)
# scan qrcode login # begin login
# await self.login() login_obj = DouYinLogin(
login_type=self.command_args.lt,
login_phone=account_phone,
browser_context=self.browser_context,
context_page=self.context_page,
cookie_str=config.COOKIES
)
# await login_obj.begin()
# update cookies
await self.update_cookies() await self.update_cookies()
# init request client # init request client
cookie_str, cookie_dict = utils.convert_cookies(self.cookies) cookie_str, cookie_dict = utils.convert_cookies(self.cookies)
self.dy_client = DOUYINClient( self.dy_client = DOUYINClient(
proxies=self.proxy, proxies=httpx_proxy,
headers={ headers={
"User-Agent": self.user_agent, "User-Agent": self.user_agent,
"Cookie": cookie_str, "Cookie": cookie_str,
@ -73,23 +104,10 @@ class DouYinCrawler(Crawler):
async def update_cookies(self): async def update_cookies(self):
self.cookies = await self.browser_context.cookies() self.cookies = await self.browser_context.cookies()
async def login(self):
"""login douyin website and keep webdriver login state"""
print("Begin login douyin ...")
# todo ...
async def check_login_state(self) -> bool:
"""Check if the current login status is successful and return True otherwise return False"""
current_cookie = await self.browser_context.cookies()
_, cookie_dict = utils.convert_cookies(current_cookie)
if cookie_dict.get("LOGIN_STATUS") == "1":
return True
return False
async def search_posts(self): async def search_posts(self):
# It is possible to modify the source code to allow for the passing of a batch of keywords. logging.info("Begin search douyin keywords")
for keyword in [self.keywords]: for keyword in config.KEYWORDS.split(","):
print("Begin search douyin keywords: ", keyword) logging.info(f"Current keyword: {keyword}")
aweme_list: List[str] = [] aweme_list: List[str] = []
max_note_len = 20 max_note_len = 20
page = 0 page = 0

View File

@ -0,0 +1,86 @@
import sys
import asyncio
from playwright.async_api import Page
from playwright.async_api import BrowserContext
from tools import utils
from base.base_crawler import AbstractLogin
class DouYinLogin(AbstractLogin):
    """Log in to douyin.com inside an existing Playwright page.

    Supports the qrcode / phone / cookies login types; ``begin()``
    dispatches to the matching flow. phone and cookies login are still
    TODO stubs in this version.
    """

    def __init__(self,
                 login_type: str,
                 browser_context: BrowserContext,
                 context_page: Page,
                 login_phone: str = None,
                 cookie_str: str = None
                 ):
        self.login_type = login_type          # "qrcode" | "phone" | "cookies"
        self.browser_context = browser_context
        self.context_page = context_page
        self.login_phone = login_phone
        self.cookie_str = cookie_str
        self.scan_qrcode_time = 60            # seconds allowed for the QR scan

    async def begin(self):
        """Dispatch to the concrete login flow according to login_type."""
        if self.login_type == "qrcode":
            await self.login_by_qrcode()
        elif self.login_type == "phone":
            await self.login_by_mobile()
        elif self.login_type == "cookies":
            await self.login_by_cookies()
        else:
            # Bug fix: the old message omitted "cookies" even though the
            # dispatcher above supports it (matches the XHS login message).
            raise ValueError("Invalid Login Type Currently only supported qrcode or phone or cookies ...")

    async def check_login_state(self):
        """Check if the current login status is successful and return True otherwise return False"""
        current_cookie = await self.browser_context.cookies()
        _, cookie_dict = utils.convert_cookies(current_cookie)
        if cookie_dict.get("LOGIN_STATUS") == "1":
            return True
        return False

    async def login_by_qrcode(self):
        """Show the login QR code and wait up to scan_qrcode_time seconds for a scan."""
        print("Begin login douyin ...")

        # find login qrcode
        base64_qrcode_img = await utils.find_login_qrcode(
            self.context_page,
            selector="xpath=//article[@class='web-login']//img"
        )
        if not base64_qrcode_img:
            # No QR code visible — maybe the session is already logged in.
            if await self.check_login_state():
                return
            # todo ...if this website does not automatically popup login dialog box, we will manual click login button
            print("login failed , have not found qrcode please check ....")
            sys.exit()

        # show login qrcode
        utils.show_qrcode(base64_qrcode_img)

        while self.scan_qrcode_time > 0:
            await asyncio.sleep(1)
            self.scan_qrcode_time -= 1
            print(f"waiting for scan code login, remaining time is {self.scan_qrcode_time} seconds")
            # get login state from browser
            if await self.check_login_state():
                # If the QR code login is successful, you need to wait for a moment.
                # Because there will be a second redirection after successful login
                # executing JS during this period may be performed in a Page that has already been destroyed.
                wait_for_seconds = 5
                print(f"Login successful then wait for {wait_for_seconds} seconds redirect ...")
                while wait_for_seconds > 0:
                    await asyncio.sleep(1)
                    print(f"remaining wait {wait_for_seconds} seconds ...")
                    wait_for_seconds -= 1
                break
        else:
            # while-else: the countdown expired without a successful scan.
            sys.exit()

    async def login_by_mobile(self):
        # todo implement login by mobile
        pass

    async def login_by_cookies(self):
        # todo implement cookies login for douyin
        pass

View File

@ -1,43 +1,35 @@
import sys
import random import random
import asyncio import asyncio
import logging
from asyncio import Task from asyncio import Task
from typing import Optional, List, Dict from typing import Optional, List, Dict, Tuple
from argparse import Namespace
import aioredis
from tenacity import (
retry,
stop_after_attempt,
wait_fixed,
retry_if_result
)
from playwright.async_api import Page from playwright.async_api import Page
from playwright.async_api import Cookie from playwright.async_api import Cookie
from playwright.async_api import BrowserContext from playwright.async_api import BrowserContext
from playwright.async_api import async_playwright from playwright.async_api import async_playwright
import utils
import config import config
from .client import XHSClient from tools import utils
from base_crawler import Crawler
from models import xhs as xhs_model
from .exception import * from .exception import *
from .login import XHSLogin
from .client import XHSClient
from models import xhs as xhs_model
from base.base_crawler import AbstractCrawler
from base.proxy_account_pool import AccountPool
class XiaoHongShuCrawler(Crawler): class XiaoHongShuCrawler(AbstractCrawler):
def __init__(self): def __init__(self):
self.login_phone = None
self.login_type = None
self.keywords = None
self.web_session = None
self.cookies: Optional[List[Cookie]] = None # cookies from browser context self.cookies: Optional[List[Cookie]] = None # cookies from browser context
self.cookie_str: Optional[str] = None # cookie string from config or command line
self.browser_context: Optional[BrowserContext] = None self.browser_context: Optional[BrowserContext] = None
self.context_page: Optional[Page] = None self.context_page: Optional[Page] = None
self.proxy: Optional[Dict] = None
self.user_agent = utils.get_user_agent() self.user_agent = utils.get_user_agent()
self.xhs_client: Optional[XHSClient] = None self.xhs_client: Optional[XHSClient] = None
self.index_url = "https://www.xiaohongshu.com" self.index_url = "https://www.xiaohongshu.com"
self.command_args: Optional[Namespace] = None
self.account_pool: Optional[AccountPool] = None
def init_config(self, **kwargs): def init_config(self, **kwargs):
for key in kwargs.keys(): for key in kwargs.keys():
@ -46,15 +38,32 @@ class XiaoHongShuCrawler(Crawler):
async def update_cookies(self): async def update_cookies(self):
self.cookies = await self.browser_context.cookies() self.cookies = await self.browser_context.cookies()
def create_proxy_info(self) -> Tuple[str, Dict, str]:
"""Create proxy info for playwright and httpx"""
# phone: 13012345671
# ip_proxy: 111.122.xx.xx1:8888
# 手机号和IP代理都是从账号池中获取的并且它们是固定绑定的
phone, ip_proxy = self.account_pool.get_account()
playwright_proxy = {
"server": f"{config.IP_PROXY_PROTOCOL}{ip_proxy}",
"username": config.IP_PROXY_USER,
"password": config.IP_PROXY_PASSWORD,
}
httpx_proxy = f"{config.IP_PROXY_PROTOCOL}{config.IP_PROXY_USER}:{config.IP_PROXY_PASSWORD}@{ip_proxy}"
return phone, playwright_proxy, httpx_proxy
async def start(self): async def start(self):
account_phone, playwright_proxy, httpx_proxy = self.create_proxy_info()
if not config.ENABLE_IP_PROXY:
playwright_proxy, httpx_proxy = None, None
async with async_playwright() as playwright: async with async_playwright() as playwright:
# launch browser and create single browser context # launch browser and create single browser context
chromium = playwright.chromium chromium = playwright.chromium
browser = await chromium.launch(headless=True) browser = await chromium.launch(headless=config.HEADLESS, proxy=playwright_proxy)
self.browser_context = await browser.new_context( self.browser_context = await browser.new_context(
viewport={"width": 1920, "height": 1080}, viewport={"width": 1920, "height": 1080},
user_agent=self.user_agent, user_agent=self.user_agent
proxy=self.proxy
) )
# execute JS to bypass anti automation/crawler detection # execute JS to bypass anti automation/crawler detection
@ -62,14 +71,23 @@ class XiaoHongShuCrawler(Crawler):
self.context_page = await self.browser_context.new_page() self.context_page = await self.browser_context.new_page()
await self.context_page.goto(self.index_url) await self.context_page.goto(self.index_url)
# scan qrcode login # begin login
await self.login() login_obj = XHSLogin(
login_type=self.command_args.lt,
login_phone=account_phone,
browser_context=self.browser_context,
context_page=self.context_page,
cookie_str=config.COOKIES
)
await login_obj.begin()
# update cookies
await self.update_cookies() await self.update_cookies()
# init request client # init request client
cookie_str, cookie_dict = utils.convert_cookies(self.cookies) cookie_str, cookie_dict = utils.convert_cookies(self.cookies)
self.xhs_client = XHSClient( self.xhs_client = XHSClient(
proxies=self.proxy, proxies=httpx_proxy,
headers={ headers={
"User-Agent": self.user_agent, "User-Agent": self.user_agent,
"Cookie": cookie_str, "Cookie": cookie_str,
@ -87,153 +105,15 @@ class XiaoHongShuCrawler(Crawler):
# block main crawler coroutine # block main crawler coroutine
await asyncio.Event().wait() await asyncio.Event().wait()
async def login(self): async def close(self):
"""login xiaohongshu website and keep webdriver login state""" await self.browser_context.close()
# There are three ways to log in: await self.browser_context.close()
# 1. Semi-automatic: Log in by scanning the QR code. logging.info("Browser context closed ...")
# 2. Fully automatic: Log in using forwarded text message notifications
# 3. Semi-automatic: Log in using preset cookie
if self.login_type == "qrcode":
await self.login_by_qrcode()
elif self.login_type == "phone":
await self.login_by_mobile()
elif self.login_type == "cookie":
# cookie str convert to cookie dict
for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
await self.browser_context.add_cookies([{
'name': key,
'value': value,
'domain': ".xiaohongshu.com",
'path': "/"
}])
else:
pass
async def login_by_mobile(self):
print("Start executing mobile phone number + verification code login on Xiaohongshu. ...")
await asyncio.sleep(1)
try:
# After entering the main page of Xiaohongshu,
# the login window may not pop up automatically and you need to manually click the login button.
login_button_ele = await self.context_page.wait_for_selector(
selector="xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button",
timeout=5000
)
await login_button_ele.click()
# There are also two types of login dialog boxes for pop-ups.
# One type directly shows the phone number and verification code.
# Another type requires clicking to switch to mobile login.
element = await self.context_page.wait_for_selector(
selector='xpath=//div[@class="login-container"]//div[@class="other-method"]/div[1]',
timeout=5000
)
await element.click()
except:
print("have not found mobile button icon and keep going ...")
await asyncio.sleep(1)
login_container_ele = await self.context_page.wait_for_selector("div.login-container")
# Fill login phone
input_ele = await login_container_ele.query_selector("label.phone > input")
await input_ele.fill(self.login_phone)
await asyncio.sleep(0.5)
# Click to send verification code and fill it from redis server.
send_btn_ele = await login_container_ele.query_selector("label.auth-code > span")
await send_btn_ele.click()
sms_code_input_ele = await login_container_ele.query_selector("label.auth-code > input")
submit_btn_ele = await login_container_ele.query_selector("div.input-container > button")
redis_obj = aioredis.from_url(url=config.redis_db_host, password=config.redis_db_pwd, decode_responses=True)
max_get_sms_code_time = 60 * 2
current_cookie = await self.browser_context.cookies()
_, cookie_dict = utils.convert_cookies(current_cookie)
no_logged_in_session = cookie_dict.get("web_session")
while max_get_sms_code_time > 0:
print(f"get sms code from redis remaining time {max_get_sms_code_time}s ...")
await asyncio.sleep(1)
sms_code_key = f"xhs_{self.login_phone}"
sms_code_value = await redis_obj.get(sms_code_key)
if not sms_code_value:
max_get_sms_code_time -= 1
continue
await sms_code_input_ele.fill(value=sms_code_value) # Enter SMS verification code.
await asyncio.sleep(0.5)
agree_privacy_ele = self.context_page.locator("xpath=//div[@class='agreements']//*[local-name()='svg']")
await agree_privacy_ele.click() # Click "Agree" to the privacy policy.
await asyncio.sleep(0.5)
await submit_btn_ele.click() # Click login button
# todo ... It is necessary to check the correctness of the verification code,
# as it is possible that the entered verification code is incorrect.
break
login_flag: bool = await self.check_login_state(no_logged_in_session)
if not login_flag:
print("login failed please confirm sms code ...")
sys.exit()
wait_redirect_seconds = 5
print(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
await asyncio.sleep(wait_redirect_seconds)
async def login_by_qrcode(self):
"""login xiaohongshu website and keep webdriver login state"""
print("Start scanning QR code to log in to Xiaohongshu. ...")
qrcode_img_selector = "xpath=//img[@class='qrcode-img']"
# find login qrcode
base64_qrcode_img = await utils.find_login_qrcode(
self.context_page,
selector=qrcode_img_selector
)
if not base64_qrcode_img:
print("have not found qrcode and try again get it ....")
# if this website does not automatically popup login dialog box, we will manual click login button
login_button_ele = self.context_page.locator("xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button")
await login_button_ele.click()
base64_qrcode_img = await utils.find_login_qrcode(
self.context_page,
selector=qrcode_img_selector
)
if not base64_qrcode_img:
print("login failed , program exit ...")
sys.exit()
# get not logged session
current_cookie = await self.browser_context.cookies()
_, cookie_dict = utils.convert_cookies(current_cookie)
no_logged_in_session = cookie_dict.get("web_session")
# show login qrcode
utils.show_qrcode(base64_qrcode_img)
print(f"waiting for scan code login, remaining time is 20s")
login_flag: bool = await self.check_login_state(no_logged_in_session)
if not login_flag:
print("login failed please confirm ...")
sys.exit()
wait_redirect_seconds = 5
print(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
await asyncio.sleep(wait_redirect_seconds)
@retry(stop=stop_after_attempt(30), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
async def check_login_state(self, no_logged_in_session: str) -> bool:
"""Check if the current login status is successful and return True otherwise return False"""
# If login is unsuccessful, a retry exception will be thrown.
current_cookie = await self.browser_context.cookies()
_, cookie_dict = utils.convert_cookies(current_cookie)
current_web_session = cookie_dict.get("web_session")
if current_web_session != no_logged_in_session:
return True
return False
async def search_posts(self): async def search_posts(self):
print("Begin search xiaohongshu keywords") logging.info("Begin search xiaohongshu keywords")
# It is possible to modify the source code to allow for the passing of a batch of keywords. for keyword in config.KEYWORDS.split(","):
for keyword in [self.keywords]: logging.info(f"Current keyword: {keyword}")
note_list: List[str] = [] note_list: List[str] = []
max_note_len = 10 max_note_len = 10
page = 1 page = 1
@ -253,7 +133,7 @@ class XiaoHongShuCrawler(Crawler):
await xhs_model.update_xhs_note(note_detail) await xhs_model.update_xhs_note(note_detail)
await asyncio.sleep(0.05) await asyncio.sleep(0.05)
note_list.append(note_id) note_list.append(note_id)
print(f"keyword:{keyword}, note_list:{note_list}") logging.info(f"keyword:{keyword}, note_list:{note_list}")
await self.batch_get_note_comments(note_list) await self.batch_get_note_comments(note_list)
async def batch_get_note_comments(self, note_list: List[str]): async def batch_get_note_comments(self, note_list: List[str]):
@ -264,7 +144,7 @@ class XiaoHongShuCrawler(Crawler):
await asyncio.wait(task_list) await asyncio.wait(task_list)
async def get_comments(self, note_id: str): async def get_comments(self, note_id: str):
print("Begin get note id comments ", note_id) logging.info(f"Begin get note id comments {note_id}")
all_comments = await self.xhs_client.get_note_all_comments(note_id=note_id, crawl_interval=random.random()) all_comments = await self.xhs_client.get_note_all_comments(note_id=note_id, crawl_interval=random.random())
for comment in all_comments: for comment in all_comments:
await xhs_model.update_xhs_note_comment(note_id=note_id, comment_item=comment) await xhs_model.update_xhs_note_comment(note_id=note_id, comment_item=comment)

168
media_platform/xhs/login.py Normal file
View File

@ -0,0 +1,168 @@
import sys
import asyncio
import logging
import aioredis
from tenacity import (
retry,
stop_after_attempt,
wait_fixed,
retry_if_result
)
from playwright.async_api import Page
from playwright.async_api import BrowserContext
import config
from tools import utils
from base.base_crawler import AbstractLogin
class XHSLogin(AbstractLogin):
    """Login flow for xiaohongshu (XHS): QR code scan, mobile SMS code, or raw cookies."""

    def __init__(self,
                 login_type: str,
                 browser_context: BrowserContext,
                 context_page: Page,
                 login_phone: Optional[str] = None,
                 cookie_str: Optional[str] = None
                 ):
        """
        Args:
            login_type: one of "qrcode", "phone" or "cookies".
            browser_context: playwright browser context that holds the cookies.
            context_page: page already navigated to the XHS site.
            login_phone: phone number; required when login_type == "phone".
            cookie_str: raw cookie string; required when login_type == "cookies".
        """
        self.login_type = login_type
        self.browser_context = browser_context
        self.context_page = context_page
        self.login_phone = login_phone
        self.cookie_str = cookie_str

    @retry(stop=stop_after_attempt(20), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
    async def check_login_state(self, no_logged_in_session: str) -> bool:
        """Check if the current login status is successful and return True otherwise return False.

        Retries up to 20 times, 1 second apart, while the result is False.
        Login is considered successful once the "web_session" cookie differs
        from the value captured before login.
        """
        current_cookie = await self.browser_context.cookies()
        _, cookie_dict = utils.convert_cookies(current_cookie)
        current_web_session = cookie_dict.get("web_session")
        return current_web_session != no_logged_in_session

    async def begin(self):
        """Dispatch to the concrete login flow according to self.login_type.

        Raises:
            ValueError: when login_type is not one of the supported values.
        """
        if self.login_type == "qrcode":
            await self.login_by_qrcode()
        elif self.login_type == "phone":
            await self.login_by_mobile()
        elif self.login_type == "cookies":
            await self.login_by_cookies()
        else:
            raise ValueError("Invalid Login Type Currently only supported qrcode or phone or cookies ...")

    async def login_by_mobile(self):
        """Log in with a phone number plus SMS verification code read from redis.

        An external SMS-forwarding tool is expected to push the verification
        code into redis under the key "xhs_<phone>" (see the project's
        recv_sms_notification script).
        """
        logging.info("Begin login xiaohongshu by mobile ...")
        await asyncio.sleep(1)
        try:
            # The login dialog may not pop up automatically on the home page,
            # so click the login button manually first.
            login_button_ele = await self.context_page.wait_for_selector(
                selector="xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button",
                timeout=5000
            )
            await login_button_ele.click()
            # The dialog has two variants: one already shows the phone/code
            # inputs, the other needs an extra click to switch to mobile login.
            element = await self.context_page.wait_for_selector(
                selector='xpath=//div[@class="login-container"]//div[@class="other-method"]/div[1]',
                timeout=5000
            )
            await element.click()
        except Exception:  # dialog already in the right state; keep going
            logging.info("have not found mobile button icon and keep going ...")

        await asyncio.sleep(1)
        login_container_ele = await self.context_page.wait_for_selector("div.login-container")
        input_ele = await login_container_ele.query_selector("label.phone > input")
        await input_ele.fill(self.login_phone)
        await asyncio.sleep(0.5)
        send_btn_ele = await login_container_ele.query_selector("label.auth-code > span")
        await send_btn_ele.click()  # click to send the SMS verification code
        sms_code_input_ele = await login_container_ele.query_selector("label.auth-code > input")
        submit_btn_ele = await login_container_ele.query_selector("div.input-container > button")
        redis_obj = aioredis.from_url(url=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD, decode_responses=True)
        max_get_sms_code_time = 60 * 2  # wait at most 2 minutes for the SMS code
        no_logged_in_session = ""
        try:
            while max_get_sms_code_time > 0:
                logging.info(f"get sms code from redis remaining time {max_get_sms_code_time}s ...")
                await asyncio.sleep(1)
                sms_code_key = f"xhs_{self.login_phone}"
                sms_code_value = await redis_obj.get(sms_code_key)
                if not sms_code_value:
                    max_get_sms_code_time -= 1
                    continue
                # Capture the pre-login session so check_login_state can detect the change.
                current_cookie = await self.browser_context.cookies()
                _, cookie_dict = utils.convert_cookies(current_cookie)
                no_logged_in_session = cookie_dict.get("web_session")
                await sms_code_input_ele.fill(value=sms_code_value)  # fill in the SMS code
                await asyncio.sleep(0.5)
                agree_privacy_ele = self.context_page.locator("xpath=//div[@class='agreements']//*[local-name()='svg']")
                await agree_privacy_ele.click()  # accept the privacy agreement
                await asyncio.sleep(0.5)
                await submit_btn_ele.click()  # click login
                # todo ... should also verify the SMS code is correct; the typed code may be wrong
                break
        finally:
            # Fix: the redis connection used to leak; always release it.
            await redis_obj.close()
        login_flag: bool = await self.check_login_state(no_logged_in_session)
        if not login_flag:
            logging.info("login failed please confirm ...")
            sys.exit()
        wait_redirect_seconds = 5
        logging.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
        await asyncio.sleep(wait_redirect_seconds)

    async def login_by_qrcode(self):
        """login xiaohongshu website and keep webdriver login state"""
        logging.info("Begin login xiaohongshu by qrcode ...")
        await asyncio.sleep(10)
        # login_selector = "div.login-container > div.left > div.qrcode > img"
        qrcode_img_selector = "xpath=//img[@class='qrcode-img']"
        # find login qrcode
        base64_qrcode_img = await utils.find_login_qrcode(
            self.context_page,
            selector=qrcode_img_selector
        )
        if not base64_qrcode_img:
            logging.info("login failed , have not found qrcode please check ....")
            # if this website does not automatically popup login dialog box, we will manual click login button
            await asyncio.sleep(0.5)
            login_button_ele = self.context_page.locator("xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button")
            await login_button_ele.click()
            base64_qrcode_img = await utils.find_login_qrcode(
                self.context_page,
                selector=qrcode_img_selector
            )
            if not base64_qrcode_img:
                sys.exit()

        # Capture the not-logged-in session so check_login_state can detect the change.
        current_cookie = await self.browser_context.cookies()
        _, cookie_dict = utils.convert_cookies(current_cookie)
        no_logged_in_session = cookie_dict.get("web_session")

        # Render the QR code in the terminal for the user to scan.
        utils.show_qrcode(base64_qrcode_img)
        logging.info("waiting for scan code login, remaining time is 20s")
        login_flag: bool = await self.check_login_state(no_logged_in_session)
        if not login_flag:
            logging.info("login failed please confirm ...")
            sys.exit()
        wait_redirect_seconds = 5
        logging.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
        await asyncio.sleep(wait_redirect_seconds)

    async def login_by_cookies(self):
        """Log in by injecting a user-provided cookie string into the browser context."""
        logging.info("Begin login xiaohongshu by cookie ...")
        for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
            await self.browser_context.add_cookies([{
                'name': key,
                'value': value,
                'domain': ".xiaohongshu.com",
                'path': "/"
            }])

View File

@ -1,7 +1,7 @@
import json import json
from typing import Dict, List from typing import Dict, List
import utils from tools import utils
async def update_douyin_aweme(aweme_item: Dict): async def update_douyin_aweme(aweme_item: Dict):
@ -24,7 +24,7 @@ async def update_douyin_aweme(aweme_item: Dict):
"last_modify_ts": utils.get_current_timestamp(), "last_modify_ts": utils.get_current_timestamp(),
} }
# do something ... # do something ...
print(f"update douyin aweme id:{aweme_id}, title:{local_db_item.get('title')}") print(f"douyin aweme id:{aweme_id}, title:{local_db_item.get('title')}")
async def batch_update_dy_aweme_comments(aweme_id: str, comments: List[Dict]): async def batch_update_dy_aweme_comments(aweme_id: str, comments: List[Dict]):
@ -61,4 +61,4 @@ async def update_dy_aweme_comment(aweme_id: str, comment_item: Dict):
"last_modify_ts": utils.get_current_timestamp(), "last_modify_ts": utils.get_current_timestamp(),
} }
# do something ... # do something ...
print(f"update aweme comment: {comment_id}, content: {local_db_item.get('content')}") print(f"douyin aweme comment: {comment_id}, content: {local_db_item.get('content')}")

View File

@ -1,6 +1,6 @@
from typing import Dict from typing import Dict
import utils from tools import utils
async def update_xhs_note(note_item: Dict): async def update_xhs_note(note_item: Dict):
@ -24,7 +24,7 @@ async def update_xhs_note(note_item: Dict):
"last_modify_ts": utils.get_current_timestamp(), "last_modify_ts": utils.get_current_timestamp(),
} }
# do something ... # do something ...
print("update note:", local_db_item) print("xhs note:", local_db_item)
async def update_xhs_note_comment(note_id: str, comment_item: Dict): async def update_xhs_note_comment(note_id: str, comment_item: Dict):
@ -43,4 +43,4 @@ async def update_xhs_note_comment(note_id: str, comment_item: Dict):
"last_modify_ts": utils.get_current_timestamp(), "last_modify_ts": utils.get_current_timestamp(),
} }
# do something ... # do something ...
print("update comment:", local_db_item) print("xhs note comment:", local_db_item)

0
tools/__init__.py Normal file
View File

View File

@ -2,6 +2,7 @@ import re
import time import time
import random import random
import base64 import base64
import logging
from io import BytesIO from io import BytesIO
from typing import Optional, Dict, List, Tuple from typing import Optional, Dict, List, Tuple
@ -91,3 +92,13 @@ def match_interact_info_count(count_str: str) -> int:
return int(number) return int(number)
else: else:
return 0 return 0
def init_loging_config():
    """Configure the root logger: INFO level with a uniform timestamped format.

    Safe to call once at program start; subsequent calls are no-ops because
    logging.basicConfig only configures a root logger without handlers.
    """
    level = logging.INFO
    logging.basicConfig(
        level=level,
        format="%(asctime)s %(name)s %(levelname)s %(message)s ",
        datefmt='%Y-%m-%d %H:%M:%S'
    )
    # Fix: removed dead code `logging.Logger("Media Crawler")`, which built a
    # throwaway Logger object that was never registered or used.