fix: issue #22

This commit is contained in:
Relakkes 2023-07-30 20:43:02 +08:00
parent 4ff2cf8661
commit bf659455bb
7 changed files with 51 additions and 35 deletions

View File

@ -27,16 +27,32 @@
## 使用方法
1. 安装依赖库
`pip install -r requirements.txt`
2. 安装playwright浏览器驱动
`playwright install`
3. 是否选择开启保存数据到DB中
如果选择开启,则需要配置数据库连接信息,`config/db_config.py` 中的 `IS_SAVED_DATABASED`和`RELATION_DB_URL` 变量
<br>再执行 `python db.py` 初始化数据库信息,生成相关的数据库表结构
4. 运行爬虫程序
`python main.py --platform xhs --lt qrcode`
5. 打开对应APP扫二维码登录
```shell
pip install -r requirements.txt
```
2. 安装playwright浏览器驱动
```shell
playwright install
```
3. 是否保存数据到DB中
如果选择开启,则需要在 `config/db_config.py` 中配置数据库连接信息,即 `IS_SAVED_DATABASED` 和 `RELATION_DB_URL` 两个变量。然后执行以下命令初始化数据库,生成相关的数据库表结构:
```shell
python db.py
```
4. 运行爬虫程序
```shell
python main.py --platform xhs --lt qrcode
```
5. 打开对应APP扫二维码登录
## 项目代码结构
@ -46,11 +62,9 @@ MediaCrawler
│ ├── base_crawler.py # 项目的抽象类
│ └── proxy_account_pool.py # 账号与IP代理池
├── config
│ ├── account_config.py # 基础配置
│ └── base_config.py # 账号池配置
├── images
│ ├── douyin.gif
│ └── xiaohongshu.git
│ ├── account_config.py # 账号代理池配置
│ ├── base_config.py # 基础配置
│ └── db_config.py # 数据库配置
├── libs
│ ├── douyin.js # 抖音Sign函数
│ └── stealth.min.js # 去除浏览器自动化特征的JS

View File

@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
# account_config.py
import os
PHONE_LIST = [
"13012345671",
@ -22,6 +23,5 @@ IP_PROXY_LIST = [
]
IP_PROXY_PROTOCOL = "http://"
IP_PROXY_USER = "xxxx"
IP_PROXY_PASSWORD = "xxxx"
IP_PROXY_USER = os.getenv("IP_PROXY_USER", "test")
IP_PROXY_PASSWORD = os.getenv("IP_PROXY_PASSWORD", "123456")

View File

@ -1,9 +1,12 @@
import os
# redis config
REDIS_DB_HOST = "redis://127.0.0.1" # your redis host
REDIS_DB_PWD = "123456" # your redis password
REDIS_DB_HOST = "127.0.0.1" # your redis host
REDIS_DB_PWD = os.getenv("REDIS_DB_PWD", "123456") # your redis password
# mysql config
RELATION_DB_URL = "mysql://root:youdbpassword@localhost:3306/media_crawler"
RELATION_DB_PWD = os.getenv("RELATION_DB_PWD", "123456") # your relation db password
RELATION_DB_URL = f"mysql://root:{RELATION_DB_PWD}@localhost:3306/media_crawler"
# save data to database option
IS_SAVED_DATABASED = True # set to True to save data to the database

View File

@ -3,7 +3,7 @@ import functools
import sys
from typing import Optional
import aioredis
import redis
from playwright.async_api import BrowserContext, Page
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
@ -121,20 +121,19 @@ class DouYinLogin(AbstractLogin):
# 检查是否有滑动验证码
await self.check_page_display_slider(move_step=10, slider_level="easy")
redis_obj = aioredis.from_url(url=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD, decode_responses=True)
redis_obj = redis.Redis(host=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD)
max_get_sms_code_time = 60 * 2 # 最长获取验证码的时间为2分钟
while max_get_sms_code_time > 0:
utils.logger.info(f"get douyin sms code from redis remaining time {max_get_sms_code_time}s ...")
await asyncio.sleep(1)
sms_code_key = f"dy_{self.login_phone}"
sms_code_value = await redis_obj.get(sms_code_key)
sms_code_value = redis_obj.get(sms_code_key)
if not sms_code_value:
max_get_sms_code_time -= 1
continue
sms_code_input_ele = self.context_page.locator("xpath=//input[@placeholder='请输入验证码']")
await sms_code_input_ele.fill(value=sms_code_value)
await sms_code_input_ele.fill(value=sms_code_value.decode())
await asyncio.sleep(0.5)
submit_btn_ele = self.context_page.locator("xpath=//button[@class='web-login-button']")
await submit_btn_ele.click() # 点击登录

View File

@ -3,7 +3,7 @@ import functools
import sys
from typing import Optional
import aioredis
import redis
from playwright.async_api import BrowserContext, Page
from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
wait_fixed)
@ -85,15 +85,14 @@ class XHSLogin(AbstractLogin):
await send_btn_ele.click() # 点击发送验证码
sms_code_input_ele = await login_container_ele.query_selector("label.auth-code > input")
submit_btn_ele = await login_container_ele.query_selector("div.input-container > button")
redis_obj = aioredis.from_url(url=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD, decode_responses=True)
redis_obj = redis.Redis(host=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD)
max_get_sms_code_time = 60 * 2 # 最长获取验证码的时间为2分钟
no_logged_in_session = ""
while max_get_sms_code_time > 0:
utils.logger.info(f"get sms code from redis remaining time {max_get_sms_code_time}s ...")
await asyncio.sleep(1)
sms_code_key = f"xhs_{self.login_phone}"
sms_code_value = await redis_obj.get(sms_code_key)
sms_code_value = redis_obj.get(sms_code_key)
if not sms_code_value:
max_get_sms_code_time -= 1
continue
@ -102,7 +101,7 @@ class XHSLogin(AbstractLogin):
_, cookie_dict = utils.convert_cookies(current_cookie)
no_logged_in_session = cookie_dict.get("web_session")
await sms_code_input_ele.fill(value=sms_code_value) # 输入短信验证码
await sms_code_input_ele.fill(value=sms_code_value.decode()) # 输入短信验证码
await asyncio.sleep(0.5)
agree_privacy_ele = self.context_page.locator("xpath=//div[@class='agreements']//*[local-name()='svg']")
await agree_privacy_ele.click() # 点击同意隐私协议

View File

@ -4,7 +4,7 @@ import json
import re
from typing import List
import aioredis
import redis
import tornado.web
import config
@ -15,7 +15,7 @@ def extract_verification_code(message) -> str:
Extract verification code of 6 digits from the SMS.
"""
pattern = re.compile(r'\b[0-9]{6}\b')
codes: List[str]= pattern.findall(message)
codes: List[str] = pattern.findall(message)
return codes[0] if codes and len(codes) > 0 else ""
@ -47,7 +47,7 @@ class RecvSmsNotificationHandler(tornado.web.RequestHandler):
request_body = self.request.body.decode("utf-8")
req_body_dict = json.loads(request_body)
print("recv sms notification and body content: ", req_body_dict)
redis_obj = aioredis.from_url(url=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD, decode_responses=True)
redis_obj = redis.Redis(host=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD)
sms_content = req_body_dict.get("sms_content")
sms_code = extract_verification_code(sms_content)
if sms_code:
@ -55,7 +55,7 @@ class RecvSmsNotificationHandler(tornado.web.RequestHandler):
# Use Redis string data structure, in the following format:
# xhs_138xxxxxxxx -> 171959
key = f"{req_body_dict.get('platform')}_{req_body_dict.get('current_number')}"
await redis_obj.set(name=key, value=sms_code, ex=60 * 3)
redis_obj.set(name=key, value=sms_code, ex=60 * 3)
self.set_status(200)
self.write("ok")

View File

@ -1,10 +1,11 @@
httpx==0.24.0
Pillow==9.5.0
playwright==1.33.0
aioredis==2.0.1
tenacity==8.2.2
tornado==6.3.2
PyExecJS==1.5.1
opencv-python==4.7.0.72
tortoise-orm[asyncmy]==0.19.3
aerich==0.7.2
numpy~=1.24.4
redis~=4.6.0