fix: issue #22
This commit is contained in:
parent
4ff2cf8661
commit
bf659455bb
42
README.md
42
README.md
|
@ -27,16 +27,32 @@
|
|||
## 使用方法
|
||||
|
||||
1. 安装依赖库
|
||||
`pip install -r requirements.txt`
|
||||
2. 安装playwright浏览器驱动
|
||||
`playwright install`
|
||||
3. 是否选择开启保存数据到DB中
|
||||
如果选择开启,则需要配置数据库连接信息,`config/db_config.py` 中的 `IS_SAVED_DATABASED`和`RELATION_DB_URL` 变量
|
||||
<br>再执行 `python db.py` 初始化数据库信息,生成相关的数据库表结构
|
||||
4. 运行爬虫程序
|
||||
`python main.py --platform xhs --lt qrcode`
|
||||
5. 打开对应APP扫二维码登录
|
||||
|
||||
```shell
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
2. 安装playwright浏览器驱动
|
||||
|
||||
```shell
|
||||
playwright install
|
||||
```
|
||||
|
||||
3. 是否保存数据到DB中
|
||||
|
||||
如果选择开启,则需要配置数据库连接信息,即 `config/db_config.py` 中的 `IS_SAVED_DATABASED` 和 `RELATION_DB_URL` 变量。然后执行以下命令初始化数据库,生成相关的数据库表结构:
|
||||
|
||||
```shell
|
||||
python db.py
|
||||
```
|
||||
|
||||
4. 运行爬虫程序
|
||||
|
||||
```shell
|
||||
python main.py --platform xhs --lt qrcode
|
||||
```
|
||||
|
||||
5. 打开对应APP扫二维码登录
|
||||
|
||||
## 项目代码结构
|
||||
|
||||
|
@ -46,11 +62,9 @@ MediaCrawler
|
|||
│ ├── base_crawler.py # 项目的抽象类
|
||||
│ └── proxy_account_pool.py # 账号与IP代理池
|
||||
├── config
|
||||
│ ├── account_config.py # 基础配置
|
||||
│ └── base_config.py # 账号池配置
|
||||
├── images
|
||||
│ ├── douyin.gif
|
||||
│ └── xiaohongshu.git
|
||||
│ ├── account_config.py # 账号代理池配置
|
||||
│ ├── base_config.py # 基础配置
|
||||
│ └── db_config.py # 数据库配置
|
||||
├── libs
|
||||
│ ├── douyin.js # 抖音Sign函数
|
||||
│ └── stealth.min.js # 去除浏览器自动化特征的JS
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# account_config.py
|
||||
import os
|
||||
|
||||
PHONE_LIST = [
|
||||
"13012345671",
|
||||
|
@ -22,6 +23,5 @@ IP_PROXY_LIST = [
|
|||
]
|
||||
|
||||
IP_PROXY_PROTOCOL = "http://"
|
||||
IP_PROXY_USER = "xxxx"
|
||||
IP_PROXY_PASSWORD = "xxxx"
|
||||
|
||||
IP_PROXY_USER = os.getenv("IP_PROXY_USER", "test")
|
||||
IP_PROXY_PASSWORD = os.getenv("IP_PROXY_PASSWORD", "123456")
|
||||
|
|
|
@ -1,9 +1,12 @@
|
|||
import os
|
||||
|
||||
# redis config
|
||||
REDIS_DB_HOST = "redis://127.0.0.1" # your redis host
|
||||
REDIS_DB_PWD = "123456" # your redis password
|
||||
REDIS_DB_HOST = "127.0.0.1" # your redis host
|
||||
REDIS_DB_PWD = os.getenv("REDIS_DB_PWD", "123456") # your redis password
|
||||
|
||||
# mysql config
|
||||
RELATION_DB_URL = "mysql://root:youdbpassword@localhost:3306/media_crawler"
|
||||
RELATION_DB_PWD = os.getenv("RELATION_DB_PWD", "123456") # your relation db password
|
||||
RELATION_DB_URL = f"mysql://root:{RELATION_DB_PWD}@localhost:3306/media_crawler"
|
||||
|
||||
# save data to database option
|
||||
IS_SAVED_DATABASED = True # if you want to save data to database, set True
|
||||
|
|
|
@ -3,7 +3,7 @@ import functools
|
|||
import sys
|
||||
from typing import Optional
|
||||
|
||||
import aioredis
|
||||
import redis
|
||||
from playwright.async_api import BrowserContext, Page
|
||||
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
|
||||
from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
|
||||
|
@ -121,20 +121,19 @@ class DouYinLogin(AbstractLogin):
|
|||
|
||||
# 检查是否有滑动验证码
|
||||
await self.check_page_display_slider(move_step=10, slider_level="easy")
|
||||
|
||||
redis_obj = aioredis.from_url(url=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD, decode_responses=True)
|
||||
redis_obj = redis.Redis(host=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD)
|
||||
max_get_sms_code_time = 60 * 2 # 最长获取验证码的时间为2分钟
|
||||
while max_get_sms_code_time > 0:
|
||||
utils.logger.info(f"get douyin sms code from redis remaining time {max_get_sms_code_time}s ...")
|
||||
await asyncio.sleep(1)
|
||||
sms_code_key = f"dy_{self.login_phone}"
|
||||
sms_code_value = await redis_obj.get(sms_code_key)
|
||||
sms_code_value = redis_obj.get(sms_code_key)
|
||||
if not sms_code_value:
|
||||
max_get_sms_code_time -= 1
|
||||
continue
|
||||
|
||||
sms_code_input_ele = self.context_page.locator("xpath=//input[@placeholder='请输入验证码']")
|
||||
await sms_code_input_ele.fill(value=sms_code_value)
|
||||
await sms_code_input_ele.fill(value=sms_code_value.decode())
|
||||
await asyncio.sleep(0.5)
|
||||
submit_btn_ele = self.context_page.locator("xpath=//button[@class='web-login-button']")
|
||||
await submit_btn_ele.click() # 点击登录
|
||||
|
|
|
@ -3,7 +3,7 @@ import functools
|
|||
import sys
|
||||
from typing import Optional
|
||||
|
||||
import aioredis
|
||||
import redis
|
||||
from playwright.async_api import BrowserContext, Page
|
||||
from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
|
||||
wait_fixed)
|
||||
|
@ -85,15 +85,14 @@ class XHSLogin(AbstractLogin):
|
|||
await send_btn_ele.click() # 点击发送验证码
|
||||
sms_code_input_ele = await login_container_ele.query_selector("label.auth-code > input")
|
||||
submit_btn_ele = await login_container_ele.query_selector("div.input-container > button")
|
||||
|
||||
redis_obj = aioredis.from_url(url=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD, decode_responses=True)
|
||||
redis_obj = redis.Redis(host=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD)
|
||||
max_get_sms_code_time = 60 * 2 # 最长获取验证码的时间为2分钟
|
||||
no_logged_in_session = ""
|
||||
while max_get_sms_code_time > 0:
|
||||
utils.logger.info(f"get sms code from redis remaining time {max_get_sms_code_time}s ...")
|
||||
await asyncio.sleep(1)
|
||||
sms_code_key = f"xhs_{self.login_phone}"
|
||||
sms_code_value = await redis_obj.get(sms_code_key)
|
||||
sms_code_value = redis_obj.get(sms_code_key)
|
||||
if not sms_code_value:
|
||||
max_get_sms_code_time -= 1
|
||||
continue
|
||||
|
@ -102,7 +101,7 @@ class XHSLogin(AbstractLogin):
|
|||
_, cookie_dict = utils.convert_cookies(current_cookie)
|
||||
no_logged_in_session = cookie_dict.get("web_session")
|
||||
|
||||
await sms_code_input_ele.fill(value=sms_code_value) # 输入短信验证码
|
||||
await sms_code_input_ele.fill(value=sms_code_value.decode()) # 输入短信验证码
|
||||
await asyncio.sleep(0.5)
|
||||
agree_privacy_ele = self.context_page.locator("xpath=//div[@class='agreements']//*[local-name()='svg']")
|
||||
await agree_privacy_ele.click() # 点击同意隐私协议
|
||||
|
|
|
@ -4,7 +4,7 @@ import json
|
|||
import re
|
||||
from typing import List
|
||||
|
||||
import aioredis
|
||||
import redis
|
||||
import tornado.web
|
||||
|
||||
import config
|
||||
|
@ -15,7 +15,7 @@ def extract_verification_code(message) -> str:
|
|||
Extract verification code of 6 digits from the SMS.
|
||||
"""
|
||||
pattern = re.compile(r'\b[0-9]{6}\b')
|
||||
codes: List[str]= pattern.findall(message)
|
||||
codes: List[str] = pattern.findall(message)
|
||||
return codes[0] if codes and len(codes) > 0 else ""
|
||||
|
||||
|
||||
|
@ -47,7 +47,7 @@ class RecvSmsNotificationHandler(tornado.web.RequestHandler):
|
|||
request_body = self.request.body.decode("utf-8")
|
||||
req_body_dict = json.loads(request_body)
|
||||
print("recv sms notification and body content: ", req_body_dict)
|
||||
redis_obj = aioredis.from_url(url=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD, decode_responses=True)
|
||||
redis_obj = redis.Redis(host=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD)
|
||||
sms_content = req_body_dict.get("sms_content")
|
||||
sms_code = extract_verification_code(sms_content)
|
||||
if sms_code:
|
||||
|
@ -55,7 +55,7 @@ class RecvSmsNotificationHandler(tornado.web.RequestHandler):
|
|||
# Use Redis string data structure, in the following format:
|
||||
# xhs_138xxxxxxxx -> 171959
|
||||
key = f"{req_body_dict.get('platform')}_{req_body_dict.get('current_number')}"
|
||||
await redis_obj.set(name=key, value=sms_code, ex=60 * 3)
|
||||
redis_obj.set(name=key, value=sms_code, ex=60 * 3)
|
||||
self.set_status(200)
|
||||
self.write("ok")
|
||||
|
||||
|
|
|
@ -1,10 +1,11 @@
|
|||
httpx==0.24.0
|
||||
Pillow==9.5.0
|
||||
playwright==1.33.0
|
||||
aioredis==2.0.1
|
||||
tenacity==8.2.2
|
||||
tornado==6.3.2
|
||||
PyExecJS==1.5.1
|
||||
opencv-python==4.7.0.72
|
||||
tortoise-orm[asyncmy]==0.19.3
|
||||
aerich==0.7.2
|
||||
numpy~=1.24.4
|
||||
redis~=4.6.0
|
Loading…
Reference in New Issue