fix: issue #22

This commit is contained in:
Relakkes 2023-07-30 20:43:02 +08:00
parent 4ff2cf8661
commit bf659455bb
7 changed files with 51 additions and 35 deletions

View File

@@ -27,16 +27,32 @@
## 使用方法 ## 使用方法
1. 安装依赖库 1. 安装依赖库
`pip install -r requirements.txt`
2. 安装playwright浏览器驱动
`playwright install`
3. 是否选择开启保存数据到DB中
如果选择开启,则需要配置数据库连接信息,`config/db_config.py` 中的 `IS_SAVED_DATABASED`和`RELATION_DB_URL` 变量
<br>再执行 `python db.py` 初始化数据库信息,生成相关的数据库表结构
4. 运行爬虫程序
`python main.py --platform xhs --lt qrcode`
5. 打开对应APP扫二维码登录
```shell
pip install -r requirements.txt
```
2. 安装playwright浏览器驱动
```shell
playwright install
```
3. 是否保存数据到DB中
如果选择开启,则需要配置数据库连接信息,`config/db_config.py` 中的 `IS_SAVED_DATABASED`和`RELATION_DB_URL` 变量。然后执行以下命令初始化数据库信息,生成相关的数据库表结构:
```shell
python db.py
```
4. 运行爬虫程序
```shell
python main.py --platform xhs --lt qrcode
```
5. 打开对应APP扫二维码登录
## 项目代码结构 ## 项目代码结构
@@ -46,11 +62,9 @@ MediaCrawler
│ ├── base_crawler.py # 项目的抽象类 │ ├── base_crawler.py # 项目的抽象类
│ └── proxy_account_pool.py # 账号与IP代理池 │ └── proxy_account_pool.py # 账号与IP代理池
├── config ├── config
│ ├── account_config.py # 基础配置 │ ├── account_config.py # 账号代理池配置
│ └── base_config.py # 账号池配置 │ ├── base_config.py # 基础配置
├── images │ └── db_config.py # 数据库配置
│ ├── douyin.gif
│ └── xiaohongshu.git
├── libs ├── libs
│ ├── douyin.js # 抖音Sign函数 │ ├── douyin.js # 抖音Sign函数
│ └── stealth.min.js # 去除浏览器自动化特征的JS │ └── stealth.min.js # 去除浏览器自动化特征的JS

View File

@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# account_config.py # account_config.py
import os
PHONE_LIST = [ PHONE_LIST = [
"13012345671", "13012345671",
@@ -22,6 +23,5 @@ IP_PROXY_LIST = [
] ]
IP_PROXY_PROTOCOL = "http://" IP_PROXY_PROTOCOL = "http://"
IP_PROXY_USER = "xxxx" IP_PROXY_USER = os.getenv("IP_PROXY_USER", "test")
IP_PROXY_PASSWORD = "xxxx" IP_PROXY_PASSWORD = os.getenv("IP_PROXY_PASSWORD", "123456")

View File

@@ -1,9 +1,12 @@
import os
# redis config # redis config
REDIS_DB_HOST = "redis://127.0.0.1" # your redis host REDIS_DB_HOST = "127.0.0.1" # your redis host
REDIS_DB_PWD = "123456" # your redis password REDIS_DB_PWD = os.getenv("REDIS_DB_PWD", "123456") # your redis password
# mysql config # mysql config
RELATION_DB_URL = "mysql://root:youdbpassword@localhost:3306/media_crawler" RELATION_DB_PWD = os.getenv("RELATION_DB_PWD", "123456") # your relation db password
RELATION_DB_URL = f"mysql://root:{RELATION_DB_PWD}@localhost:3306/media_crawler"
# save data to database option # save data to database option
IS_SAVED_DATABASED = True # if you want to save data to database, set True IS_SAVED_DATABASED = True # if you want to save data to database, set True

View File

@@ -3,7 +3,7 @@ import functools
import sys import sys
from typing import Optional from typing import Optional
import aioredis import redis
from playwright.async_api import BrowserContext, Page from playwright.async_api import BrowserContext, Page
from playwright.async_api import TimeoutError as PlaywrightTimeoutError from playwright.async_api import TimeoutError as PlaywrightTimeoutError
from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt, from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
@@ -121,20 +121,19 @@ class DouYinLogin(AbstractLogin):
# 检查是否有滑动验证码 # 检查是否有滑动验证码
await self.check_page_display_slider(move_step=10, slider_level="easy") await self.check_page_display_slider(move_step=10, slider_level="easy")
redis_obj = redis.Redis(host=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD)
redis_obj = aioredis.from_url(url=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD, decode_responses=True)
max_get_sms_code_time = 60 * 2 # 最长获取验证码的时间为2分钟 max_get_sms_code_time = 60 * 2 # 最长获取验证码的时间为2分钟
while max_get_sms_code_time > 0: while max_get_sms_code_time > 0:
utils.logger.info(f"get douyin sms code from redis remaining time {max_get_sms_code_time}s ...") utils.logger.info(f"get douyin sms code from redis remaining time {max_get_sms_code_time}s ...")
await asyncio.sleep(1) await asyncio.sleep(1)
sms_code_key = f"dy_{self.login_phone}" sms_code_key = f"dy_{self.login_phone}"
sms_code_value = await redis_obj.get(sms_code_key) sms_code_value = redis_obj.get(sms_code_key)
if not sms_code_value: if not sms_code_value:
max_get_sms_code_time -= 1 max_get_sms_code_time -= 1
continue continue
sms_code_input_ele = self.context_page.locator("xpath=//input[@placeholder='请输入验证码']") sms_code_input_ele = self.context_page.locator("xpath=//input[@placeholder='请输入验证码']")
await sms_code_input_ele.fill(value=sms_code_value) await sms_code_input_ele.fill(value=sms_code_value.decode())
await asyncio.sleep(0.5) await asyncio.sleep(0.5)
submit_btn_ele = self.context_page.locator("xpath=//button[@class='web-login-button']") submit_btn_ele = self.context_page.locator("xpath=//button[@class='web-login-button']")
await submit_btn_ele.click() # 点击登录 await submit_btn_ele.click() # 点击登录

View File

@@ -3,7 +3,7 @@ import functools
import sys import sys
from typing import Optional from typing import Optional
import aioredis import redis
from playwright.async_api import BrowserContext, Page from playwright.async_api import BrowserContext, Page
from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt, from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
wait_fixed) wait_fixed)
@@ -85,15 +85,14 @@ class XHSLogin(AbstractLogin):
await send_btn_ele.click() # 点击发送验证码 await send_btn_ele.click() # 点击发送验证码
sms_code_input_ele = await login_container_ele.query_selector("label.auth-code > input") sms_code_input_ele = await login_container_ele.query_selector("label.auth-code > input")
submit_btn_ele = await login_container_ele.query_selector("div.input-container > button") submit_btn_ele = await login_container_ele.query_selector("div.input-container > button")
redis_obj = redis.Redis(host=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD)
redis_obj = aioredis.from_url(url=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD, decode_responses=True)
max_get_sms_code_time = 60 * 2 # 最长获取验证码的时间为2分钟 max_get_sms_code_time = 60 * 2 # 最长获取验证码的时间为2分钟
no_logged_in_session = "" no_logged_in_session = ""
while max_get_sms_code_time > 0: while max_get_sms_code_time > 0:
utils.logger.info(f"get sms code from redis remaining time {max_get_sms_code_time}s ...") utils.logger.info(f"get sms code from redis remaining time {max_get_sms_code_time}s ...")
await asyncio.sleep(1) await asyncio.sleep(1)
sms_code_key = f"xhs_{self.login_phone}" sms_code_key = f"xhs_{self.login_phone}"
sms_code_value = await redis_obj.get(sms_code_key) sms_code_value = redis_obj.get(sms_code_key)
if not sms_code_value: if not sms_code_value:
max_get_sms_code_time -= 1 max_get_sms_code_time -= 1
continue continue
@@ -102,7 +101,7 @@ class XHSLogin(AbstractLogin):
_, cookie_dict = utils.convert_cookies(current_cookie) _, cookie_dict = utils.convert_cookies(current_cookie)
no_logged_in_session = cookie_dict.get("web_session") no_logged_in_session = cookie_dict.get("web_session")
await sms_code_input_ele.fill(value=sms_code_value) # 输入短信验证码 await sms_code_input_ele.fill(value=sms_code_value.decode()) # 输入短信验证码
await asyncio.sleep(0.5) await asyncio.sleep(0.5)
agree_privacy_ele = self.context_page.locator("xpath=//div[@class='agreements']//*[local-name()='svg']") agree_privacy_ele = self.context_page.locator("xpath=//div[@class='agreements']//*[local-name()='svg']")
await agree_privacy_ele.click() # 点击同意隐私协议 await agree_privacy_ele.click() # 点击同意隐私协议

View File

@@ -4,7 +4,7 @@ import json
import re import re
from typing import List from typing import List
import aioredis import redis
import tornado.web import tornado.web
import config import config
@@ -47,7 +47,7 @@ class RecvSmsNotificationHandler(tornado.web.RequestHandler):
request_body = self.request.body.decode("utf-8") request_body = self.request.body.decode("utf-8")
req_body_dict = json.loads(request_body) req_body_dict = json.loads(request_body)
print("recv sms notification and body content: ", req_body_dict) print("recv sms notification and body content: ", req_body_dict)
redis_obj = aioredis.from_url(url=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD, decode_responses=True) redis_obj = redis.Redis(host=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD)
sms_content = req_body_dict.get("sms_content") sms_content = req_body_dict.get("sms_content")
sms_code = extract_verification_code(sms_content) sms_code = extract_verification_code(sms_content)
if sms_code: if sms_code:
@@ -55,7 +55,7 @@ class RecvSmsNotificationHandler(tornado.web.RequestHandler):
# Use Redis string data structure, in the following format: # Use Redis string data structure, in the following format:
# xhs_138xxxxxxxx -> 171959 # xhs_138xxxxxxxx -> 171959
key = f"{req_body_dict.get('platform')}_{req_body_dict.get('current_number')}" key = f"{req_body_dict.get('platform')}_{req_body_dict.get('current_number')}"
await redis_obj.set(name=key, value=sms_code, ex=60 * 3) redis_obj.set(name=key, value=sms_code, ex=60 * 3)
self.set_status(200) self.set_status(200)
self.write("ok") self.write("ok")

View File

@@ -1,10 +1,11 @@
httpx==0.24.0 httpx==0.24.0
Pillow==9.5.0 Pillow==9.5.0
playwright==1.33.0 playwright==1.33.0
aioredis==2.0.1
tenacity==8.2.2 tenacity==8.2.2
tornado==6.3.2 tornado==6.3.2
PyExecJS==1.5.1 PyExecJS==1.5.1
opencv-python==4.7.0.72 opencv-python==4.7.0.72
tortoise-orm[asyncmy]==0.19.3 tortoise-orm[asyncmy]==0.19.3
aerich==0.7.2 aerich==0.7.2
numpy~=1.24.4
redis~=4.6.0