fix: issue #22
This commit is contained in:
parent
4ff2cf8661
commit
bf659455bb
42
README.md
42
README.md
|
@ -27,16 +27,32 @@
|
||||||
## 使用方法
|
## 使用方法
|
||||||
|
|
||||||
1. 安装依赖库
|
1. 安装依赖库
|
||||||
`pip install -r requirements.txt`
|
|
||||||
2. 安装playwright浏览器驱动
|
|
||||||
`playwright install`
|
|
||||||
3. 是否选择开启保存数据到DB中
|
|
||||||
如果选择开启,则需要配置数据库连接信息,`config/db_config.py` 中的 `IS_SAVED_DATABASED`和`RELATION_DB_URL` 变量
|
|
||||||
<br>再执行 `python db.py` 初始化数据库信息,生成相关的数据库表结构
|
|
||||||
4. 运行爬虫程序
|
|
||||||
`python main.py --platform xhs --lt qrcode`
|
|
||||||
5. 打开对应APP扫二维码登录
|
|
||||||
|
|
||||||
|
```shell
|
||||||
|
pip install -r requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
2. 安装playwright浏览器驱动
|
||||||
|
|
||||||
|
```shell
|
||||||
|
playwright install
|
||||||
|
```
|
||||||
|
|
||||||
|
3. 是否保存数据到DB中
|
||||||
|
|
||||||
|
如果选择开启,则需要在 `config/db_config.py` 中配置数据库连接信息,即 `IS_SAVED_DATABASED` 和 `RELATION_DB_URL` 两个变量。然后执行以下命令初始化数据库,生成相关的数据库表结构:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
python db.py
|
||||||
|
```
|
||||||
|
|
||||||
|
4. 运行爬虫程序
|
||||||
|
|
||||||
|
```shell
|
||||||
|
python main.py --platform xhs --lt qrcode
|
||||||
|
```
|
||||||
|
|
||||||
|
5. 打开对应APP扫二维码登录
|
||||||
|
|
||||||
## 项目代码结构
|
## 项目代码结构
|
||||||
|
|
||||||
|
@ -46,11 +62,9 @@ MediaCrawler
|
||||||
│ ├── base_crawler.py # 项目的抽象类
|
│ ├── base_crawler.py # 项目的抽象类
|
||||||
│ └── proxy_account_pool.py # 账号与IP代理池
|
│ └── proxy_account_pool.py # 账号与IP代理池
|
||||||
├── config
|
├── config
|
||||||
│ ├── account_config.py # 基础配置
|
│ ├── account_config.py # 账号代理池配置
|
||||||
│ └── base_config.py # 账号池配置
|
│ ├── base_config.py # 基础配置
|
||||||
├── images
|
│ └── db_config.py # 数据库配置
|
||||||
│ ├── douyin.gif
|
|
||||||
│ └── xiaohongshu.git
|
|
||||||
├── libs
|
├── libs
|
||||||
│ ├── douyin.js # 抖音Sign函数
|
│ ├── douyin.js # 抖音Sign函数
|
||||||
│ └── stealth.min.js # 去除浏览器自动化特征的JS
|
│ └── stealth.min.js # 去除浏览器自动化特征的JS
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
# account_config.py
|
# account_config.py
|
||||||
|
import os
|
||||||
|
|
||||||
PHONE_LIST = [
|
PHONE_LIST = [
|
||||||
"13012345671",
|
"13012345671",
|
||||||
|
@ -22,6 +23,5 @@ IP_PROXY_LIST = [
|
||||||
]
|
]
|
||||||
|
|
||||||
IP_PROXY_PROTOCOL = "http://"
|
IP_PROXY_PROTOCOL = "http://"
|
||||||
IP_PROXY_USER = "xxxx"
|
IP_PROXY_USER = os.getenv("IP_PROXY_USER", "test")
|
||||||
IP_PROXY_PASSWORD = "xxxx"
|
IP_PROXY_PASSWORD = os.getenv("IP_PROXY_PASSWORD", "123456")
|
||||||
|
|
||||||
|
|
|
@ -1,9 +1,12 @@
|
||||||
|
import os
|
||||||
|
|
||||||
# redis config
|
# redis config
|
||||||
REDIS_DB_HOST = "redis://127.0.0.1" # your redis host
|
REDIS_DB_HOST = "127.0.0.1" # your redis host
|
||||||
REDIS_DB_PWD = "123456" # your redis password
|
REDIS_DB_PWD = os.getenv("REDIS_DB_PWD", "123456") # your redis password
|
||||||
|
|
||||||
# mysql config
|
# mysql config
|
||||||
RELATION_DB_URL = "mysql://root:youdbpassword@localhost:3306/media_crawler"
|
RELATION_DB_PWD = os.getenv("RELATION_DB_PWD", "123456") # your relation db password
|
||||||
|
RELATION_DB_URL = f"mysql://root:{RELATION_DB_PWD}@localhost:3306/media_crawler"
|
||||||
|
|
||||||
# save data to database option
|
# save data to database option
|
||||||
IS_SAVED_DATABASED = True # if you want to save data to database, set True
|
IS_SAVED_DATABASED = True # if you want to save data to database, set True
|
||||||
|
|
|
@ -3,7 +3,7 @@ import functools
|
||||||
import sys
|
import sys
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
import aioredis
|
import redis
|
||||||
from playwright.async_api import BrowserContext, Page
|
from playwright.async_api import BrowserContext, Page
|
||||||
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
|
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
|
||||||
from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
|
from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
|
||||||
|
@ -121,20 +121,19 @@ class DouYinLogin(AbstractLogin):
|
||||||
|
|
||||||
# 检查是否有滑动验证码
|
# 检查是否有滑动验证码
|
||||||
await self.check_page_display_slider(move_step=10, slider_level="easy")
|
await self.check_page_display_slider(move_step=10, slider_level="easy")
|
||||||
|
redis_obj = redis.Redis(host=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD)
|
||||||
redis_obj = aioredis.from_url(url=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD, decode_responses=True)
|
|
||||||
max_get_sms_code_time = 60 * 2 # 最长获取验证码的时间为2分钟
|
max_get_sms_code_time = 60 * 2 # 最长获取验证码的时间为2分钟
|
||||||
while max_get_sms_code_time > 0:
|
while max_get_sms_code_time > 0:
|
||||||
utils.logger.info(f"get douyin sms code from redis remaining time {max_get_sms_code_time}s ...")
|
utils.logger.info(f"get douyin sms code from redis remaining time {max_get_sms_code_time}s ...")
|
||||||
await asyncio.sleep(1)
|
await asyncio.sleep(1)
|
||||||
sms_code_key = f"dy_{self.login_phone}"
|
sms_code_key = f"dy_{self.login_phone}"
|
||||||
sms_code_value = await redis_obj.get(sms_code_key)
|
sms_code_value = redis_obj.get(sms_code_key)
|
||||||
if not sms_code_value:
|
if not sms_code_value:
|
||||||
max_get_sms_code_time -= 1
|
max_get_sms_code_time -= 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
sms_code_input_ele = self.context_page.locator("xpath=//input[@placeholder='请输入验证码']")
|
sms_code_input_ele = self.context_page.locator("xpath=//input[@placeholder='请输入验证码']")
|
||||||
await sms_code_input_ele.fill(value=sms_code_value)
|
await sms_code_input_ele.fill(value=sms_code_value.decode())
|
||||||
await asyncio.sleep(0.5)
|
await asyncio.sleep(0.5)
|
||||||
submit_btn_ele = self.context_page.locator("xpath=//button[@class='web-login-button']")
|
submit_btn_ele = self.context_page.locator("xpath=//button[@class='web-login-button']")
|
||||||
await submit_btn_ele.click() # 点击登录
|
await submit_btn_ele.click() # 点击登录
|
||||||
|
|
|
@ -3,7 +3,7 @@ import functools
|
||||||
import sys
|
import sys
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
import aioredis
|
import redis
|
||||||
from playwright.async_api import BrowserContext, Page
|
from playwright.async_api import BrowserContext, Page
|
||||||
from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
|
from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
|
||||||
wait_fixed)
|
wait_fixed)
|
||||||
|
@ -85,15 +85,14 @@ class XHSLogin(AbstractLogin):
|
||||||
await send_btn_ele.click() # 点击发送验证码
|
await send_btn_ele.click() # 点击发送验证码
|
||||||
sms_code_input_ele = await login_container_ele.query_selector("label.auth-code > input")
|
sms_code_input_ele = await login_container_ele.query_selector("label.auth-code > input")
|
||||||
submit_btn_ele = await login_container_ele.query_selector("div.input-container > button")
|
submit_btn_ele = await login_container_ele.query_selector("div.input-container > button")
|
||||||
|
redis_obj = redis.Redis(host=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD)
|
||||||
redis_obj = aioredis.from_url(url=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD, decode_responses=True)
|
|
||||||
max_get_sms_code_time = 60 * 2 # 最长获取验证码的时间为2分钟
|
max_get_sms_code_time = 60 * 2 # 最长获取验证码的时间为2分钟
|
||||||
no_logged_in_session = ""
|
no_logged_in_session = ""
|
||||||
while max_get_sms_code_time > 0:
|
while max_get_sms_code_time > 0:
|
||||||
utils.logger.info(f"get sms code from redis remaining time {max_get_sms_code_time}s ...")
|
utils.logger.info(f"get sms code from redis remaining time {max_get_sms_code_time}s ...")
|
||||||
await asyncio.sleep(1)
|
await asyncio.sleep(1)
|
||||||
sms_code_key = f"xhs_{self.login_phone}"
|
sms_code_key = f"xhs_{self.login_phone}"
|
||||||
sms_code_value = await redis_obj.get(sms_code_key)
|
sms_code_value = redis_obj.get(sms_code_key)
|
||||||
if not sms_code_value:
|
if not sms_code_value:
|
||||||
max_get_sms_code_time -= 1
|
max_get_sms_code_time -= 1
|
||||||
continue
|
continue
|
||||||
|
@ -102,7 +101,7 @@ class XHSLogin(AbstractLogin):
|
||||||
_, cookie_dict = utils.convert_cookies(current_cookie)
|
_, cookie_dict = utils.convert_cookies(current_cookie)
|
||||||
no_logged_in_session = cookie_dict.get("web_session")
|
no_logged_in_session = cookie_dict.get("web_session")
|
||||||
|
|
||||||
await sms_code_input_ele.fill(value=sms_code_value) # 输入短信验证码
|
await sms_code_input_ele.fill(value=sms_code_value.decode()) # 输入短信验证码
|
||||||
await asyncio.sleep(0.5)
|
await asyncio.sleep(0.5)
|
||||||
agree_privacy_ele = self.context_page.locator("xpath=//div[@class='agreements']//*[local-name()='svg']")
|
agree_privacy_ele = self.context_page.locator("xpath=//div[@class='agreements']//*[local-name()='svg']")
|
||||||
await agree_privacy_ele.click() # 点击同意隐私协议
|
await agree_privacy_ele.click() # 点击同意隐私协议
|
||||||
|
|
|
@ -4,7 +4,7 @@ import json
|
||||||
import re
|
import re
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
import aioredis
|
import redis
|
||||||
import tornado.web
|
import tornado.web
|
||||||
|
|
||||||
import config
|
import config
|
||||||
|
@ -47,7 +47,7 @@ class RecvSmsNotificationHandler(tornado.web.RequestHandler):
|
||||||
request_body = self.request.body.decode("utf-8")
|
request_body = self.request.body.decode("utf-8")
|
||||||
req_body_dict = json.loads(request_body)
|
req_body_dict = json.loads(request_body)
|
||||||
print("recv sms notification and body content: ", req_body_dict)
|
print("recv sms notification and body content: ", req_body_dict)
|
||||||
redis_obj = aioredis.from_url(url=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD, decode_responses=True)
|
redis_obj = redis.Redis(host=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD)
|
||||||
sms_content = req_body_dict.get("sms_content")
|
sms_content = req_body_dict.get("sms_content")
|
||||||
sms_code = extract_verification_code(sms_content)
|
sms_code = extract_verification_code(sms_content)
|
||||||
if sms_code:
|
if sms_code:
|
||||||
|
@ -55,7 +55,7 @@ class RecvSmsNotificationHandler(tornado.web.RequestHandler):
|
||||||
# Use Redis string data structure, in the following format:
|
# Use Redis string data structure, in the following format:
|
||||||
# xhs_138xxxxxxxx -> 171959
|
# xhs_138xxxxxxxx -> 171959
|
||||||
key = f"{req_body_dict.get('platform')}_{req_body_dict.get('current_number')}"
|
key = f"{req_body_dict.get('platform')}_{req_body_dict.get('current_number')}"
|
||||||
await redis_obj.set(name=key, value=sms_code, ex=60 * 3)
|
redis_obj.set(name=key, value=sms_code, ex=60 * 3)
|
||||||
self.set_status(200)
|
self.set_status(200)
|
||||||
self.write("ok")
|
self.write("ok")
|
||||||
|
|
||||||
|
|
|
@ -1,10 +1,11 @@
|
||||||
httpx==0.24.0
|
httpx==0.24.0
|
||||||
Pillow==9.5.0
|
Pillow==9.5.0
|
||||||
playwright==1.33.0
|
playwright==1.33.0
|
||||||
aioredis==2.0.1
|
|
||||||
tenacity==8.2.2
|
tenacity==8.2.2
|
||||||
tornado==6.3.2
|
tornado==6.3.2
|
||||||
PyExecJS==1.5.1
|
PyExecJS==1.5.1
|
||||||
opencv-python==4.7.0.72
|
opencv-python==4.7.0.72
|
||||||
tortoise-orm[asyncmy]==0.19.3
|
tortoise-orm[asyncmy]==0.19.3
|
||||||
aerich==0.7.2
|
aerich==0.7.2
|
||||||
|
numpy~=1.24.4
|
||||||
|
redis~=4.6.0
|
Loading…
Reference in New Issue