开始抓取数据
This commit is contained in:
parent
7a8610cd0c
commit
c8f21183f6
|
@ -19,18 +19,19 @@
|
|||
| 链接地址 | 链接名称 |
|
||||
|---------------------------|----------------------|
|
||||
| https://audiences.me/ | Audiences【观众/奥迪】 |
|
||||
| https://ptchdbits.co/ | CHDBits【新岛/金钱岛】 |
|
||||
| https://hdchina.org/ | HDChina【瓷器】 |
|
||||
| http://hdhome.org/ | HDHome【家园】 |
|
||||
| https://hdsky.me/ | HDSky【高清天空】 |
|
||||
| https://hhanclub.top/ | HHanClub【憨憨】 |
|
||||
| https://pt.keepfrds.com/ | KeepFRDS【朋友/月月】 |
|
||||
| https://kp.m-team.cc/ | M-Team【馒头】 |
|
||||
| https://open.cd/ | OpenCD【皇后】 |
|
||||
| https://ourbits.club/ | OurBits【我堡】 |
|
||||
| https://ptchdbits.co/ | CHDBits【新岛/金钱岛】 |
|
||||
| https://pterclub.com/ | PTerClub【PT之友俱乐部/猫站】 |
|
||||
| https://springsunday.net/ | SSD【春天/不可说】 |
|
||||
| https://totheglory.im/ | TTG【听听歌/套套哥】 |
|
||||
| https://kp.m-team.cc/ | M-Team【馒头】 |
|
||||
|
||||
|
||||
## nas-tools 认证站点
|
||||
| 链接地址 | 链接名称 | 备注 |
|
||||
|
|
|
@ -7,4 +7,97 @@
|
|||
5. 数据入库
|
||||
|
||||
数据如何展示呢??
|
||||
"""
|
||||
"""
|
||||
import sys
|
||||
import time
|
||||
|
||||
import requests
|
||||
import toml
|
||||
from bs4 import BeautifulSoup
|
||||
from loguru import logger
|
||||
|
||||
|
||||
class PtGetData:
    """Download and parse pages from the PT sites listed in a TOML config file.

    Every top-level table of the TOML file describes one site.  The keys this
    class reads from a table are ``url``, ``cookie`` and ``flag``.
    """

    # Number of download attempts before a site is reported as unreachable.
    MAX_ATTEMPTS = 5
    # Pause between two failed attempts, in seconds.
    RETRY_DELAY = 2
    # Per-request timeout handed to ``requests.get``, in seconds.
    REQUEST_TIMEOUT = 5 * 60

    def __init__(self):
        """Register the log sinks and set the config path and base headers."""
        # NOTE(review): loguru ships with a default stderr sink; unless it is
        # removed elsewhere, the second sink below duplicates console output,
        # and every new instance registers both sinks again -- confirm.
        logger.add("../log/PtGetData_{time:YYYY-MM-DD}.log", rotation="1 day", level="INFO")
        logger.add(sys.stderr, level="INFO")
        self.toml_file = 'PT/pt_config.toml'
        # Browser-like base headers shared by all sites.  Site-specific values
        # (the cookie) are added per request by ``_build_headers`` and are
        # never written back into this dict.
        self.headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'accept-language': 'zh,zh-CN;q=0.9',
            'cache-control': 'max-age=0',
            'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"macOS"',
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'same-origin',
            'sec-fetch-user': '?1',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        }

    def _build_headers(self, section_data):
        """Return a fresh header dict for one site.

        The base headers are copied so that one site's cookie can never be
        sent to another site.  The cookie is only added when the section
        provides a non-blank string; surrounding whitespace is stripped.
        """
        headers = dict(self.headers)
        cookie = section_data.get('cookie')
        if isinstance(cookie, str) and cookie.strip():
            headers["cookie"] = cookie.strip()
        return headers

    @staticmethod
    def _join_url(base_url, uri):
        """Join a site base URL and a path with exactly one slash between them."""
        return f"{base_url.rstrip('/')}/{uri.lstrip('/')}"

    def get_data(self, section_name, section_data):
        """Fetch ``/torrents.php`` of one site and print its torrents table.

        Args:
            section_name: Name of the TOML section, used in log messages.
            section_data: The section's table (``url`` and ``cookie`` are read).

        Returns:
            None.  Nothing is printed when the page could not be downloaded.
        """
        html = self.get_website_html(uri="/torrents.php", section_name=section_name, section_data=section_data)
        if not html:
            return
        try:
            soup = BeautifulSoup(html, 'html.parser')
            torrents_table = soup.find('table', class_='torrents')
            print(torrents_table)
        except Exception:
            # Keep the crawl going for the remaining sites, but record the
            # traceback instead of hiding the cause.
            logger.exception(f"{section_name} , 页面无法解析,请知晓!!!")

    def get_type(self, section_name, section_data):
        """Fetch ``/getrss.php`` of one site and parse it.

        The parsed document is currently discarded; no category data is
        extracted yet.

        Args:
            section_name: Name of the TOML section, used in log messages.
            section_data: The section's table (``url`` and ``cookie`` are read).

        Returns:
            None.
        """
        html = self.get_website_html(uri="/getrss.php", section_name=section_name, section_data=section_data)
        if not html:
            return
        try:
            # TODO: extract the categories; for now this only checks that the
            # page can be parsed.
            BeautifulSoup(html, 'html.parser')
        except Exception:
            logger.exception(f"{section_name} , 页面无法解析,请知晓!!!")

    def get_website_html(self, uri, section_name, section_data):
        """Download one page of a site, retrying on request errors.

        Args:
            uri: Path to request, appended to the section's ``url``.
            section_name: Name of the TOML section, used in log messages.
            section_data: The section's table (``url`` and ``cookie`` are read).

        Returns:
            The response body on HTTP 200.  An empty string when the section
            has no ``url``, when the server answers with any other status
            (no retry in that case), or when every attempt raised.
        """
        base_url = section_data.get('url')
        if not base_url:
            logger.error(f"{section_name} , 配置缺少 url,无法访问!!!")
            return ""

        url = self._join_url(base_url, uri)
        headers = self._build_headers(section_data)

        for attempt in range(1, self.MAX_ATTEMPTS + 1):
            try:
                response = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
            except Exception as e:
                # Best-effort crawl: any failure of the request is retried,
                # but it is logged so the cause is not lost.
                logger.warning(f"{section_name} , 第 {attempt} 次请求失败:{e}")
                if attempt < self.MAX_ATTEMPTS:
                    time.sleep(self.RETRY_DELAY)
                continue

            if response.status_code == 200:
                return response.text
            logger.error(f"{section_name} , 出现错误,code码是:{response.status_code}, {response.text}!!!")
            return ""

        logger.error(f"{section_name} , 5次出现错误,无法访问!!!")
        return ""

    def opt(self):
        """Load the TOML config and fetch data for every enabled section.

        A section is processed unless its ``flag`` equals 1.  A missing or
        malformed config file is reported on stdout and ends the run.
        """
        # Only the config loading is guarded, so an error raised while
        # crawling is not misreported as a config problem.
        try:
            with open(self.toml_file, 'r', encoding='utf-8') as file:
                config_data = toml.load(file)
        except FileNotFoundError:
            print(f"Error: The file '{self.toml_file}' was not found.")
            return
        except toml.TomlDecodeError as e:
            print(f"Error decoding TOML: {e}")
            return

        # Iterate over every section of the config.
        for section_name, section_data in config_data.items():
            if not isinstance(section_data, dict):
                # A top-level scalar is not a site description; skip it.
                continue
            print(f"Processing section: {section_name} --- {section_data.get('url')}")
            if section_data.get('flag') != 1:
                # Pull the data for this site.
                self.get_data(section_name, section_data)
|
||||
|
|
|
@ -0,0 +1,159 @@
|
|||
<body>
|
||||
<table class="head" cellspacing="0" cellpadding="0" align="center">..</table>
|
||||
<table class="mainouter" width="982" cellspacing="0" cellpadding="5" align="center">
|
||||
<tbody>
|
||||
<tr></tr>
|
||||
<tr>
|
||||
<td id="outer" align="center" class="outer" style="padding-top: 20px; padding-bottom: 20px">
|
||||
<div align="center" style="margin-bottom: 10px" id="ad belownav"></div>
|
||||
<table width="1080" class="main" border="0" cellspacing="0" cellpadding="0">
|
||||
<tbody>
|
||||
<tr>
|
||||
<td class="embedded">
|
||||
<form method="get" name="searchbox" action="?">.</form>
|
||||
<div align="center" style="margin-top: 10px" id="ad belowsearchbox"></div>
|
||||
<p align="center">.</p>
|
||||
<p align="center">.</p>
|
||||
<table class="torrents" cellspacing="0" cellpadding="5" width="100%">
|
||||
<tr>
|
||||
<td class="colhead" style="padding: 0px">类型</td>
|
||||
<td class="colhead">
|
||||
<a href="?sort=1&type=asc">标题</a>
|
||||
</td>
|
||||
<td class="colhead">
|
||||
<a href="?sort=3&type=desc">
|
||||
<img class="comments" src="pic/trans.gif" alt="comments" title="评论数" />
|
||||
</a>
|
||||
</td>
|
||||
<td class="colhead">
|
||||
<a href="?sort=4&type=desc">
|
||||
<img class="time" src="pic/trans.gif" alt="time" title="存活时间" />
|
||||
</a>
|
||||
</td>
|
||||
<td class="colhead">
|
||||
<a href="?sort=5&type=desc">
|
||||
<img class="size" src="pic/trans.gif" alt="size" title="大小" />
|
||||
</a>
|
||||
</td>
|
||||
<td class="colhead">
|
||||
<a href="?sort=7&type=desc">
|
||||
<img class="seeders" src="pic/trans.gif" alt="seeders" title="种子数" />
|
||||
</a>
|
||||
</td>
|
||||
<td class="colhead">
|
||||
<a href="?sort=8&type=desc">
|
||||
<img class="leechers" src="pic/trans.gif" alt="leechers" title="下载数" />
|
||||
</a>
|
||||
</td>
|
||||
<td class="colhead">
|
||||
<a href="?sort=6&type=desc">
|
||||
<img class="snatched" src="pic/trans.gif" alt="snatched" title="完成数" />
|
||||
</a>
|
||||
</td>
|
||||
<td class="colhead">
|
||||
<a href="?sort=9&type=desc">发布者</a>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="rowfollow nowrap" valign="middle" style='padding: 0px'>
|
||||
<a href="?cat=405">
|
||||
<img class="c_movie" src="pic/cattrans.gif" alt="电影/Movies" title="电影/Movies" style="background-image: url(pic/category/chd/scenetorrents/chs/catsprites.png);" />
|
||||
</a>
|
||||
<img src="pic/cattrans.gif" style="background-image: url(pic/category/chd/scenetorrents/chs/additional/encode.png);" alt="encode" title="encode" />
|
||||
</td>
|
||||
<td class="rowfollow" width="100%" align="left">
|
||||
<table class="torrentname" width="100%">
|
||||
<tr>
|
||||
<td class="embedded">
|
||||
<img class="top0" src="pic/trans.gif" alt="Sticky" title="一级置顶" />
|
||||
|
||||
<a title="The Man in the Iron Mask 1998 Blu-ray 1080p x264 DTS-HD MA 5.1-BtsHD" href="details.php?id=130681&hit=1">
|
||||
<b>The Man in the Iron Mask 1998 Blu-ray 1080p x264 DTS-HD MA 5.1-BtsHD</b>
|
||||
</a>
|
||||
<b>
|
||||
[<font class='hot'>热门</font>
|
||||
]
|
||||
</b>
|
||||
<img class="pro_free" src="pic/trans.gif" alt="Free" onmouseover="domTT_activate(this, event, 'content', '<b><font class="free">免费</font></b>剩余时间:<b><span title="2024-01-18 22:24:08">3天7时</span></b>', 'trail', false, 'delay',500,'lifetime',3000,'fade','both','styleClass','niceTitle', 'fadeMax',87, 'maxWidth', 300);" />
|
||||
(
|
||||
<font color='#0000FF'>
|
||||
剩余时间:<span title="2024-01-18 22:24:08">3天7时</span>
|
||||
</font>
|
||||
)
|
||||
<br />
|
||||
<span class="label label-primary">官方</span>
|
||||
铁面人
|
||||
<div class="progressarea" title="做种中">
|
||||
<img src="styles/progress_seeding.gif" />
|
||||
<div class="progress">
|
||||
<div class="progress_seeding" style="width:100%;"></div>
|
||||
</div>
|
||||
</div>
|
||||
</td>
|
||||
<td width="50" class="embedded" style="text-align: left; " valign="middle">
|
||||
<div>
|
||||
<img src="pic/icon-douban.png" style='padding-bottom: 2px;vertical-align:middle;height: 16px;' alt="豆瓣评分" title="豆瓣评分" /> 无
|
||||
</div>
|
||||
<div>
|
||||
<img src="pic/icon-imdb.png" style='padding-bottom: 2px;vertical-align:middle;height: 16px;' alt="IMDB评分" title="IMDB评分" /> 无
|
||||
</div>
|
||||
</td>
|
||||
<td width="20" class="embedded" style="text-align: right; " valign="middle">
|
||||
<a href="download.php?id=130681">
|
||||
<img class="download" src="pic/trans.gif" style='padding-bottom: 2px;' alt="download" title="下载本种" />
|
||||
</a>
|
||||
<br />
|
||||
<a id="bookmark0" href="javascript: bookmark(130681,0);">
|
||||
<img class="delbookmark" src="pic/trans.gif" alt="Unbookmarked" title="收藏" />
|
||||
</a>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
</td>
|
||||
<td class="rowfollow">
|
||||
<a href="comment.php?action=add&pid=130681&type=torrent" title="添加评论">0</a>
|
||||
</td>
|
||||
<td class="rowfollow nowrap">
|
||||
<span title="2024-01-13 22:24:08">
|
||||
1天
|
||||
<br />16时
|
||||
</span>
|
||||
</td>
|
||||
<td class="rowfollow">
|
||||
11.17
|
||||
<br />GB
|
||||
</td>
|
||||
<td class="rowfollow" align="center">
|
||||
<b>
|
||||
<a href="details.php?id=130681&hit=1&dllist=1#seeders">186</a>
|
||||
</b>
|
||||
</td>
|
||||
<td class="rowfollow">
|
||||
<b>
|
||||
<a href="details.php?id=130681&hit=1&dllist=1#leechers">2</a>
|
||||
</b>
|
||||
</td>
|
||||
<td class="rowfollow">
|
||||
<a href="viewsnatches.php?id=130681">
|
||||
<b>246</b>
|
||||
</a>
|
||||
</td>
|
||||
<td class="rowfollow">
|
||||
<i>匿名</i>
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
</table>
|
||||
<p align="center">.</p>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<div id="footer"></div>
|
||||
<span id="pinbox-extension-installed">
|
||||
</span>
|
||||
</body>
|
128
PT/test.py
128
PT/test.py
|
@ -1,53 +1,91 @@
|
|||
import json
|
||||
import time
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import toml
|
||||
from lxml import html as lhtml
|
||||
|
||||
headers = {
|
||||
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
|
||||
'accept-language': 'zh,zh-CN;q=0.9',
|
||||
'cache-control': 'max-age=0',
|
||||
'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
|
||||
'sec-ch-ua-mobile': '?0',
|
||||
'sec-ch-ua-platform': '"macOS"',
|
||||
'sec-fetch-dest': 'document',
|
||||
'sec-fetch-mode': 'navigate',
|
||||
'sec-fetch-site': 'same-origin',
|
||||
'sec-fetch-user': '?1',
|
||||
'upgrade-insecure-requests': '1',
|
||||
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
|
||||
}
|
||||
url = "https://sharkpt.net/signup.php"
|
||||
|
||||
|
||||
response_result = requests.get(url, headers=headers)
|
||||
print(response_result.status_code)
|
||||
print(response_result.text)
|
||||
|
||||
|
||||
def flaresolverr_get(url, text):
|
||||
# headers = {
|
||||
# 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
|
||||
# 'accept-language': 'zh,zh-CN;q=0.9',
|
||||
# 'cache-control': 'max-age=0',
|
||||
# 'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
|
||||
# 'sec-ch-ua-mobile': '?0',
|
||||
# 'sec-ch-ua-platform': '"macOS"',
|
||||
# 'sec-fetch-dest': 'document',
|
||||
# 'sec-fetch-mode': 'navigate',
|
||||
# 'sec-fetch-site': 'same-origin',
|
||||
# 'sec-fetch-user': '?1',
|
||||
# 'upgrade-insecure-requests': '1',
|
||||
# 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
|
||||
# }
|
||||
# with open("pt_config.toml", 'r', encoding='utf-8') as file:
|
||||
# config_data = toml.load(file)
|
||||
# # 迭代每个 section
|
||||
# for section_name, section_data in config_data.items():
|
||||
# print(f"Processing section: {section_name} --- {section_data.get('url')}")
|
||||
# url, cookie, flag = section_data.get('url'), section_data.get('cookie'), section_data.get('flag')
|
||||
# if flag != 1 and cookie is not None and len(cookie.strip()) > 0:
|
||||
# headers["cookie"] = cookie
|
||||
# url = url + "/torrents.php"
|
||||
# html = ""
|
||||
# for _ in range(5):
|
||||
# try:
|
||||
# response = requests.get(url, headers=headers, timeout=5 * 60)
|
||||
# if response.status_code == 200:
|
||||
# html = response.text
|
||||
# break
|
||||
# else:
|
||||
# print(f"{section_name} , 出现错误,code码是:{response.status_code}, {response.text}!!!")
|
||||
# break
|
||||
# except Exception as e:
|
||||
# time.sleep(2)
|
||||
# else:
|
||||
# print(f"{section_name} , 5次出现错误,无法访问!!!")
|
||||
# if len(html) == 0:
|
||||
# break
|
||||
with open('/Users/renmeng/Downloads/test.html', 'r', encoding='utf-8') as file:
|
||||
html_code = file.read()
|
||||
try:
|
||||
flaresolverr_url = "http://152.136.50.100:7024/v1"
|
||||
payload = json.dumps({
|
||||
"cmd": "request.get",
|
||||
"url": url,
|
||||
"maxTimeout": 5 * 1000 * 60
|
||||
})
|
||||
headers = {
|
||||
'Content-Type': 'application/json'
|
||||
}
|
||||
response = requests.post(flaresolverr_url, headers=headers, data=payload)
|
||||
res = json.loads(response.text)
|
||||
print(res)
|
||||
if res['status'] == 'ok' and res['solution']['status'] == 200:
|
||||
print(f"最终耗时:{(res['endTimestamp'] - res['startTimestamp'])/ 1000 / 60:.2f} 分钟")
|
||||
elif res['status'] == 'error':
|
||||
print(f"{text} , 访问返回 {res['message']} !!!")
|
||||
return ""
|
||||
# 使用lxml解析HTML
|
||||
doc = lhtml.fromstring(html_code)
|
||||
# 使用XPath获取目标元素
|
||||
title_elements = doc.xpath('//table[@class="torrents"]//table[@class="torrentname"]//a[@title]/@title')
|
||||
print(title_elements[0])
|
||||
# 详情链接地址
|
||||
details_link = doc.xpath('//table[@class="torrents"]//table[@class="torrentname"]//a[@href]/@href')
|
||||
print(f"/{details_link[0]}")
|
||||
|
||||
|
||||
except Exception as e:
|
||||
print("出现错误!!!")
|
||||
print(f"页面无法解析,请知晓!!!")
|
||||
|
||||
"""
|
||||
主键id,来源名称,一级标题,二级标题,种子状态,状态剩余时间,做种状态,评论数,做种数,下载数,完成数,发布者,豆瓣评分,IMDB评分,下载链接,详情链接,资源大小
|
||||
CREATE TABLE IF NOT EXISTS pt_website_data (
|
||||
id INTEGER PRIMARY KEY,
|
||||
source_name TEXT NOT NULL,
|
||||
first_title TEXT NOT NULL,
|
||||
second_title TEXT NOT NULL,
|
||||
seed_status TEXT,
|
||||
status_remaining_time TEXT,
|
||||
seeding_status TEXT,
|
||||
comment_count INTEGER,
|
||||
seed_count INTEGER,
|
||||
download_count INTEGER,
|
||||
completion_count INTEGER,
|
||||
publisher TEXT,
|
||||
douban_rating REAL,
|
||||
imdb_rating REAL,
|
||||
download_link TEXT,
|
||||
details_link TEXT,
|
||||
size TEXT,
|
||||
UNIQUE(source_name, first_title, second_title)
|
||||
);
|
||||
|
||||
# flaresolverr_get(url, "ourbits")
|
||||
|
||||
CREATE TABLE IF NOT EXISTS pt_website_type (
|
||||
id INTEGER PRIMARY KEY,
|
||||
source_name TEXT NOT NULL,
|
||||
type_name TEXT NOT NULL,
|
||||
type_url TEXT NOT NULL
|
||||
);
|
||||
"""
|
Loading…
Reference in New Issue