开始抓取数据

This commit is contained in:
rm 2024-01-15 18:11:58 +08:00
parent 7a8610cd0c
commit c8f21183f6
4 changed files with 339 additions and 48 deletions

View File

@ -19,18 +19,19 @@
| 链接地址 | 链接名称 |
|---------------------------|----------------------|
| https://audiences.me/ | Audiences【观众/奥迪】 |
| https://ptchdbits.co/ | CHDBits【新岛/金钱岛】 |
| https://hdchina.org/ | HDChina【瓷器】 |
| http://hdhome.org/ | HDHome【家园】 |
| https://hdsky.me/ | HDSky【高清天空】 |
| https://hhanclub.top/ | HHanClub【憨憨】 |
| https://pt.keepfrds.com/ | KeepFRDS【朋友/月月】 |
| https://kp.m-team.cc/ | M-Team【馒头】 |
| https://open.cd/ | OpenCD【皇后】 |
| https://ourbits.club/ | OurBits【我堡】 |
| https://ptchdbits.co/ | CHDBits【新岛/金钱岛】 |
| https://pterclub.com/ | PTerClub【PT之友俱乐部/猫站】 |
| https://springsunday.net/ | SSD【春天/不可说】 |
| https://totheglory.im/ | TTG【听听歌/套套哥】 |
| https://kp.m-team.cc/ | M-Team【馒头】 |
## nas-tools 认证站点
| 链接地址 | 链接名称 | 备注 |

View File

@ -7,4 +7,97 @@
5. 数据入库
数据如何展示呢
"""
"""
import sys
import time
import requests
import toml
from bs4 import BeautifulSoup
from loguru import logger
class PtGetData:
    """Fetch listing pages from the PT sites described in a TOML config file.

    Every top-level table of the TOML file is treated as one site. The keys
    read from a site table are ``url``, ``cookie`` and ``flag``.
    """

    def __init__(self):
        # One log file per day plus a stderr sink, both at INFO level.
        # NOTE(review): loguru sinks are process-wide, so each new instance
        # registers another pair of sinks -- confirm only one instance is built.
        logger.add("../log/PtGetData_{time:YYYY-MM-DD}.log", rotation="1 day", level="INFO")
        logger.add(sys.stderr, level="INFO")
        # Path is relative to the current working directory.
        self.toml_file = 'PT/pt_config.toml'
        # Browser-like base headers. These are never mutated per site; the
        # per-request headers are derived from them in _build_headers().
        self.headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'accept-language': 'zh,zh-CN;q=0.9',
            'cache-control': 'max-age=0',
            'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"macOS"',
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'same-origin',
            'sec-fetch-user': '?1',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        }

    def _build_headers(self, cookie):
        """Return a fresh copy of the base headers, adding ``cookie`` if non-blank.

        Working on a copy keeps one site's cookie from being sent to the next
        site, which happened when the cookie was stored on ``self.headers``.
        """
        headers = dict(self.headers)
        if isinstance(cookie, str) and cookie.strip():
            headers["cookie"] = cookie
        return headers

    def get_data(self, section_name, section_data):
        """Download ``/torrents.php`` for one site and print its torrents table.

        Args:
            section_name: Name of the TOML table, used in log messages.
            section_data: Mapping holding the site's ``url`` and ``cookie``.

        Returns:
            None. Nothing is printed when the page could not be downloaded.
        """
        html = self.get_website_html(uri="/torrents.php", section_name=section_name, section_data=section_data)
        if not html:
            return
        try:
            soup = BeautifulSoup(html, 'html.parser')
            torrents_table = soup.find('table', class_='torrents')
            print(torrents_table)
        except Exception:
            # logger.exception keeps the original message and adds the traceback.
            logger.exception(f"{section_name} , 页面无法解析,请知晓!!!")

    def get_type(self, section_name, section_data):
        """Download ``/getrss.php`` for one site and parse it.

        The parsed document is not used yet; only parse failures are logged.

        Args:
            section_name: Name of the TOML table, used in log messages.
            section_data: Mapping holding the site's ``url`` and ``cookie``.

        Returns:
            None.
        """
        html = self.get_website_html(uri="/getrss.php", section_name=section_name, section_data=section_data)
        if not html:
            return
        try:
            BeautifulSoup(html, 'html.parser')
        except Exception:
            logger.exception(f"{section_name} , 页面无法解析,请知晓!!!")

    def get_website_html(self, uri, section_name, section_data):
        """GET ``<site url><uri>`` and return the response body.

        Connection-level failures are retried up to five times, two seconds
        apart. A non-200 response is logged and not retried.

        Args:
            uri: Path appended to the site's base URL, e.g. ``"/torrents.php"``.
            section_name: Name of the TOML table, used in log messages.
            section_data: Mapping holding the site's ``url`` and ``cookie``.

        Returns:
            The response text on HTTP 200, otherwise an empty string.
        """
        base_url = section_data.get('url')
        if not base_url:
            # Without this guard ``None + uri`` raises TypeError in the caller.
            logger.error(f"{section_name} , 未配置url,无法访问")
            return ""
        headers = self._build_headers(section_data.get('cookie'))
        # Avoid "//torrents.php" when the configured URL ends with a slash.
        url = base_url.rstrip('/') + uri
        max_attempts = 5
        for attempt in range(1, max_attempts + 1):
            try:
                response = requests.get(url, headers=headers, timeout=5 * 60)
            except requests.RequestException as e:
                logger.warning(f"{section_name} , 第{attempt}次请求失败: {e}")
                # No point sleeping after the final attempt.
                if attempt < max_attempts:
                    time.sleep(2)
                continue
            if response.status_code == 200:
                return response.text
            logger.error(f"{section_name} , 出现错误code码是{response.status_code}, {response.text}")
            return ""
        logger.error(f"{section_name} , 5次出现错误无法访问")
        return ""

    def opt(self):
        """Load the TOML config and fetch data for every site whose flag is not 1.

        A missing or malformed config file is reported on stdout and ends the
        run without raising.
        """
        try:
            with open(self.toml_file, 'r', encoding='utf-8') as file:
                config_data = toml.load(file)
        except FileNotFoundError:
            print(f"Error: The file '{self.toml_file}' was not found.")
            return
        except toml.TomlDecodeError as e:
            print(f"Error decoding TOML: {e}")
            return
        # Iterate over every section (one TOML table per site).
        for section_name, section_data in config_data.items():
            if not isinstance(section_data, dict):
                # A bare top-level key is not a site table; skip it.
                continue
            print(f"Processing section: {section_name} --- {section_data.get('url')}")
            # NOTE(review): sections with flag == 1 are skipped; the meaning of
            # the flag is not visible here -- confirm against the config file.
            if section_data.get('flag') != 1:
                # Pull the listing data for this site.
                self.get_data(section_name, section_data)

159
PT/test.html Normal file
View File

@ -0,0 +1,159 @@
<body>
<table class="head" cellspacing="g" cellpadding="g" align="center">..</table>
<table class="mainouter" width="g82" cellspacing="g" cellpadding="5" align="center">
<tbody>
<tr></tr>
<tr>
<td id="outer" align="center" class="outer" style="padding-top: 20px; padding-bottom: 20px">
<div align="center" style="margin-bottom: 10px" id="ad belownav"></div>
<table width="1080" class="main" border="g" cellspacing="g" cellpadding="g">
<tbody>
<tr>
<td class="embedded">
<form method="get" name="searchbox" action="?">.</form>
<div align="center" style="margin-top: 10px" id="ad belowsearchbox"></div>
<p align="center">.</p>
<p align="center">.</p>
<table class="torrents" cellspacing="g" cellpadding="5" width="100%">
<tr>
<td class="colhead" style="padding: 0px">类型</td>
<td class="colhead">
<a href="?sort=1&amp;type=asc">标题</a>
</td>
<td class="colhead">
<a href="?sort=3&amp;type=desc">
<img class="comments" src="pic/trans.gif" alt="comments" title="评论数" />
</a>
</td>
<td class="colhead">
<a href="?sort=4&amp;type=desc">
<img class="time" src="pic/trans.gif" alt="time" title="存活时间" />
</a>
</td>
<td class="colhead">
<a href="?sort=5&amp;type=desc">
<img class="size" src="pic/trans.gif" alt="size" title="大小" />
</a>
</td>
<td class="colhead">
<a href="?sort=7&amp;type=desc">
<img class="seeders" src="pic/trans.gif" alt="seeders" title="种子数" />
</a>
</td>
<td class="colhead">
<a href="?sort=8&amp;type=desc">
<img class="leechers" src="pic/trans.gif" alt="leechers" title="下载数" />
</a>
</td>
<td class="colhead">
<a href="?sort=6&amp;type=desc">
<img class="snatched" src="pic/trans.gif" alt="snatched" title="完成数" />
</a>
</td>
<td class="colhead">
<a href="?sort=9&amp;type=desc">发布者</a>
</td>
</tr>
<tr>
<td class="rowfollow nowrap" valign="middle" style='padding: 0px'>
<a href="?cat=405">
<img class="c_movie" src="pic/cattrans.gif" alt="电影/Movies" title="电影/Movies" style="background-image: url(pic/category/chd/scenetorrents/chs/catsprites.png);" />
</a>
<img src="pic/cattrans.gif" style="background-image: url(pic/category/chd/scenetorrents/chs/additional/encode.png);" alt="encode" title="encode" />
</td>
<td class="rowfollow" width="100%" align="left">
<table class="torrentname" width="100%">
<tr>
<td class="embedded">
<img class="top0" src="pic/trans.gif" alt="Sticky" title="一级置顶" />
&nbsp;
<a title="The Man in the Iron Mask 1998 Blu-ray 1080p x264 DTS-HD MA 5.1-BtsHD" href="details.php?id=130681&amp;hit=1">
<b>The Man in the Iron Mask 1998 Blu-ray 1080p x264 DTS-HD MA 5.1-BtsHD</b>
</a>
<b>
[<font class='hot'>热门</font>
]
</b>
<img class="pro_free" src="pic/trans.gif" alt="Free" onmouseover="domTT_activate(this, event, 'content', '&lt;b&gt;&lt;font class=&quot;free&quot;&gt;免费&lt;/font&gt;&lt;/b&gt;剩余时间:&lt;b&gt;&lt;span title=&quot;2024-01-18 22:24:08&quot;&gt;3天7时&lt;/span&gt;&lt;/b&gt;', 'trail', false, 'delay',500,'lifetime',3000,'fade','both','styleClass','niceTitle', 'fadeMax',87, 'maxWidth', 300);" />
(
<font color='#0000FF'>
剩余时间:<span title="2024-01-18 22:24:08">3天7时</span>
</font>
)
<br />
<span class="label label-primary">官方</span>
铁面人
<div class="progressarea" title="做种中">
<img src="styles/progress_seeding.gif" />
<div class="progress">
<div class="progress_seeding" style="width:100%;"></div>
</div>
</div>
</td>
<td width="50" class="embedded" style="text-align: left; " valign="middle">
<div>
<img src="pic/icon-douban.png" style='padding-bottom: 2px;vertical-align:middle;height: 16px;' alt="豆瓣评分" title="豆瓣评分" />&nbsp;&nbsp;
</div>
<div>
<img src="pic/icon-imdb.png" style='padding-bottom: 2px;vertical-align:middle;height: 16px;' alt="IMDB评分" title="IMDB评分" />&nbsp;&nbsp;
</div>
</td>
<td width="20" class="embedded" style="text-align: right; " valign="middle">
<a href="download.php?id=130681">
<img class="download" src="pic/trans.gif" style='padding-bottom: 2px;' alt="download" title="下载本种" />
</a>
<br />
<a id="bookmark0" href="javascript: bookmark(130681,0);">
<img class="delbookmark" src="pic/trans.gif" alt="Unbookmarked" title="收藏" />
</a>
</td>
</tr>
</table>
</td>
<td class="rowfollow">
<a href="comment.php?action=add&amp;pid=130681&amp;type=torrent" title="添加评论">0</a>
</td>
<td class="rowfollow nowrap">
<span title="2024-01-13 22:24:08">
1天
<br />16时
</span>
</td>
<td class="rowfollow">
11.17
<br />GB
</td>
<td class="rowfollow" align="center">
<b>
<a href="details.php?id=130681&amp;hit=1&amp;dllist=1#seeders">186</a>
</b>
</td>
<td class="rowfollow">
<b>
<a href="details.php?id=130681&amp;hit=1&amp;dllist=1#leechers">2</a>
</b>
</td>
<td class="rowfollow">
<a href="viewsnatches.php?id=130681">
<b>246</b>
</a>
</td>
<td class="rowfollow">
<i>匿名</i>
</td>
</tr>
</table>
<p align="center">.</p>
</td>
</tr>
</tbody>
</table>
</td>
</tr>
</tbody>
</table>
<div id="footer"></div>
<span id="pinbox-extension-installed">
</span>
</body>

View File

@ -1,53 +1,91 @@
import json
import time
import requests
from bs4 import BeautifulSoup
import toml
from lxml import html as lhtml
# Browser-like request headers used for the probe request below.
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-language': 'zh,zh-CN;q=0.9',
    'cache-control': 'max-age=0',
    'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
}
# Fetch the sign-up page once and dump the status code and body to stdout.
url = "https://sharkpt.net/signup.php"
# requests applies no timeout by default, so an unresponsive host would block
# this module-level call forever; bound it explicitly.
response_result = requests.get(url, headers=headers, timeout=5 * 60)
print(response_result.status_code)
print(response_result.text)
def flaresolverr_get(url, text):
# Posts a "request.get" command for `url` to the hard-coded solver endpoint
# and prints the decoded JSON reply; also reads a locally saved HTML page and
# prints the first torrent title and detail link found in it via lxml XPath.
# `text` is only used as a label in the error message printed below.
#
# NOTE(review): indentation is missing in this view and the body appears to
# interleave two revisions of the function, so the nesting of the statements
# below (in particular where `return ""` sits) cannot be confirmed from here.
#
# --- disabled experiment: fetch /torrents.php for every configured site ---
# headers = {
# 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
# 'accept-language': 'zh,zh-CN;q=0.9',
# 'cache-control': 'max-age=0',
# 'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
# 'sec-ch-ua-mobile': '?0',
# 'sec-ch-ua-platform': '"macOS"',
# 'sec-fetch-dest': 'document',
# 'sec-fetch-mode': 'navigate',
# 'sec-fetch-site': 'same-origin',
# 'sec-fetch-user': '?1',
# 'upgrade-insecure-requests': '1',
# 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
# }
# with open("pt_config.toml", 'r', encoding='utf-8') as file:
# config_data = toml.load(file)
# # iterate over each section
# for section_name, section_data in config_data.items():
# print(f"Processing section: {section_name} --- {section_data.get('url')}")
# url, cookie, flag = section_data.get('url'), section_data.get('cookie'), section_data.get('flag')
# if flag != 1 and cookie is not None and len(cookie.strip()) > 0:
# headers["cookie"] = cookie
# url = url + "/torrents.php"
# html = ""
# for _ in range(5):
# try:
# response = requests.get(url, headers=headers, timeout=5 * 60)
# if response.status_code == 200:
# html = response.text
# break
# else:
# print(f"{section_name} , 出现错误code码是{response.status_code}, {response.text}")
# break
# except Exception as e:
# time.sleep(2)
# else:
# print(f"{section_name} , 5次出现错误无法访问")
# if len(html) == 0:
# break
# --- end of disabled experiment ---
# Reads a saved page from an absolute, machine-specific path. This statement
# precedes the `try`, so a missing file is not covered by the handler below.
with open('/Users/renmeng/Downloads/test.html', 'r', encoding='utf-8') as file:
html_code = file.read()
try:
flaresolverr_url = "http://152.136.50.100:7024/v1"
# maxTimeout is 5 * 1000 * 60 = 300000; presumably milliseconds (5 minutes)
# -- confirm against the solver's API.
payload = json.dumps({
"cmd": "request.get",
"url": url,
"maxTimeout": 5 * 1000 * 60
})
# Headers for the JSON POST only; the browser-like headers are not sent here.
headers = {
'Content-Type': 'application/json'
}
# NOTE(review): no timeout is passed to requests.post, so this call can block
# indefinitely if the endpoint never answers.
response = requests.post(flaresolverr_url, headers=headers, data=payload)
res = json.loads(response.text)
print(res)
if res['status'] == 'ok' and res['solution']['status'] == 200:
# Elapsed time from the reply's start/end timestamps, divided by 1000 and
# then 60, and labelled as minutes in the message.
print(f"最终耗时:{(res['endTimestamp'] - res['startTimestamp'])/ 1000 / 60:.2f} 分钟")
elif res['status'] == 'error':
print(f"{text} , 访问返回 {res['message']} ")
# NOTE(review): if this return is unconditional, the lxml parsing below is
# unreachable -- confirm the intended nesting.
return ""
# Parse the HTML with lxml
doc = lhtml.fromstring(html_code)
# Use XPath to select the target elements (title attribute of each torrent link)
title_elements = doc.xpath('//table[@class="torrents"]//table[@class="torrentname"]//a[@title]/@title')
# Indexing [0] raises IndexError when the XPath matches nothing; the broad
# handler below turns that into the generic messages.
print(title_elements[0])
# Detail-page link
details_link = doc.xpath('//table[@class="torrents"]//table[@class="torrentname"]//a[@href]/@href')
print(f"/{details_link[0]}")
except Exception as e:
# Any exception from the try body ends up here; the exception itself (e) is
# not printed, only the two fixed messages.
print("出现错误")
print(f"页面无法解析,请知晓!!!")
"""
主键id,来源名称,一级标题,二级标题,种子状态,状态剩余时间,做种状态,评论数,做种数,下载数,完成数,发布者,豆瓣评分,IMDB评分,下载链接,详情链接,资源大小
CREATE TABLE IF NOT EXISTS pt_website_data (
id INTEGER PRIMARY KEY,
source_name TEXT NOT NULL,
first_title TEXT NOT NULL,
second_title TEXT NOT NULL,
seed_status TEXT,
status_remaining_time TEXT,
seeding_status TEXT,
comment_count INTEGER,
seed_count INTEGER,
download_count INTEGER,
completion_count INTEGER,
publisher TEXT,
douban_rating REAL,
imdb_rating REAL,
download_link TEXT,
details_link TEXT,
size TEXT,
UNIQUE(source_name, first_title, second_title)
);
# flaresolverr_get(url, "ourbits")
CREATE TABLE IF NOT EXISTS pt_website_type (
id INTEGER PRIMARY KEY,
source_name TEXT NOT NULL,
type_name TEXT NOT NULL,
type_url TEXT NOT NULL
);
"""