From c8f21183f6bf1f84ea8701503bef8aa41adb3542 Mon Sep 17 00:00:00 2001 From: rm Date: Mon, 15 Jan 2024 18:11:58 +0800 Subject: [PATCH] =?UTF-8?q?=E5=BC=80=E5=A7=8B=E6=8A=93=E5=8F=96=E6=95=B0?= =?UTF-8?q?=E6=8D=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- PT/PT站点汇总.md | 5 +- PT/pt_get_data.py | 95 ++++++++++++++++++++++++++- PT/test.html | 159 ++++++++++++++++++++++++++++++++++++++++++++++ PT/test.py | 128 ++++++++++++++++++++++++------------- 4 files changed, 339 insertions(+), 48 deletions(-) create mode 100644 PT/test.html diff --git a/PT/PT站点汇总.md b/PT/PT站点汇总.md index 1ae80f7..cca4686 100644 --- a/PT/PT站点汇总.md +++ b/PT/PT站点汇总.md @@ -19,18 +19,19 @@ | 链接地址 | 链接名称 | |---------------------------|----------------------| | https://audiences.me/ | Audiences【观众/奥迪】 | -| https://ptchdbits.co/ | CHDBits【新岛/金钱岛】 | | https://hdchina.org/ | HDChina【瓷器】 | | http://hdhome.org/ | HDHome【家园】 | | https://hdsky.me/ | HDSky【高清天空】 | | https://hhanclub.top/ | HHanClub【憨憨】 | | https://pt.keepfrds.com/ | KeepFRDS【朋友/月月】 | -| https://kp.m-team.cc/ | M-Team【馒头】 | | https://open.cd/ | OpenCD【皇后】 | | https://ourbits.club/ | OurBits【我堡】 | +| https://ptchdbits.co/ | CHDBits【新岛/金钱岛】 | | https://pterclub.com/ | PTerClub【PT之友俱乐部/猫站】 | | https://springsunday.net/ | SSD【春天/不可说】 | | https://totheglory.im/ | TTG【听听歌/套套哥】 | +| https://kp.m-team.cc/ | M-Team【馒头】 | + ## nas-tools 认证站点 | 链接地址 | 链接名称 | 备注 | diff --git a/PT/pt_get_data.py b/PT/pt_get_data.py index 660f312..001a1b8 100644 --- a/PT/pt_get_data.py +++ b/PT/pt_get_data.py @@ -7,4 +7,97 @@ 5. 数据入库 数据如何展示呢?? -""" \ No newline at end of file +""" +import sys +import time + +import requests +import toml +from bs4 import BeautifulSoup +from loguru import logger + + +class PtGetData: + + def __init__(self): + logger.add("../log/PtGetData_{time:YYYY-MM-DD}.log", rotation="1 day", level="INFO") + logger.add(sys.stderr, level="INFO") + self.toml_file = 'PT/pt_config.toml' + self.headers = { + 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', + 'accept-language': 'zh,zh-CN;q=0.9', + 'cache-control': 'max-age=0', + 'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"', + 'sec-ch-ua-mobile': '?0', + 'sec-ch-ua-platform': '"macOS"', + 'sec-fetch-dest': 'document', + 'sec-fetch-mode': 'navigate', + 'sec-fetch-site': 'same-origin', + 'sec-fetch-user': '?1', + 'upgrade-insecure-requests': '1', + 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36', + } + + def get_data(self, section_name, section_data): + res_txt = f"开始对 [{section_name}] 进行操作...,抓取数据:" + url, cookie = section_data.get('url'), section_data.get('cookie') + if cookie is not None and len(cookie.strip()) > 0: + self.headers["cookie"] = cookie + html = self.get_website_html(uri="/torrents.php", section_name=section_name, section_data=section_data) + if len(html) == 0: + return + try: + soup = BeautifulSoup(html, 'html.parser') + torrents_table = soup.find('table', class_='torrents') + print(torrents_table) + except Exception as e: + logger.error(f"{section_name} , 页面无法解析,请知晓!!!") + + def get_type(self, section_name, section_data): + res_txt = f"开始对 [{section_name}] 进行操作...,抓取网站分类:" + url, cookie = section_data.get('url'), section_data.get('cookie') + if cookie is not None and len(cookie.strip()) > 0: + self.headers["cookie"] = cookie + html = self.get_website_html(uri="/getrss.php", section_name=section_name, section_data=section_data) + if len(html) == 0: + return + try: + soup = BeautifulSoup(html, 'html.parser') + except Exception as e: + logger.error(f"{section_name} , 页面无法解析,请知晓!!!") + + def get_website_html(self, uri, section_name, section_data): + url, cookie, attendance_uri = section_data.get('url'), section_data.get('cookie'), section_data.get( + 'attendance_uri') + # cookie不为空时候,可以签到 + url = url + uri + for _ in range(5): + try: + response = requests.get(url, headers=self.headers, timeout=5 * 60) + if response.status_code == 200: + return response.text + else: + logger.error(f"{section_name} , 出现错误,code码是:{response.status_code}, {response.text}!!!") + return "" + except Exception as e: + time.sleep(2) + else: + logger.error(f"{section_name} , 5次出现错误,无法访问!!!") + return "" + + +def opt(self): + try: + with open(self.toml_file, 'r', encoding='utf-8') as file: + config_data = toml.load(file) + # 迭代每个 section + for section_name, section_data in config_data.items(): + print(f"Processing section: {section_name} --- {section_data.get('url')}") + url, cookie, flag = section_data.get('url'), section_data.get('cookie'), section_data.get('flag') + if flag != 1: + # 拉取数据 + self.get_data(section_name, section_data) + except FileNotFoundError: + print(f"Error: The file '{self.toml_file}' was not found.") + except toml.TomlDecodeError as e: + print(f"Error decoding TOML: {e}") diff --git a/PT/test.html b/PT/test.html new file mode 100644 index 0000000..f03eceb --- /dev/null +++ b/PT/test.html @@ -0,0 +1,159 @@ + + ..
+ + + + + + + +
+
+ + + + + + +
+
.
+
+

.

+

.

+ + + + + + + + + + + + + + + + + + + + + + + + +
类型 + 标题 + + + comments + + + + time + + + + size + + + + seeders + + + + leechers + + + + snatched + + + 发布者 +
+ + 电影/Movies + + encode + + + + + + + +
+ Sticky +   + + The Man in the Iron Mask 1998 Blu-ray 1080p x264 DTS-HD MA 5.1-BtsHD + + + [热门 + ] + + Free + ( + + 剩余时间:3天7时 + + ) +
+ 官方 + 铁面人 +
+ +
+
+
+
+
+
+ 豆瓣评分  无 +
+
+ IMDB评分  无 +
+
+ + download + +
+ + Unbookmarked + +
+
+ 0 + + + 1天 +
16时 +
+
+ 11.17 +
GB +
+ + 186 + + + + 2 + + + + 246 + + + 匿名 +
+

.

+
+
+
+ + \ No newline at end of file diff --git a/PT/test.py b/PT/test.py index 225a599..ecf5758 100644 --- a/PT/test.py +++ b/PT/test.py @@ -1,53 +1,91 @@ -import json import time import requests -from bs4 import BeautifulSoup +import toml +from lxml import html as lhtml -headers = { - 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', - 'accept-language': 'zh,zh-CN;q=0.9', - 'cache-control': 'max-age=0', - 'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"', - 'sec-ch-ua-mobile': '?0', - 'sec-ch-ua-platform': '"macOS"', - 'sec-fetch-dest': 'document', - 'sec-fetch-mode': 'navigate', - 'sec-fetch-site': 'same-origin', - 'sec-fetch-user': '?1', - 'upgrade-insecure-requests': '1', - 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36', -} -url = "https://sharkpt.net/signup.php" - - -response_result = requests.get(url, headers=headers) -print(response_result.status_code) -print(response_result.text) - - -def flaresolverr_get(url, text): +# headers = { +# 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', +# 'accept-language': 'zh,zh-CN;q=0.9', +# 'cache-control': 'max-age=0', +# 'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"', +# 'sec-ch-ua-mobile': '?0', +# 'sec-ch-ua-platform': '"macOS"', +# 'sec-fetch-dest': 'document', +# 'sec-fetch-mode': 'navigate', +# 'sec-fetch-site': 'same-origin', +# 'sec-fetch-user': '?1', +# 'upgrade-insecure-requests': '1', +# 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36', +# } +# with open("pt_config.toml", 'r', encoding='utf-8') as file: +# config_data = toml.load(file) +# # 迭代每个 section +# for section_name, section_data in config_data.items(): +# print(f"Processing section: {section_name} --- {section_data.get('url')}") +# url, cookie, flag = section_data.get('url'), section_data.get('cookie'), section_data.get('flag') +# if flag != 1 and cookie is not None and len(cookie.strip()) > 0: +# headers["cookie"] = cookie +# url = url + "/torrents.php" +# html = "" +# for _ in range(5): +# try: +# response = requests.get(url, headers=headers, timeout=5 * 60) +# if response.status_code == 200: +# html = response.text +# break +# else: +# print(f"{section_name} , 出现错误,code码是:{response.status_code}, {response.text}!!!") +# break +# except Exception as e: +# time.sleep(2) +# else: +# print(f"{section_name} , 5次出现错误,无法访问!!!") +# if len(html) == 0: +# break +with open('/Users/renmeng/Downloads/test.html', 'r', encoding='utf-8') as file: + html_code = file.read() try: - flaresolverr_url = "http://152.136.50.100:7024/v1" - payload = json.dumps({ - "cmd": "request.get", - "url": url, - "maxTimeout": 5 * 1000 * 60 - }) - headers = { - 'Content-Type': 'application/json' - } - response = requests.post(flaresolverr_url, headers=headers, data=payload) - res = json.loads(response.text) - print(res) - if res['status'] == 'ok' and res['solution']['status'] == 200: - print(f"最终耗时:{(res['endTimestamp'] - res['startTimestamp'])/ 1000 / 60:.2f} 分钟") - elif res['status'] == 'error': - print(f"{text} , 访问返回 {res['message']} !!!") - return "" + # 使用lxml解析HTML + doc = lhtml.fromstring(html_code) + # 使用XPath获取目标元素 + title_elements = doc.xpath('//table[@class="torrents"]//table[@class="torrentname"]//a[@title]/@title') + print(title_elements[0]) + # 详情链接地址 + details_link = doc.xpath('//table[@class="torrents"]//table[@class="torrentname"]//a[@href]/@href') + print(f"/{details_link[0]}") + + except Exception as e: - print("出现错误!!!") + print(f"页面无法解析,请知晓!!!") +""" +主键id,来源名称,一级标题,二级标题,种子状态,状态剩余时间,做种状态,评论数,做种数,下载数,完成数,发布者,豆瓣评分,IMDB评分,下载链接,详情链接,资源大小 +CREATE TABLE IF NOT EXISTS pt_website_data ( + id INTEGER PRIMARY KEY, + source_name TEXT NOT NULL, + first_title TEXT NOT NULL, + second_title TEXT NOT NULL, + seed_status TEXT, + status_remaining_time TEXT, + seeding_status TEXT, + comment_count INTEGER, + seed_count INTEGER, + download_count INTEGER, + completion_count INTEGER, + publisher TEXT, + douban_rating REAL, + imdb_rating REAL, + download_link TEXT, + details_link TEXT, + size TEXT, + UNIQUE(source_name, first_title, second_title) +); -# flaresolverr_get(url, "ourbits") - +CREATE TABLE IF NOT EXISTS pt_website_type ( + id INTEGER PRIMARY KEY, + source_name TEXT NOT NULL, + type_name TEXT NOT NULL, + type_url TEXT NOT NULL +); +""" \ No newline at end of file