开始抓取数据

2024-01-15 18:11:58 +08:00 · 2024-01-15 18:11:58 +08:00 · c8f21183f6
parent 7a8610cd0c
commit c8f21183f6
4 changed files with 339 additions and 48 deletions
--- a/PT/PT站点汇总.md
+++ b/PT/PT站点汇总.md
@ -19,18 +19,19 @@
 | 链接地址                      | 链接名称                 |
 |---------------------------|----------------------|
 | https://audiences.me/     | Audiences【观众/奥迪】     |
-| https://ptchdbits.co/     | CHDBits【新岛/金钱岛】      |
 | https://hdchina.org/      | HDChina【瓷器】          |
 | http://hdhome.org/        | HDHome【家园】           |
 | https://hdsky.me/         | HDSky【高清天空】          |
 | https://hhanclub.top/     | HHanClub【憨憨】         |
 | https://pt.keepfrds.com/  | KeepFRDS【朋友/月月】      |
-| https://kp.m-team.cc/     | M-Team【馒头】           |
 | https://open.cd/          | OpenCD【皇后】           |
 | https://ourbits.club/     | OurBits【我堡】          |
+| https://ptchdbits.co/     | CHDBits【新岛/金钱岛】      |
 | https://pterclub.com/     | PTerClub【PT之友俱乐部/猫站】 |
 | https://springsunday.net/ | SSD【春天/不可说】          |
 | https://totheglory.im/    | TTG【听听歌/套套哥】         |
+| https://kp.m-team.cc/     | M-Team【馒头】           |
+

 ## nas-tools 认证站点
 | 链接地址                      | 链接名称             | 备注      |
--- a/PT/pt_get_data.py
+++ b/PT/pt_get_data.py
@ -7,4 +7,97 @@
 5. 数据入库

 数据如何展示呢？？
-"""
+"""
+import sys
+import time
+
+import requests
+import toml
+from bs4 import BeautifulSoup
+from loguru import logger
+
+
+class PtGetData:
+	"""Fetch pages from the sites described in the TOML config and parse them.
+
+	Each site is passed around as ``section_name`` (the config section's name)
+	and ``section_data`` (a mapping read with ``.get('url')`` and
+	``.get('cookie')``). Sections whose cookie is missing or blank are skipped.
+	"""
+
+	# Number of request attempts made before giving up on a page.
+	_MAX_ATTEMPTS = 5
+	# Pause between two failed attempts, in seconds.
+	_RETRY_DELAY = 2
+	# Timeout handed to requests.get, in seconds.
+	_REQUEST_TIMEOUT = 5 * 60
+
+	def __init__(self):
+		# Log INFO and above to a dated file (rotated daily) and to stderr.
+		logger.add("../log/PtGetData_{time:YYYY-MM-DD}.log", rotation="1 day", level="INFO")
+		logger.add(sys.stderr, level="INFO")
+		self.toml_file = 'PT/pt_config.toml'
+		# Browser-like request headers; a per-site "cookie" entry is added later.
+		self.headers = {
+			'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+			'accept-language': 'zh,zh-CN;q=0.9',
+			'cache-control': 'max-age=0',
+			'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
+			'sec-ch-ua-mobile': '?0',
+			'sec-ch-ua-platform': '"macOS"',
+			'sec-fetch-dest': 'document',
+			'sec-fetch-mode': 'navigate',
+			'sec-fetch-site': 'same-origin',
+			'sec-fetch-user': '?1',
+			'upgrade-insecure-requests': '1',
+			'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
+		}
+
+	def get_data(self, section_name, section_data):
+		"""Fetch ``/torrents.php`` for one site and print its torrents table.
+
+		Args:
+			section_name: Name of the config section, used in log messages.
+			section_data: Mapping providing ``url`` and ``cookie``.
+
+		Returns:
+			None. Nothing happens when the cookie is missing or blank, when no
+			HTML could be downloaded, or when the page cannot be parsed.
+		"""
+		soup = self._fetch_soup("/torrents.php", section_name, section_data)
+		if soup is None:
+			return
+		# The table is only printed for now; no rows are extracted from it.
+		print(soup.find('table', class_='torrents'))
+
+	def get_type(self, section_name, section_data):
+		"""Fetch ``/getrss.php`` for one site and parse it.
+
+		Takes the same arguments as ``get_data``. The parsed page is not used
+		for anything yet, so the method always returns None.
+		"""
+		self._fetch_soup("/getrss.php", section_name, section_data)
+
+	def _fetch_soup(self, uri, section_name, section_data):
+		"""Download ``uri`` from the site and return it parsed, or None.
+
+		None is returned when the section has no usable cookie, when the
+		download produced no HTML, or when parsing raised (which is logged).
+		"""
+		cookie = section_data.get('cookie')
+		if cookie is None or not cookie.strip():
+			return None
+		# Send this site's cookie with the request.
+		self.headers["cookie"] = cookie
+		html = self.get_website_html(uri=uri, section_name=section_name, section_data=section_data)
+		if not html:
+			return None
+		try:
+			return BeautifulSoup(html, 'html.parser')
+		except Exception:
+			# logger.exception attaches the traceback so the cause is not lost.
+			logger.exception(f"{section_name} , 页面无法解析，请知晓！！！")
+			return None
+
+	def get_website_html(self, uri, section_name, section_data):
+		"""GET ``section_data['url'] + uri`` and return the response body.
+
+		Only attempts that raise are retried, up to ``_MAX_ATTEMPTS`` in total.
+		A response with a status other than 200 is logged and ends the call
+		immediately.
+
+		Args:
+			uri: Path appended to the site's base URL, e.g. ``/torrents.php``.
+			section_name: Name of the config section, used in log messages.
+			section_data: Mapping providing ``url``.
+
+		Returns:
+			The response text on HTTP 200, otherwise an empty string.
+		"""
+		base_url = section_data.get('url')
+		if not base_url:
+			# Without a base URL the concatenation below would raise TypeError.
+			logger.error(f"{section_name} , 未配置 url，无法访问！！！")
+			return ""
+		url = base_url + uri
+		for attempt in range(1, self._MAX_ATTEMPTS + 1):
+			try:
+				response = requests.get(url, headers=self.headers, timeout=self._REQUEST_TIMEOUT)
+			except Exception as e:
+				# Best effort: record why the attempt failed, then retry.
+				logger.warning(f"{section_name} , 第 {attempt} 次请求失败：{e}")
+				if attempt < self._MAX_ATTEMPTS:
+					time.sleep(self._RETRY_DELAY)
+				continue
+			if response.status_code == 200:
+				return response.text
+			logger.error(f"{section_name} , 出现错误，code码是：{response.status_code}, {response.text}！！！")
+			return ""
+		logger.error(f"{section_name} , {self._MAX_ATTEMPTS}次出现错误，无法访问！！！")
+		return ""
+
+
+def opt(self):
+	"""Load the TOML site config and fetch data for every enabled section.
+
+	A section is processed unless its ``flag`` value equals 1.
+
+	NOTE(review): this is defined at module level although it takes ``self``;
+	it looks like it was meant to be a ``PtGetData`` method - confirm before
+	moving it, since callers may rely on ``opt(instance)``.
+
+	Args:
+		self: Object providing ``toml_file`` (path of the config file) and
+			``get_data(section_name, section_data)``.
+
+	Returns:
+		None. A missing or malformed config file is reported on stdout.
+	"""
+	# Keep the try body to the load itself so that errors raised while
+	# scraping are not misreported as config-file problems.
+	try:
+		with open(self.toml_file, 'r', encoding='utf-8') as file:
+			config_data = toml.load(file)
+	except FileNotFoundError:
+		print(f"Error: The file '{self.toml_file}' was not found.")
+		return
+	except toml.TomlDecodeError as e:
+		print(f"Error decoding TOML: {e}")
+		return
+	# The file is already closed here, so no handle is held during network work.
+	for section_name, section_data in config_data.items():
+		if not isinstance(section_data, dict):
+			# A top-level scalar is not a site section and has no .get().
+			continue
+		print(f"Processing section: {section_name} --- {section_data.get('url')}")
+		if section_data.get('flag') != 1:
+			# Pull the data for this site.
+			self.get_data(section_name, section_data)
--- a/PT/test.html
+++ b/PT/test.html
@ -0,0 +1,159 @@
+<body>
+	<table class="head" cellspacing="g" cellpadding="g" align="center">..</table>
+	<table class="mainouter" width="g82" cellspacing="g" cellpadding="5" align="center">
+		<tbody>
+			<tr></tr>
+			<tr>
+				<td id="outer" align="center" class="outer" style="padding-top: 20px; padding-bottom: 20px">
+					<div align="center" style="margin-bottom: 10px" id="ad belownav"></div>
+					<table width="1080" class="main" border="g" cellspacing="g" cellpadding="g">
+						<tbody>
+							<tr>
+								<td class="embedded">
+									<form method="get" name="searchbox" action="?">.</form>
+									<div align="center" style="margin-top: 10px" id="ad belowsearchbox"></div>
+									<p align="center">.</p>
+									<p align="center">.</p>
+									<table class="torrents" cellspacing="g" cellpadding="5" width="100%">
+										<tr>
+											<td class="colhead" style="padding: 0px">类型</td>
+											<td class="colhead">
+												<a href="?sort=1&amp;type=asc">标题</a>
+											</td>
+											<td class="colhead">
+												<a href="?sort=3&amp;type=desc">
+													<img class="comments" src="pic/trans.gif" alt="comments" title="评论数" />
+												</a>
+											</td>
+											<td class="colhead">
+												<a href="?sort=4&amp;type=desc">
+													<img class="time" src="pic/trans.gif" alt="time" title="存活时间" />
+												</a>
+											</td>
+											<td class="colhead">
+												<a href="?sort=5&amp;type=desc">
+													<img class="size" src="pic/trans.gif" alt="size" title="大小" />
+												</a>
+											</td>
+											<td class="colhead">
+												<a href="?sort=7&amp;type=desc">
+													<img class="seeders" src="pic/trans.gif" alt="seeders" title="种子数" />
+												</a>
+											</td>
+											<td class="colhead">
+												<a href="?sort=8&amp;type=desc">
+													<img class="leechers" src="pic/trans.gif" alt="leechers" title="下载数" />
+												</a>
+											</td>
+											<td class="colhead">
+												<a href="?sort=6&amp;type=desc">
+													<img class="snatched" src="pic/trans.gif" alt="snatched" title="完成数" />
+												</a>
+											</td>
+											<td class="colhead">
+												<a href="?sort=9&amp;type=desc">发布者</a>
+											</td>
+										</tr>
+										<tr>
+											<td class="rowfollow nowrap" valign="middle" style='padding: 0px'>
+												<a href="?cat=405">
+													<img class="c_movie" src="pic/cattrans.gif" alt="电影/Movies" title="电影/Movies" style="background-image: url(pic/category/chd/scenetorrents/chs/catsprites.png);" />
+												</a>
+												<img src="pic/cattrans.gif" style="background-image: url(pic/category/chd/scenetorrents/chs/additional/encode.png);" alt="encode" title="encode" />
+											</td>
+											<td class="rowfollow" width="100%" align="left">
+												<table class="torrentname" width="100%">
+													<tr>
+														<td class="embedded">
+															<img class="top0" src="pic/trans.gif" alt="Sticky" title="一级置顶" />
+															&nbsp;
+															<a title="The Man in the Iron Mask 1998 Blu-ray 1080p x264 DTS-HD MA 5.1-BtsHD" href="details.php?id=130681&amp;hit=1">
+																<b>The Man in the Iron Mask 1998 Blu-ray 1080p x264 DTS-HD MA 5.1-BtsHD</b>
+															</a>
+															<b>
+																[<font class='hot'>热门</font>
+																]
+															</b>
+															<img class="pro_free" src="pic/trans.gif" alt="Free" onmouseover="domTT_activate(this, event, 'content', '&lt;b&gt;&lt;font class=&quot;free&quot;&gt;免费&lt;/font&gt;&lt;/b&gt;剩余时间：&lt;b&gt;&lt;span title=&quot;2024-01-18 22:24:08&quot;&gt;3天7时&lt;/span&gt;&lt;/b&gt;', 'trail', false, 'delay',500,'lifetime',3000,'fade','both','styleClass','niceTitle', 'fadeMax',87, 'maxWidth', 300);" />
+															(
+															<font color='#0000FF'>
+																剩余时间：<span title="2024-01-18 22:24:08">3天7时</span>
+															</font>
+															)
+															<br />
+															<span class="label label-primary">官方</span>
+															铁面人
+															<div class="progressarea" title="做种中">
+																<img src="styles/progress_seeding.gif" />
+																<div class="progress">
+																	<div class="progress_seeding" style="width:100%;"></div>
+																</div>
+															</div>
+														</td>
+														<td width="50" class="embedded" style="text-align: left; " valign="middle">
+															<div>
+																<img src="pic/icon-douban.png" style='padding-bottom: 2px;vertical-align:middle;height: 16px;' alt="豆瓣评分" title="豆瓣评分" />&nbsp;&nbsp;无
+															</div>
+															<div>
+																<img src="pic/icon-imdb.png" style='padding-bottom: 2px;vertical-align:middle;height: 16px;' alt="IMDB评分" title="IMDB评分" />&nbsp;&nbsp;无
+															</div>
+														</td>
+														<td width="20" class="embedded" style="text-align: right; " valign="middle">
+															<a href="download.php?id=130681">
+																<img class="download" src="pic/trans.gif" style='padding-bottom: 2px;' alt="download" title="下载本种" />
+															</a>
+															<br />
+															<a id="bookmark0" href="javascript: bookmark(130681,0);">
+																<img class="delbookmark" src="pic/trans.gif" alt="Unbookmarked" title="收藏" />
+															</a>
+														</td>
+													</tr>
+												</table>
+											</td>
+											<td class="rowfollow">
+												<a href="comment.php?action=add&amp;pid=130681&amp;type=torrent" title="添加评论">0</a>
+											</td>
+											<td class="rowfollow nowrap">
+												<span title="2024-01-13 22:24:08">
+													1天
+													<br />16时
+												</span>
+											</td>
+											<td class="rowfollow">
+												11.17
+												<br />GB
+											</td>
+											<td class="rowfollow" align="center">
+												<b>
+													<a href="details.php?id=130681&amp;hit=1&amp;dllist=1#seeders">186</a>
+												</b>
+											</td>
+											<td class="rowfollow">
+												<b>
+													<a href="details.php?id=130681&amp;hit=1&amp;dllist=1#leechers">2</a>
+												</b>
+											</td>
+											<td class="rowfollow">
+												<a href="viewsnatches.php?id=130681">
+													<b>246</b>
+												</a>
+											</td>
+											<td class="rowfollow">
+												<i>匿名</i>
+											</td>
+										</tr>
+
+									</table>
+									<p align="center">.</p>
+								</td>
+							</tr>
+						</tbody>
+					</table>
+				</td>
+			</tr>
+		</tbody>
+	</table>
+	<div id="footer"></div>
+<span id="pinbox-extension-installed">
+		</span>
+		</body>
--- a/PT/test.py
+++ b/PT/test.py
@ -1,53 +1,91 @@
-import json
 import time

 import requests
-from bs4 import BeautifulSoup
+import toml
+from lxml import html as lhtml

-headers = {
-	'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
-	'accept-language': 'zh,zh-CN;q=0.9',
-	'cache-control': 'max-age=0',
-	'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
-	'sec-ch-ua-mobile': '?0',
-	'sec-ch-ua-platform': '"macOS"',
-	'sec-fetch-dest': 'document',
-	'sec-fetch-mode': 'navigate',
-	'sec-fetch-site': 'same-origin',
-	'sec-fetch-user': '?1',
-	'upgrade-insecure-requests': '1',
-	'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
-}
-url = "https://sharkpt.net/signup.php"
-
-
-response_result = requests.get(url, headers=headers)
-print(response_result.status_code)
-print(response_result.text)
-
-
-def flaresolverr_get(url, text):
+# headers = {
+# 	'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+# 	'accept-language': 'zh,zh-CN;q=0.9',
+# 	'cache-control': 'max-age=0',
+# 	'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
+# 	'sec-ch-ua-mobile': '?0',
+# 	'sec-ch-ua-platform': '"macOS"',
+# 	'sec-fetch-dest': 'document',
+# 	'sec-fetch-mode': 'navigate',
+# 	'sec-fetch-site': 'same-origin',
+# 	'sec-fetch-user': '?1',
+# 	'upgrade-insecure-requests': '1',
+# 	'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
+# }
+# with open("pt_config.toml", 'r', encoding='utf-8') as file:
+# 	config_data = toml.load(file)
+# 	# 迭代每个 section
+# 	for section_name, section_data in config_data.items():
+# 		print(f"Processing section: {section_name} --- {section_data.get('url')}")
+# 		url, cookie, flag = section_data.get('url'), section_data.get('cookie'), section_data.get('flag')
+# 		if flag != 1 and cookie is not None and len(cookie.strip()) > 0:
+# 			headers["cookie"] = cookie
+# 			url = url + "/torrents.php"
+# 			html = ""
+# 			for _ in range(5):
+# 				try:
+# 					response = requests.get(url, headers=headers, timeout=5 * 60)
+# 					if response.status_code == 200:
+# 						html = response.text
+# 						break
+# 					else:
+# 						print(f"{section_name} , 出现错误，code码是：{response.status_code}, {response.text}！！！")
+# 						break
+# 				except Exception as e:
+# 					time.sleep(2)
+# 			else:
+# 				print(f"{section_name} , 5次出现错误，无法访问！！！")
+# 			if len(html) == 0:
+# 				break
+# Ad-hoc check: read a locally saved page and print the first torrent title
+# and the first link found inside the torrent-name tables.
+with open('/Users/renmeng/Downloads/test.html', 'r', encoding='utf-8') as file:
+	html_code = file.read()
 	try:
-		flaresolverr_url = "http://152.136.50.100:7024/v1"
-		payload = json.dumps({
-			"cmd": "request.get",
-			"url": url,
-			"maxTimeout": 5 * 1000 * 60
-		})
-		headers = {
-			'Content-Type': 'application/json'
-		}
-		response = requests.post(flaresolverr_url, headers=headers, data=payload)
-		res = json.loads(response.text)
-		print(res)
-		if res['status'] == 'ok' and res['solution']['status'] == 200:
-			print(f"最终耗时：{(res['endTimestamp'] - res['startTimestamp'])/ 1000 / 60:.2f} 分钟")
-		elif res['status'] == 'error':
-			print(f"{text} , 访问返回 {res['message']} ！！！")
-			return ""
+		# Parse the HTML with lxml
+		doc = lhtml.fromstring(html_code)
+		# Use XPath to collect the title attribute of every titled link inside
+		# the torrent-name tables; an empty result makes [0] raise IndexError,
+		# which the broad except below reports as a parse failure.
+		title_elements = doc.xpath('//table[@class="torrents"]//table[@class="torrentname"]//a[@title]/@title')
+		print(title_elements[0])
+		# Detail-page link: href of the first link inside the torrent-name tables
+		details_link = doc.xpath('//table[@class="torrents"]//table[@class="torrentname"]//a[@href]/@href')
+		print(f"/{details_link[0]}")
+
+
 	except Exception as e:
-		print("出现错误！！！")
+		print(f"页面无法解析，请知晓！！！")

+"""
+主键id,来源名称,一级标题,二级标题,种子状态,状态剩余时间,做种状态,评论数,做种数,下载数,完成数，发布者，豆瓣评分，IMDB评分，下载链接，详情链接,资源大小
+CREATE TABLE IF NOT EXISTS pt_website_data (
+    id INTEGER PRIMARY KEY,  
+    source_name TEXT NOT NULL,
+    first_title TEXT NOT NULL,
+    second_title TEXT NOT NULL,
+    seed_status TEXT,
+    status_remaining_time TEXT,
+    seeding_status TEXT,
+    comment_count INTEGER,
+    seed_count INTEGER,
+    download_count INTEGER,
+    completion_count INTEGER,
+    publisher TEXT,
+    douban_rating REAL,
+    imdb_rating REAL,
+    download_link TEXT,
+    details_link TEXT,
+    size TEXT,
+    UNIQUE(source_name, first_title, second_title)
+);

-# flaresolverr_get(url, "ourbits")
-
+CREATE TABLE IF NOT EXISTS pt_website_type (
+    id INTEGER PRIMARY KEY,  
+    source_name TEXT NOT NULL,
+    type_name TEXT NOT NULL,
+    type_url TEXT NOT NULL
+);
+"""