明天继续

2024-01-16 01:29:49 +08:00 · 2024-01-16 01:29:49 +08:00 · 917da5efcb
parent ca4e344913
commit 917da5efcb
2 changed files with 159 additions and 96 deletions
--- a/PT/pt_get_data.py
+++ b/PT/pt_get_data.py
@@ -15,8 +15,16 @@ import requests
 import toml
 from bs4 import BeautifulSoup
 from loguru import logger
+from lxml import html as lhtml

-
+def contains_alpha_or_chinese(input_str):
+	s = input_str.strip()
+	# 判断是否包含字母
+	has_alpha = any(char.isalpha() for char in s)
+	# 判断是否包含汉字
+	has_chinese = any('\u4e00' <= char <= '\u9fff' for char in s)
+	# 返回结果
+	return s if has_alpha or has_chinese else None
 class PtGetData:

 	def __init__(self):
@@ -47,9 +55,63 @@ class PtGetData:
 			if len(html) == 0:
 				return
 			try:
-				soup = BeautifulSoup(html, 'html.parser')
-				torrents_table = soup.find('table', class_='torrents')
-				print(torrents_table)
+				# 使用lxml解析HTML
+				doc = lhtml.fromstring(html)
+				# 使用XPath获取目标元素
+				title_elements = doc.xpath('//table[@class="torrents"]//table[@class="torrentname"]//a[@title]/@title')
+				print(f"标题：{title_elements[0]}")
+				second_title_s = doc.xpath(
+					'//table[@class="torrents"]//table[@class="torrentname"]//td[@class="embedded"]/text()[normalize-space()]')
+
+				second_title = ""
+				for text in second_title_s:
+					second_title = contains_alpha_or_chinese(text) if contains_alpha_or_chinese(text) is not None else None
+				print(f"二级标题：{second_title}")
+
+				seed_status = doc.xpath('//table[@class="torrents"]//table[@class="torrentname"]//td[1]//img[@alt]/@alt')
+				if len(seed_status) > 0:
+					print(f"种子状态：{seed_status[1]}")
+				else:
+					print(f"种子状态：{seed_status[0]}")
+
+				seeding_status = doc.xpath(
+					'//table[@class="torrents"]//table[@class="torrentname"]//div[@class="progressarea"]/@title')
+				print(f"做种状态：{seeding_status[0]}")
+
+				comment_count = doc.xpath('//td[@class="rowfollow"][2]//a/text()[normalize-space()]')
+				print(f"评论数：{comment_count[0]}")
+
+				upload_time = doc.xpath('//td[contains(@class, "rowfollow")][4]//span/@title')
+				# for td_element in upload_time:
+				# 	html_content = lhtml.tostring(td_element, encoding='unicode')
+				# 	print(html_content)
+				print(f"资源上传时间：{upload_time[0]}")
+
+				size = doc.xpath('//td[@class="rowfollow"][3]/text()[normalize-space()]')
+				print(f"资源大小：{size[0].strip() + '' + size[1].strip()}")
+
+				seed_count = doc.xpath('//td[@class="rowfollow"][4]')[0]
+				print(f"做种数：{seed_count.text_content().strip()}")
+
+				download_count = doc.xpath('//td[@class="rowfollow"][5]')[0]
+				print(f"下载数：{download_count.text_content().strip()}")
+
+				completion_count = doc.xpath('//td[@class="rowfollow"][6]')[0]
+				print(f"完成数：{completion_count.text_content().strip()}")
+
+				publisher = doc.xpath('//td[@class="rowfollow"][7]')[0]
+				print(f"发布者：{publisher.text_content().strip()}")
+				download_link = doc.xpath('//table[@class="torrents"]//table[@class="torrentname"]//img[@class="download"]/parent::a/@href')
+				print(f"下载链接：/{download_link[0]}")
+				# 详情链接地址
+				details_link = doc.xpath('//table[@class="torrents"]//table[@class="torrentname"]//a[@href]/@href')
+				print(f"详情链接：/{details_link[0]}")
+
+				douban_rating = doc.xpath('')
+				print(f"豆瓣评分：/{douban_rating[0]}")
+
+				imdb_rating = doc.xpath('')
+				print(f"imdb_rating：/{imdb_rating[0]}")
 			except Exception as e:
 				logger.error(f"{section_name} , 页面无法解析，请知晓！！！")

--- a/PT/test.py
+++ b/PT/test.py
@@ -15,110 +15,111 @@ def contains_alpha_or_chinese(input_str):
 	return s if has_alpha or has_chinese else None


-# headers = {
-# 	'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
-# 	'accept-language': 'zh,zh-CN;q=0.9',
-# 	'cache-control': 'max-age=0',
-# 	'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
-# 	'sec-ch-ua-mobile': '?0',
-# 	'sec-ch-ua-platform': '"macOS"',
-# 	'sec-fetch-dest': 'document',
-# 	'sec-fetch-mode': 'navigate',
-# 	'sec-fetch-site': 'same-origin',
-# 	'sec-fetch-user': '?1',
-# 	'upgrade-insecure-requests': '1',
-# 	'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
-# }
-# with open("pt_config.toml", 'r', encoding='utf-8') as file:
-# 	config_data = toml.load(file)
-# 	# 迭代每个 section
-# 	for section_name, section_data in config_data.items():
-# 		print(f"Processing section: {section_name} --- {section_data.get('url')}")
-# 		url, cookie, flag = section_data.get('url'), section_data.get('cookie'), section_data.get('flag')
-# 		if flag != 1 and cookie is not None and len(cookie.strip()) > 0:
-# 			headers["cookie"] = cookie
-# 			url = url + "/torrents.php"
-# 			html = ""
-# 			for _ in range(5):
-# 				try:
-# 					response = requests.get(url, headers=headers, timeout=5 * 60)
-# 					if response.status_code == 200:
-# 						html = response.text
-# 						break
-# 					else:
-# 						print(f"{section_name} , 出现错误，code码是：{response.status_code}, {response.text}！！！")
-# 						break
-# 				except Exception as e:
-# 					time.sleep(2)
-# 			else:
-# 				print(f"{section_name} , 5次出现错误，无法访问！！！")
-# 			if len(html) == 0:
-# 				break
-with open('test.html', 'r', encoding='utf-8') as file:
-	html_code = file.read()
-	try:
-		# 使用lxml解析HTML
-		doc = lhtml.fromstring(html_code)
-		# 使用XPath获取目标元素
-		title_elements = doc.xpath('//table[@class="torrents"]//table[@class="torrentname"]//a[@title]/@title')
-		print(f"标题：{title_elements[0]}")
-		second_title_s = doc.xpath(
-			'//table[@class="torrents"]//table[@class="torrentname"]//td[@class="embedded"]/text()[normalize-space()]')
+headers = {
+	'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+	'accept-language': 'zh,zh-CN;q=0.9',
+	'cache-control': 'max-age=0',
+	'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
+	'sec-ch-ua-mobile': '?0',
+	'sec-ch-ua-platform': '"macOS"',
+	'sec-fetch-dest': 'document',
+	'sec-fetch-mode': 'navigate',
+	'sec-fetch-site': 'same-origin',
+	'sec-fetch-user': '?1',
+	'upgrade-insecure-requests': '1',
+	'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
+}
+with open("pt_config.toml", 'r', encoding='utf-8') as file:
+	config_data = toml.load(file)
+	# 迭代每个 section
+	for section_name, section_data in config_data.items():
+		print(f"Processing section: {section_name} --- {section_data.get('url')}")
+		url, cookie, flag = section_data.get('url'), section_data.get('cookie'), section_data.get('flag')
+		if flag != 1 and cookie is not None and len(cookie.strip()) > 0:
+			headers["cookie"] = cookie
+			url = url + "/torrents.php"
+			html = ""
+			for _ in range(5):
+				try:
+					response = requests.get(url, headers=headers, timeout=5 * 60)
+					if response.status_code == 200:
+						html = response.text
+						break
+					else:
+						print(f"{section_name} , 出现错误，code码是：{response.status_code}, {response.text}！！！")
+						break
+				except Exception as e:
+					time.sleep(2)
+			else:
+				print(f"{section_name} , 5次出现错误，无法访问！！！")
+			if len(html) == 0:
+				break
+	# with open('test.html', 'r', encoding='utf-8') as file:
+	# html_code = file.read()
+		try:
+			# 使用lxml解析HTML
+			doc = lhtml.fromstring(html)
+			first_title,second_title,seed_status, = ""
+			# 使用XPath获取目标元素
+			title_elements = doc.xpath('//table[@class="torrents"]//table[@class="torrentname"]//a[@title]/@title')
+			print(f"标题：{title_elements[0]}")
+			second_title_s = doc.xpath(
+				'//table[@class="torrents"]//table[@class="torrentname"]//td[@class="embedded"]/text()[normalize-space()]')

-		second_title = ""
-		for text in second_title_s:
-			second_title = contains_alpha_or_chinese(text) if contains_alpha_or_chinese(text) is not None else None
-		print(f"二级标题：{second_title}")
+			second_title = ""
+			for text in second_title_s:
+				second_title = contains_alpha_or_chinese(text) if contains_alpha_or_chinese(text) is not None else None
+			print(f"二级标题：{second_title}")

-		seed_status = doc.xpath('//table[@class="torrents"]//table[@class="torrentname"]//td[1]//img[@alt]/@alt')
-		if len(seed_status) > 0:
-			print(f"种子状态：{seed_status[1]}")
-		else:
-			print(f"种子状态：{seed_status[0]}")
+			seed_status_html = doc.xpath('//table[@class="torrents"]//table[@class="torrentname"]//td[1]//img[@alt]/@alt')
+			if len(seed_status_html) > 0:
+				print(f"种子状态：{seed_status_html[1]}")
+			else:
+				print(f"种子状态：{seed_status[0]}")

-		seeding_status = doc.xpath(
-			'//table[@class="torrents"]//table[@class="torrentname"]//div[@class="progressarea"]/@title')
-		print(f"做种状态：{seeding_status[0]}")
+			seeding_status = doc.xpath(
+				'//table[@class="torrents"]//table[@class="torrentname"]//div[@class="progressarea"]/@title')
+			print(f"做种状态：{seeding_status[0]}")

-		comment_count = doc.xpath('//td[@class="rowfollow"][2]//a/text()[normalize-space()]')
-		print(f"评论数：{comment_count[0]}")
+			comment_count = doc.xpath('//td[@class="rowfollow"][2]//a/text()[normalize-space()]')
+			print(f"评论数：{comment_count[0]}")

-		upload_time = doc.xpath('//td[contains(@class, "rowfollow")][4]//span/@title')
-		# for td_element in upload_time:
-		# 	html_content = lhtml.tostring(td_element, encoding='unicode')
-		# 	print(html_content)
-		print(f"资源上传时间：{upload_time[0]}")
+			upload_time = doc.xpath('//td[contains(@class, "rowfollow")][4]//span/@title')
+			# for td_element in upload_time:
+			# 	html_content = lhtml.tostring(td_element, encoding='unicode')
+			# 	print(html_content)
+			print(f"资源上传时间：{upload_time[0]}")

-		size = doc.xpath('//td[@class="rowfollow"][3]/text()[normalize-space()]')
-		print(f"资源大小：{size[0].strip() + '' + size[1].strip()}")
+			size = doc.xpath('//td[@class="rowfollow"][3]/text()[normalize-space()]')
+			print(f"资源大小：{size[0].strip() + '' + size[1].strip()}")

-		seed_count = doc.xpath('//td[@class="rowfollow"][4]')[0]
-		print(f"做种数：{seed_count.text_content().strip()}")
+			seed_count = doc.xpath('//td[@class="rowfollow"][4]')[0]
+			print(f"做种数：{seed_count.text_content().strip()}")

-		download_count = doc.xpath('//td[@class="rowfollow"][5]')[0]
-		print(f"下载数：{download_count.text_content().strip()}")
+			download_count = doc.xpath('//td[@class="rowfollow"][5]')[0]
+			print(f"下载数：{download_count.text_content().strip()}")

-		completion_count = doc.xpath('//td[@class="rowfollow"][6]')[0]
-		print(f"完成数：{completion_count.text_content().strip()}")
+			completion_count = doc.xpath('//td[@class="rowfollow"][6]')[0]
+			print(f"完成数：{completion_count.text_content().strip()}")

-		publisher = doc.xpath('//td[@class="rowfollow"][7]')[0]
-		print(f"发布者：{publisher.text_content().strip()}")
-		download_link = doc.xpath('//table[@class="torrents"]//table[@class="torrentname"]//img[@class="download"]/parent::a/@href')
-		print(f"下载链接：/{download_link[0]}")
-		# 详情链接地址
-		details_link = doc.xpath('//table[@class="torrents"]//table[@class="torrentname"]//a[@href]/@href')
-		print(f"详情链接：/{details_link[0]}")
+			publisher = doc.xpath('//td[@class="rowfollow"][7]')[0]
+			print(f"发布者：{publisher.text_content().strip()}")
+			download_link = doc.xpath(
+				'//table[@class="torrents"]//table[@class="torrentname"]//img[@class="download"]/parent::a/@href')
+			print(f"下载链接：/{download_link[0]}")
+			# 详情链接地址
+			details_link = doc.xpath('//table[@class="torrents"]//table[@class="torrentname"]//a[@href]/@href')
+			print(f"详情链接：/{details_link[0]}")

-		douban_rating = doc.xpath('')
-		print(f"豆瓣评分：/{douban_rating[0]}")
+			douban_rating = doc.xpath('')
+			print(f"豆瓣评分：/{douban_rating[0]}")

-		imdb_rating = doc.xpath('')
-		print(f"imdb_rating：/{imdb_rating[0]}")
-
-
-	except Exception as e:
-		print(e)
-		print(f"页面无法解析，请知晓！！！")
+			imdb_rating = doc.xpath('')
+			print(f"imdb_rating：/{imdb_rating[0]}")
+			break
+		except Exception as e:
+			print(e)
+			print(f"页面无法解析，请知晓！！！")

 """
 主键id,来源名称,一级标题,二级标题,种子状态,状态剩余时间,做种状态,评论数,资源上传时间,资源大小，做种数,下载数,完成数，发布者，豆瓣评分，IMDB评分，下载链接，详情链接