明天继续
This commit is contained in:
parent
ca4e344913
commit
917da5efcb
|
@ -15,8 +15,16 @@ import requests
|
||||||
import toml
|
import toml
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
from lxml import html as lhtml
|
||||||
|
|
||||||
|
def contains_alpha_or_chinese(input_str):
    """Return the stripped text if it holds a letter or a Chinese character.

    Args:
        input_str: Text to inspect. ``None`` is accepted and treated as
            text with no usable content.

    Returns:
        ``input_str`` with leading/trailing whitespace removed when at least
        one of its characters is alphabetic (``str.isalpha``) or lies in the
        range U+4E00..U+9FFF; otherwise ``None``.
    """
    # None is this function's own "nothing usable" result, so accept it as
    # input too instead of failing on ``None.strip()``.
    if input_str is None:
        return None

    s = input_str.strip()

    # Single pass over the text. The explicit range test is kept next to
    # isalpha() so every code point in U+4E00..U+9FFF is accepted even if
    # str.isalpha() does not report it as alphabetic.
    if any(char.isalpha() or '\u4e00' <= char <= '\u9fff' for char in s):
        return s
    return None
|
||||||
class PtGetData:
|
class PtGetData:
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
@ -47,9 +55,63 @@ class PtGetData:
|
||||||
if len(html) == 0:
|
if len(html) == 0:
|
||||||
return
|
return
|
||||||
try:
|
try:
|
||||||
soup = BeautifulSoup(html, 'html.parser')
|
# 使用lxml解析HTML
|
||||||
torrents_table = soup.find('table', class_='torrents')
|
doc = lhtml.fromstring(html)
|
||||||
print(torrents_table)
|
# 使用XPath获取目标元素
|
||||||
|
title_elements = doc.xpath('//table[@class="torrents"]//table[@class="torrentname"]//a[@title]/@title')
|
||||||
|
print(f"标题:{title_elements[0]}")
|
||||||
|
second_title_s = doc.xpath(
|
||||||
|
'//table[@class="torrents"]//table[@class="torrentname"]//td[@class="embedded"]/text()[normalize-space()]')
|
||||||
|
|
||||||
|
second_title = ""
|
||||||
|
for text in second_title_s:
|
||||||
|
second_title = contains_alpha_or_chinese(text) if contains_alpha_or_chinese(text) is not None else None
|
||||||
|
print(f"二级标题:{second_title}")
|
||||||
|
|
||||||
|
seed_status = doc.xpath('//table[@class="torrents"]//table[@class="torrentname"]//td[1]//img[@alt]/@alt')
|
||||||
|
if len(seed_status) > 0:
|
||||||
|
print(f"种子状态:{seed_status[1]}")
|
||||||
|
else:
|
||||||
|
print(f"种子状态:{seed_status[0]}")
|
||||||
|
|
||||||
|
seeding_status = doc.xpath(
|
||||||
|
'//table[@class="torrents"]//table[@class="torrentname"]//div[@class="progressarea"]/@title')
|
||||||
|
print(f"做种状态:{seeding_status[0]}")
|
||||||
|
|
||||||
|
comment_count = doc.xpath('//td[@class="rowfollow"][2]//a/text()[normalize-space()]')
|
||||||
|
print(f"评论数:{comment_count[0]}")
|
||||||
|
|
||||||
|
upload_time = doc.xpath('//td[contains(@class, "rowfollow")][4]//span/@title')
|
||||||
|
# for td_element in upload_time:
|
||||||
|
# html_content = lhtml.tostring(td_element, encoding='unicode')
|
||||||
|
# print(html_content)
|
||||||
|
print(f"资源上传时间:{upload_time[0]}")
|
||||||
|
|
||||||
|
size = doc.xpath('//td[@class="rowfollow"][3]/text()[normalize-space()]')
|
||||||
|
print(f"资源大小:{size[0].strip() + '' + size[1].strip()}")
|
||||||
|
|
||||||
|
seed_count = doc.xpath('//td[@class="rowfollow"][4]')[0]
|
||||||
|
print(f"做种数:{seed_count.text_content().strip()}")
|
||||||
|
|
||||||
|
download_count = doc.xpath('//td[@class="rowfollow"][5]')[0]
|
||||||
|
print(f"下载数:{download_count.text_content().strip()}")
|
||||||
|
|
||||||
|
completion_count = doc.xpath('//td[@class="rowfollow"][6]')[0]
|
||||||
|
print(f"完成数:{completion_count.text_content().strip()}")
|
||||||
|
|
||||||
|
publisher = doc.xpath('//td[@class="rowfollow"][7]')[0]
|
||||||
|
print(f"发布者:{publisher.text_content().strip()}")
|
||||||
|
download_link = doc.xpath('//table[@class="torrents"]//table[@class="torrentname"]//img[@class="download"]/parent::a/@href')
|
||||||
|
print(f"下载链接:/{download_link[0]}")
|
||||||
|
# 详情链接地址
|
||||||
|
details_link = doc.xpath('//table[@class="torrents"]//table[@class="torrentname"]//a[@href]/@href')
|
||||||
|
print(f"详情链接:/{details_link[0]}")
|
||||||
|
|
||||||
|
douban_rating = doc.xpath('')
|
||||||
|
print(f"豆瓣评分:/{douban_rating[0]}")
|
||||||
|
|
||||||
|
imdb_rating = doc.xpath('')
|
||||||
|
print(f"imdb_rating:/{imdb_rating[0]}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"{section_name} , 页面无法解析,请知晓!!!")
|
logger.error(f"{section_name} , 页面无法解析,请知晓!!!")
|
||||||
|
|
||||||
|
|
185
PT/test.py
185
PT/test.py
|
@ -15,110 +15,111 @@ def contains_alpha_or_chinese(input_str):
|
||||||
return s if has_alpha or has_chinese else None
|
return s if has_alpha or has_chinese else None
|
||||||
|
|
||||||
|
|
||||||
# headers = {
|
headers = {
|
||||||
# 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
|
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
|
||||||
# 'accept-language': 'zh,zh-CN;q=0.9',
|
'accept-language': 'zh,zh-CN;q=0.9',
|
||||||
# 'cache-control': 'max-age=0',
|
'cache-control': 'max-age=0',
|
||||||
# 'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
|
'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
|
||||||
# 'sec-ch-ua-mobile': '?0',
|
'sec-ch-ua-mobile': '?0',
|
||||||
# 'sec-ch-ua-platform': '"macOS"',
|
'sec-ch-ua-platform': '"macOS"',
|
||||||
# 'sec-fetch-dest': 'document',
|
'sec-fetch-dest': 'document',
|
||||||
# 'sec-fetch-mode': 'navigate',
|
'sec-fetch-mode': 'navigate',
|
||||||
# 'sec-fetch-site': 'same-origin',
|
'sec-fetch-site': 'same-origin',
|
||||||
# 'sec-fetch-user': '?1',
|
'sec-fetch-user': '?1',
|
||||||
# 'upgrade-insecure-requests': '1',
|
'upgrade-insecure-requests': '1',
|
||||||
# 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
|
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
|
||||||
# }
|
}
|
||||||
# with open("pt_config.toml", 'r', encoding='utf-8') as file:
|
with open("pt_config.toml", 'r', encoding='utf-8') as file:
|
||||||
# config_data = toml.load(file)
|
config_data = toml.load(file)
|
||||||
# # 迭代每个 section
|
# 迭代每个 section
|
||||||
# for section_name, section_data in config_data.items():
|
for section_name, section_data in config_data.items():
|
||||||
# print(f"Processing section: {section_name} --- {section_data.get('url')}")
|
print(f"Processing section: {section_name} --- {section_data.get('url')}")
|
||||||
# url, cookie, flag = section_data.get('url'), section_data.get('cookie'), section_data.get('flag')
|
url, cookie, flag = section_data.get('url'), section_data.get('cookie'), section_data.get('flag')
|
||||||
# if flag != 1 and cookie is not None and len(cookie.strip()) > 0:
|
if flag != 1 and cookie is not None and len(cookie.strip()) > 0:
|
||||||
# headers["cookie"] = cookie
|
headers["cookie"] = cookie
|
||||||
# url = url + "/torrents.php"
|
url = url + "/torrents.php"
|
||||||
# html = ""
|
html = ""
|
||||||
# for _ in range(5):
|
for _ in range(5):
|
||||||
# try:
|
try:
|
||||||
# response = requests.get(url, headers=headers, timeout=5 * 60)
|
response = requests.get(url, headers=headers, timeout=5 * 60)
|
||||||
# if response.status_code == 200:
|
if response.status_code == 200:
|
||||||
# html = response.text
|
html = response.text
|
||||||
# break
|
break
|
||||||
# else:
|
else:
|
||||||
# print(f"{section_name} , 出现错误,code码是:{response.status_code}, {response.text}!!!")
|
print(f"{section_name} , 出现错误,code码是:{response.status_code}, {response.text}!!!")
|
||||||
# break
|
break
|
||||||
# except Exception as e:
|
except Exception as e:
|
||||||
# time.sleep(2)
|
time.sleep(2)
|
||||||
# else:
|
else:
|
||||||
# print(f"{section_name} , 5次出现错误,无法访问!!!")
|
print(f"{section_name} , 5次出现错误,无法访问!!!")
|
||||||
# if len(html) == 0:
|
if len(html) == 0:
|
||||||
# break
|
break
|
||||||
with open('test.html', 'r', encoding='utf-8') as file:
|
# with open('test.html', 'r', encoding='utf-8') as file:
|
||||||
html_code = file.read()
|
# html_code = file.read()
|
||||||
try:
|
try:
|
||||||
# 使用lxml解析HTML
|
# 使用lxml解析HTML
|
||||||
doc = lhtml.fromstring(html_code)
|
doc = lhtml.fromstring(html)
|
||||||
# 使用XPath获取目标元素
|
first_title,second_title,seed_status, = ""
|
||||||
title_elements = doc.xpath('//table[@class="torrents"]//table[@class="torrentname"]//a[@title]/@title')
|
# 使用XPath获取目标元素
|
||||||
print(f"标题:{title_elements[0]}")
|
title_elements = doc.xpath('//table[@class="torrents"]//table[@class="torrentname"]//a[@title]/@title')
|
||||||
second_title_s = doc.xpath(
|
print(f"标题:{title_elements[0]}")
|
||||||
'//table[@class="torrents"]//table[@class="torrentname"]//td[@class="embedded"]/text()[normalize-space()]')
|
second_title_s = doc.xpath(
|
||||||
|
'//table[@class="torrents"]//table[@class="torrentname"]//td[@class="embedded"]/text()[normalize-space()]')
|
||||||
|
|
||||||
second_title = ""
|
second_title = ""
|
||||||
for text in second_title_s:
|
for text in second_title_s:
|
||||||
second_title = contains_alpha_or_chinese(text) if contains_alpha_or_chinese(text) is not None else None
|
second_title = contains_alpha_or_chinese(text) if contains_alpha_or_chinese(text) is not None else None
|
||||||
print(f"二级标题:{second_title}")
|
print(f"二级标题:{second_title}")
|
||||||
|
|
||||||
seed_status = doc.xpath('//table[@class="torrents"]//table[@class="torrentname"]//td[1]//img[@alt]/@alt')
|
seed_status_html = doc.xpath('//table[@class="torrents"]//table[@class="torrentname"]//td[1]//img[@alt]/@alt')
|
||||||
if len(seed_status) > 0:
|
if len(seed_status_html) > 0:
|
||||||
print(f"种子状态:{seed_status[1]}")
|
print(f"种子状态:{seed_status_html[1]}")
|
||||||
else:
|
else:
|
||||||
print(f"种子状态:{seed_status[0]}")
|
print(f"种子状态:{seed_status[0]}")
|
||||||
|
|
||||||
seeding_status = doc.xpath(
|
seeding_status = doc.xpath(
|
||||||
'//table[@class="torrents"]//table[@class="torrentname"]//div[@class="progressarea"]/@title')
|
'//table[@class="torrents"]//table[@class="torrentname"]//div[@class="progressarea"]/@title')
|
||||||
print(f"做种状态:{seeding_status[0]}")
|
print(f"做种状态:{seeding_status[0]}")
|
||||||
|
|
||||||
comment_count = doc.xpath('//td[@class="rowfollow"][2]//a/text()[normalize-space()]')
|
comment_count = doc.xpath('//td[@class="rowfollow"][2]//a/text()[normalize-space()]')
|
||||||
print(f"评论数:{comment_count[0]}")
|
print(f"评论数:{comment_count[0]}")
|
||||||
|
|
||||||
upload_time = doc.xpath('//td[contains(@class, "rowfollow")][4]//span/@title')
|
upload_time = doc.xpath('//td[contains(@class, "rowfollow")][4]//span/@title')
|
||||||
# for td_element in upload_time:
|
# for td_element in upload_time:
|
||||||
# html_content = lhtml.tostring(td_element, encoding='unicode')
|
# html_content = lhtml.tostring(td_element, encoding='unicode')
|
||||||
# print(html_content)
|
# print(html_content)
|
||||||
print(f"资源上传时间:{upload_time[0]}")
|
print(f"资源上传时间:{upload_time[0]}")
|
||||||
|
|
||||||
size = doc.xpath('//td[@class="rowfollow"][3]/text()[normalize-space()]')
|
size = doc.xpath('//td[@class="rowfollow"][3]/text()[normalize-space()]')
|
||||||
print(f"资源大小:{size[0].strip() + '' + size[1].strip()}")
|
print(f"资源大小:{size[0].strip() + '' + size[1].strip()}")
|
||||||
|
|
||||||
seed_count = doc.xpath('//td[@class="rowfollow"][4]')[0]
|
seed_count = doc.xpath('//td[@class="rowfollow"][4]')[0]
|
||||||
print(f"做种数:{seed_count.text_content().strip()}")
|
print(f"做种数:{seed_count.text_content().strip()}")
|
||||||
|
|
||||||
download_count = doc.xpath('//td[@class="rowfollow"][5]')[0]
|
download_count = doc.xpath('//td[@class="rowfollow"][5]')[0]
|
||||||
print(f"下载数:{download_count.text_content().strip()}")
|
print(f"下载数:{download_count.text_content().strip()}")
|
||||||
|
|
||||||
completion_count = doc.xpath('//td[@class="rowfollow"][6]')[0]
|
completion_count = doc.xpath('//td[@class="rowfollow"][6]')[0]
|
||||||
print(f"完成数:{completion_count.text_content().strip()}")
|
print(f"完成数:{completion_count.text_content().strip()}")
|
||||||
|
|
||||||
publisher = doc.xpath('//td[@class="rowfollow"][7]')[0]
|
publisher = doc.xpath('//td[@class="rowfollow"][7]')[0]
|
||||||
print(f"发布者:{publisher.text_content().strip()}")
|
print(f"发布者:{publisher.text_content().strip()}")
|
||||||
download_link = doc.xpath('//table[@class="torrents"]//table[@class="torrentname"]//img[@class="download"]/parent::a/@href')
|
download_link = doc.xpath(
|
||||||
print(f"下载链接:/{download_link[0]}")
|
'//table[@class="torrents"]//table[@class="torrentname"]//img[@class="download"]/parent::a/@href')
|
||||||
# 详情链接地址
|
print(f"下载链接:/{download_link[0]}")
|
||||||
details_link = doc.xpath('//table[@class="torrents"]//table[@class="torrentname"]//a[@href]/@href')
|
# 详情链接地址
|
||||||
print(f"详情链接:/{details_link[0]}")
|
details_link = doc.xpath('//table[@class="torrents"]//table[@class="torrentname"]//a[@href]/@href')
|
||||||
|
print(f"详情链接:/{details_link[0]}")
|
||||||
|
|
||||||
douban_rating = doc.xpath('')
|
douban_rating = doc.xpath('')
|
||||||
print(f"豆瓣评分:/{douban_rating[0]}")
|
print(f"豆瓣评分:/{douban_rating[0]}")
|
||||||
|
|
||||||
imdb_rating = doc.xpath('')
|
imdb_rating = doc.xpath('')
|
||||||
print(f"imdb_rating:/{imdb_rating[0]}")
|
print(f"imdb_rating:/{imdb_rating[0]}")
|
||||||
|
break
|
||||||
|
except Exception as e:
|
||||||
except Exception as e:
|
print(e)
|
||||||
print(e)
|
print(f"页面无法解析,请知晓!!!")
|
||||||
print(f"页面无法解析,请知晓!!!")
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
主键id,来源名称,一级标题,二级标题,种子状态,状态剩余时间,做种状态,评论数,资源上传时间,资源大小,做种数,下载数,完成数,发布者,豆瓣评分,IMDB评分,下载链接,详情链接
|
主键id,来源名称,一级标题,二级标题,种子状态,状态剩余时间,做种状态,评论数,资源上传时间,资源大小,做种数,下载数,完成数,发布者,豆瓣评分,IMDB评分,下载链接,详情链接
|
||||||
|
|
Loading…
Reference in New Issue