142 lines
5.6 KiB
Python
142 lines
5.6 KiB
Python
import sys
|
||
import time
|
||
|
||
import requests
|
||
from lxml import html as lhtml
|
||
from PT.pt_impl.subject_pt import SubjectPt
|
||
from loguru import logger
|
||
|
||
from PT.pt_impl.util import extract_id, check_seed_status, contains_alpha_or_chinese
|
||
from dateutil import parser
|
||
|
||
|
||
class DefaultSubject(SubjectPt):
|
||
|
||
def __init__(self):
|
||
logger.add("../log/PtGetData_{time:YYYY-MM-DD}.log", rotation="1 day", level="INFO")
|
||
logger.add(sys.stderr, level="INFO")
|
||
self.headers = {
|
||
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
|
||
'accept-language': 'zh,zh-CN;q=0.9',
|
||
'cache-control': 'max-age=0',
|
||
'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
|
||
'sec-ch-ua-mobile': '?0',
|
||
'sec-ch-ua-platform': '"macOS"',
|
||
'sec-fetch-dest': 'document',
|
||
'sec-fetch-mode': 'navigate',
|
||
'sec-fetch-site': 'same-origin',
|
||
'sec-fetch-user': '?1',
|
||
'upgrade-insecure-requests': '1',
|
||
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
|
||
}
|
||
|
||
def request_url(self, url):
|
||
html = ""
|
||
for i in range(5):
|
||
logger.info(f"开始对:{url} 进行 第 {i} 抓取!")
|
||
try:
|
||
response = requests.get(url, headers=self.headers, timeout=5 * 60)
|
||
if response.status_code == 200:
|
||
html = response.text
|
||
else:
|
||
logger.error(f"{url} , 出现错误,code码是:{response.status_code}, {response.text}!!!")
|
||
except Exception as e:
|
||
time.sleep(2)
|
||
else:
|
||
logger.error(f"{url} , 5次出现错误,无法访问!!!")
|
||
if len(html) != 0:
|
||
# 用于解析 HTML 文本并将其转换为一个 Element 对象
|
||
return lhtml.fromstring(html)
|
||
return html
|
||
|
||
def get_page_num(self, page_html):
|
||
page_href = page_html.xpath('//td[@class="embedded"]//p[@align="center"][1]//a[last()]/@href')[0]
|
||
pages_str = extract_id(page_href, "page")
|
||
return int(pages_str) if pages_str.isdigit() else 0
|
||
|
||
    def get_list_data(self, page_html):
        """Walk every torrent row of the listing table and print its fields.

        Extracts per-row data (titles, category, seed/seeding status, counts,
        size, publisher, upload time, download/detail links) from the
        ``table.torrents`` markup. Currently the values are only printed —
        nothing is returned or stored.
        """
        # Skip the header row (position() > 1) of the torrents table.
        row_follow_tables = page_html.xpath('//table[@class="torrents"]//tr[position() > 1]')
        for row_follow in row_follow_tables:
            # Serialized row HTML; only used by the debug print below.
            html_content = lhtml.tostring(row_follow, encoding='unicode')
            # print(f"html内容:{html_content}")
            # Primary title (the title attribute of the first titled link).
            first_title = row_follow.xpath('.//table[@class="torrentname"]//a[@title]/@title')[0]
            second_title_s = row_follow.xpath(
                './/table[@class="torrentname"]//td[@class="embedded"]/text()[normalize-space()]'
                '| .//table[@class="torrentname"]//td[@class="embedded"]//font[@title]/text()')
            # Secondary title / subtitle.
            # NOTE(review): each iteration overwrites second_title, so only
            # the LAST text node decides the result — and it can overwrite a
            # valid earlier value with None. Confirm whether "last wins" is
            # the intended behavior.
            second_title = ""
            for text in second_title_s:
                second_title = contains_alpha_or_chinese(text) if contains_alpha_or_chinese(
                    text) is not None else None
            print(f"标题:{first_title} 二级标题:{second_title}")
            # Category id + display name from the first rowfollow cell.
            type_id, type_name = "", ""
            type_html = row_follow.xpath('.//td[contains(@class, "rowfollow")][1]//a[@href]')
            for td_element in type_html:
                type_id = extract_id(td_element.xpath('./@href')[0], "cat")
                type_name = td_element.xpath('.//img[@title]/@title')[0]
                # html_content = lhtml.tostring(td_element, encoding='unicode')
            print(f"类型是:{type_id} + ' ' + {type_name}")
            # Seed/promotion status; defaults to 1 when no status icon maps.
            seed_status = 1
            seed_status_html = row_follow.xpath(
                './/table[@class="torrentname"]//td[@class="embedded"]//img[@alt]/@alt')
            if len(seed_status_html) > 0:
                for seed in seed_status_html:
                    s = check_seed_status(seed)
                    if s is not None:
                        # Last matching alt text wins.
                        seed_status = s
            print(f"种子状态:{seed_status}")

            # Whether this client is currently seeding the torrent:
            # presence of any titled <div> in the name cell flips it to 1.
            # NOTE(review): presumably that div is the progress bar — verify
            # against the site's markup.
            seeding_status = 0
            seeding_status_html = row_follow.xpath(
                './/table[@class="torrentname"]//div[@title]/@title')
            if len(seeding_status_html) > 0:
                seeding_status = 1
            print(f"做种状态:{seeding_status}")

            # Comment count (2nd rowfollow cell).
            comment_count = row_follow.xpath('.//td[@class="rowfollow"][2]//a/text()[normalize-space()]')[0]
            print(f"评论数:{comment_count}")

            # Upload timestamp, parsed from a span's title attribute.
            # NOTE(review): like second_title, the last parseable candidate
            # wins; unparseable ones are silently skipped.
            upload_time = ""
            upload_time_html = row_follow.xpath('.//span[@title][parent::td]/@title')
            for td_element in upload_time_html:
                try:
                    upload_time = parser.parse(td_element)
                except ValueError:
                    pass
            print(f"资源上传时间:{upload_time}")

            # Resource size: numeric part + unit are separate text nodes.
            size_html = row_follow.xpath('.//td[@class="rowfollow"][3]/text()[normalize-space()]')
            size = size_html[0].strip() + '' + size_html[1].strip()
            print(f"资源大小:{size}")

            # Seeder count (4th cell).
            seed_count = row_follow.xpath('.//td[@class="rowfollow"][4]')[0].text_content().strip()
            print(f"做种数:{seed_count}")

            # Leecher/downloader count (5th cell).
            download_count = row_follow.xpath('.//td[@class="rowfollow"][5]')[0].text_content().strip()
            print(f"下载数:{download_count}")

            # Snatched/completed count (6th cell).
            completion_count = row_follow.xpath('.//td[@class="rowfollow"][6]')[0].text_content().strip()
            print(f"完成数:{completion_count}")

            # Publisher name (7th cell).
            publisher = row_follow.xpath('.//td[@class="rowfollow"][7]')[0].text_content().strip()
            print(f"发布者:{publisher}")
            # Download link: the <a> wrapping the element with a "download"
            # class; its "id" parameter is the torrent's PT id.
            download_link = row_follow.xpath(
                './/table[@class="torrentname"]//*[contains(@class, "download")]/parent::a/@href')[0]
            pt_id = extract_id(download_link, "id")
            # Detail page link (first href in the name table).
            details_link = row_follow.xpath('.//table[@class="torrentname"]//a[@href]/@href')[0]
            print(
                f"PT_ID == {pt_id} 下载链接:/{download_link} 详情链接:/{details_link}")
|
||
|
||
def main_this_pt(self, section_data):
|
||
res_txt = f"开始对 [{section_data.get('name')}] 进行操作...,抓取数据:"
|
||
logger.info(res_txt)
|
||
url, cookie = section_data.get('url'), section_data.get('cookie')
|
||
self.headers["cookie"] = cookie
|
||
if len(section_data.get("torrents")) > 1:
|
||
self.torrents_uri = section_data.get("torrents")
|