# qnloft-spider/PT/pt_impl/default_pt.py
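"""Default PT (private-tracker) listing scraper: fetches a torrent list page and extracts each row's fields."""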

import sys
import time
import requests
from lxml import html as lhtml
from PT.pt_impl.subject_pt import SubjectPt
from loguru import logger
from PT.pt_impl.util import extract_id, check_seed_status, contains_alpha_or_chinese
from dateutil import parser


class DefaultSubject(SubjectPt):
    def __init__(self):
        logger.add("../log/PtGetData_{time:YYYY-MM-DD}.log", rotation="1 day", level="INFO")
        logger.add(sys.stderr, level="INFO")
        self.headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'accept-language': 'zh,zh-CN;q=0.9',
            'cache-control': 'max-age=0',
            'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"macOS"',
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'same-origin',
            'sec-fetch-user': '?1',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        }

    def request_url(self, url):
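        """Fetch *url* with up to five retries; return a parsed lxml Element on success, or "" on failure."""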
        html = ""
        for i in range(5):
            logger.info(f"Starting fetch attempt {i} for: {url}!")
            try:
                response = requests.get(url, headers=self.headers, timeout=5 * 60)
                if response.status_code == 200:
                    html = response.text
                    break
                logger.error(f"{url}, got error status code {response.status_code}, {response.text}")
            except Exception as e:
                logger.warning(f"{url} request failed: {e!r}, retrying in 2s")
                time.sleep(2)
        else:
            logger.error(f"{url}, failed 5 times and could not be reached")
        if len(html) != 0:
            # Parse the HTML text and convert it into an Element object
            return lhtml.fromstring(html)
        return html

    def get_page_num(self, page_html):
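        """Read the total page count from the last pagination link; return 0 if it is not numeric."""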
        page_href = page_html.xpath('//td[@class="embedded"]//p[@align="center"][1]//a[last()]/@href')[0]
        pages_str = extract_id(page_href, "page")
        return int(pages_str) if pages_str.isdigit() else 0

    def get_list_data(self, page_html):
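        """Walk the torrent listing table row by row and print the fields extracted from each entry."""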
        # Parse the HTML with lxml: every row after the header is one torrent
        row_follow_tables = page_html.xpath('//table[@class="torrents"]//tr[position() > 1]')
        for row_follow in row_follow_tables:
            html_content = lhtml.tostring(row_follow, encoding='unicode')
            # print(f"row HTML: {html_content}")
            # Primary title
            first_title = row_follow.xpath('.//table[@class="torrentname"]//a[@title]/@title')[0]
            second_title_s = row_follow.xpath(
                './/table[@class="torrentname"]//td[@class="embedded"]/text()[normalize-space()]'
                '| .//table[@class="torrentname"]//td[@class="embedded"]//font[@title]/text()')
            # Secondary title: keep the last fragment that actually contains letters or Chinese text
            second_title = ""
            for text in second_title_s:
                result = contains_alpha_or_chinese(text)
                if result is not None:
                    second_title = result
            print(f"Title: {first_title} Secondary title: {second_title}")
            # Category id and name
            type_id, type_name = "", ""
            type_html = row_follow.xpath('.//td[contains(@class, "rowfollow")][1]//a[@href]')
            for td_element in type_html:
                type_id = extract_id(td_element.xpath('./@href')[0], "cat")
                type_name = td_element.xpath('.//img[@title]/@title')[0]
                # html_content = lhtml.tostring(td_element, encoding='unicode')
            print(f"Type: {type_id} {type_name}")
            # Seed status, derived from the alt text of icons in the name cell
            seed_status = 1
            seed_status_html = row_follow.xpath(
                './/table[@class="torrentname"]//td[@class="embedded"]//img[@alt]/@alt')
            if len(seed_status_html) > 0:
                for seed in seed_status_html:
                    s = check_seed_status(seed)
                    if s is not None:
                        seed_status = s
            print(f"Seed status: {seed_status}")
            # Seeding status: treated as 1 when a title-bearing div is present in the name cell
            seeding_status = 0
            seeding_status_html = row_follow.xpath(
                './/table[@class="torrentname"]//div[@title]/@title')
            if len(seeding_status_html) > 0:
                seeding_status = 1
            print(f"Seeding status: {seeding_status}")
            comment_count = row_follow.xpath('.//td[@class="rowfollow"][2]//a/text()[normalize-space()]')[0]
            print(f"Comment count: {comment_count}")
            # Upload time, parsed from the title attribute of the timestamp span
            upload_time = ""
            upload_time_html = row_follow.xpath('.//span[@title][parent::td]/@title')
            for td_element in upload_time_html:
                try:
                    upload_time = parser.parse(td_element)
                except ValueError:
                    pass
            print(f"Upload time: {upload_time}")
            # Resource size: the number and the unit are separate text nodes
            size_html = row_follow.xpath('.//td[@class="rowfollow"][3]/text()[normalize-space()]')
            size = size_html[0].strip() + ' ' + size_html[1].strip()
            print(f"Size: {size}")
            seed_count = row_follow.xpath('.//td[@class="rowfollow"][4]')[0].text_content().strip()
            print(f"Seed count: {seed_count}")
            download_count = row_follow.xpath('.//td[@class="rowfollow"][5]')[0].text_content().strip()
            print(f"Download count: {download_count}")
            completion_count = row_follow.xpath('.//td[@class="rowfollow"][6]')[0].text_content().strip()
            print(f"Completion count: {completion_count}")
            publisher = row_follow.xpath('.//td[@class="rowfollow"][7]')[0].text_content().strip()
            print(f"Publisher: {publisher}")
            download_link = row_follow.xpath(
                './/table[@class="torrentname"]//*[contains(@class, "download")]/parent::a/@href')[0]
            pt_id = extract_id(download_link, "id")
            # Details page URL
            details_link = row_follow.xpath('.//table[@class="torrentname"]//a[@href]/@href')[0]
            print(
                f"PT_ID == {pt_id} Download link: /{download_link} Details link: /{details_link}")

    def main_this_pt(self, section_data):
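        """Entry point for one PT site described by *section_data* (name, url, cookie, torrents)."""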
        res_txt = f"Starting operations on [{section_data.get('name')}]..., scraping data:"
        logger.info(res_txt)
        url, cookie = section_data.get('url'), section_data.get('cookie')
        self.headers["cookie"] = cookie
        if len(section_data.get("torrents")) > 1:
            self.torrents_uri = section_data.get("torrents")
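
# A minimal usage sketch, assuming a section_data dict shaped like the one
# main_this_pt reads above; the concrete values below are hypothetical:
#
#     subject = DefaultSubject()
#     subject.main_this_pt({
#         "name": "example-pt",
#         "url": "https://pt.example.com",
#         "cookie": "c_secure_uid=...; c_secure_pass=...",
#         "torrents": "torrents.php",
#     })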