From ca4e3449131dacddde0d98c2cfbe18b81384cd2c Mon Sep 17 00:00:00 2001 From: rm Date: Tue, 16 Jan 2024 01:24:19 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E4=B8=80=E4=BA=9B=E6=9B=B4?= =?UTF-8?q?=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- PT/test.html | 4 +-- PT/test.py | 74 +++++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 70 insertions(+), 8 deletions(-) diff --git a/PT/test.html b/PT/test.html index f03eceb..c409282 100644 --- a/PT/test.html +++ b/PT/test.html @@ -111,7 +111,7 @@ - 0 + 0 @@ -125,7 +125,7 @@ - 186 + 123 diff --git a/PT/test.py b/PT/test.py index ecf5758..0af380c 100644 --- a/PT/test.py +++ b/PT/test.py @@ -4,6 +4,17 @@ import requests import toml from lxml import html as lhtml + +def contains_alpha_or_chinese(input_str): + s = input_str.strip() + # 判断是否包含字母 + has_alpha = any(char.isalpha() for char in s) + # 判断是否包含汉字 + has_chinese = any('\u4e00' <= char <= '\u9fff' for char in s) + # 返回结果 + return s if has_alpha or has_chinese else None + + # headers = { # 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', # 'accept-language': 'zh,zh-CN;q=0.9', @@ -43,24 +54,74 @@ from lxml import html as lhtml # print(f"{section_name} , 5次出现错误,无法访问!!!") # if len(html) == 0: # break -with open('/Users/renmeng/Downloads/test.html', 'r', encoding='utf-8') as file: +with open('test.html', 'r', encoding='utf-8') as file: html_code = file.read() try: # 使用lxml解析HTML doc = lhtml.fromstring(html_code) # 使用XPath获取目标元素 title_elements = doc.xpath('//table[@class="torrents"]//table[@class="torrentname"]//a[@title]/@title') - print(title_elements[0]) + print(f"标题:{title_elements[0]}") + second_title_s = doc.xpath( + '//table[@class="torrents"]//table[@class="torrentname"]//td[@class="embedded"]/text()[normalize-space()]') + + second_title = "" + for text in second_title_s: + second_title = contains_alpha_or_chinese(text) if contains_alpha_or_chinese(text) is not None else None + print(f"二级标题:{second_title}") + + seed_status = doc.xpath('//table[@class="torrents"]//table[@class="torrentname"]//td[1]//img[@alt]/@alt') + if len(seed_status) > 0: + print(f"种子状态:{seed_status[1]}") + else: + print(f"种子状态:{seed_status[0]}") + + seeding_status = doc.xpath( + '//table[@class="torrents"]//table[@class="torrentname"]//div[@class="progressarea"]/@title') + print(f"做种状态:{seeding_status[0]}") + + comment_count = doc.xpath('//td[@class="rowfollow"][2]//a/text()[normalize-space()]') + print(f"评论数:{comment_count[0]}") + + upload_time = doc.xpath('//td[contains(@class, "rowfollow")][4]//span/@title') + # for td_element in upload_time: + # html_content = lhtml.tostring(td_element, encoding='unicode') + # print(html_content) + print(f"资源上传时间:{upload_time[0]}") + + size = doc.xpath('//td[@class="rowfollow"][3]/text()[normalize-space()]') + print(f"资源大小:{size[0].strip() + '' + size[1].strip()}") + + seed_count = doc.xpath('//td[@class="rowfollow"][4]')[0] + print(f"做种数:{seed_count.text_content().strip()}") + + download_count = doc.xpath('//td[@class="rowfollow"][5]')[0] + print(f"下载数:{download_count.text_content().strip()}") + + completion_count = doc.xpath('//td[@class="rowfollow"][6]')[0] + print(f"完成数:{completion_count.text_content().strip()}") + + publisher = doc.xpath('//td[@class="rowfollow"][7]')[0] + print(f"发布者:{publisher.text_content().strip()}") + download_link = doc.xpath('//table[@class="torrents"]//table[@class="torrentname"]//img[@class="download"]/parent::a/@href') + print(f"下载链接:/{download_link[0]}") # 详情链接地址 details_link = doc.xpath('//table[@class="torrents"]//table[@class="torrentname"]//a[@href]/@href') - print(f"/{details_link[0]}") + print(f"详情链接:/{details_link[0]}") + + douban_rating = doc.xpath('') + print(f"豆瓣评分:/{douban_rating[0]}") + + imdb_rating = doc.xpath('') + print(f"imdb_rating:/{imdb_rating[0]}") except Exception as e: + print(e) print(f"页面无法解析,请知晓!!!") """ -主键id,来源名称,一级标题,二级标题,种子状态,状态剩余时间,做种状态,评论数,做种数,下载数,完成数,发布者,豆瓣评分,IMDB评分,下载链接,详情链接,资源大小 +主键id,来源名称,一级标题,二级标题,种子状态,状态剩余时间,做种状态,评论数,资源上传时间,资源大小,做种数,下载数,完成数,发布者,豆瓣评分,IMDB评分,下载链接,详情链接 CREATE TABLE IF NOT EXISTS pt_website_data ( id INTEGER PRIMARY KEY, source_name TEXT NOT NULL, @@ -70,6 +131,8 @@ CREATE TABLE IF NOT EXISTS pt_website_data ( status_remaining_time TEXT, seeding_status TEXT, comment_count INTEGER, + upload_time TEXT, + size TEXT, seed_count INTEGER, download_count INTEGER, completion_count INTEGER, @@ -78,7 +141,6 @@ CREATE TABLE IF NOT EXISTS pt_website_data ( imdb_rating REAL, download_link TEXT, details_link TEXT, - size TEXT, UNIQUE(source_name, first_title, second_title) ); @@ -88,4 +150,4 @@ CREATE TABLE IF NOT EXISTS pt_website_type ( type_name TEXT NOT NULL, type_url TEXT NOT NULL ); -""" \ No newline at end of file +"""