diff --git a/PT/pt_get_data.py b/PT/pt_get_data.py
index 75adc75..c4283dc 100644
--- a/PT/pt_get_data.py
+++ b/PT/pt_get_data.py
@@ -21,7 +21,7 @@ from urllib.parse import urlparse, parse_qs
 
 from qnloft_db.sqlite_db_main import SqliteDbMain
 from qnloft_db_model.PtWebsiteData import PtWebsiteData
-
+from dateutil import parser
 
 def extract_id(url, field):
     parsed_url = urlparse(url)
@@ -49,7 +49,6 @@ class PtGetData:
     def __init__(self):
         logger.add("../log/PtGetData_{time:YYYY-MM-DD}.log", rotation="1 day", level="INFO")
         logger.add(sys.stderr, level="INFO")
-        self.toml_file = 'PT/pt_config.toml'
         self.torrents_uri = "/torrents.php?sort=0&type=desc"
         self.headers = {
             'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
@@ -81,10 +80,10 @@ class PtGetData:
             # Parse the page content
             self.get_common_analysis(section_name, doc_html)
             # Fetch pagination
-            pages = self.get_common_total_page(doc_html)
-            for i in range(0, pages):
-                time.sleep(2)
-                self.get_data_by_page(section_name, section_data, i)
+            # pages = self.get_common_total_page(doc_html)
+            # for i in range(0, pages):
+            #     time.sleep(2)
+            #     self.get_data_by_page(section_name, section_data, i)
             # Persist the data
         except Exception as e:
             print(f"Page could not be parsed, please be aware!!! {e}")
@@ -112,7 +111,6 @@ class PtGetData:
             # print(f"HTML content: {html_content}")
             # Primary title
             first_title = row_follow.xpath('.//table[@class="torrentname"]//a[@title]/@title')[0]
-
             second_title_s = row_follow.xpath(
                 './/table[@class="torrentname"]//td[@class="embedded"]/text()[normalize-space()]'
                 '| .//table[@class="torrentname"]//td[@class="embedded"]//font[@title]/text()')
@@ -121,7 +119,7 @@ class PtGetData:
             for text in second_title_s:
                 second_title = contains_alpha_or_chinese(text) if contains_alpha_or_chinese(
                     text) is not None else None
-
+            print(f"Title: {first_title} Secondary title: {second_title}")
             type_id, type_name = "", ""
             type_html = row_follow.xpath('.//td[contains(@class, "rowfollow")][1]//a[@href]')
             for td_element in type_html:
@@ -150,11 +148,14 @@ class PtGetData:
             comment_count = row_follow.xpath('.//td[@class="rowfollow"][2]//a/text()[normalize-space()]')[0]
             print(f"Comment count: {comment_count}")
 
-            upload_time = row_follow.xpath('.//span[@title][parent::td]/@title')
-            # for td_element in upload_time:
-            #     html_content = lhtml.tostring(td_element, encoding='unicode')
-            #     print(html_content)
-            print(f"Upload time: {upload_time[0]}")
+            upload_time = ""
+            upload_time_html = row_follow.xpath('.//span[@title][parent::td]/@title')
+            for td_element in upload_time_html:
+                try:
+                    upload_time = parser.parse(td_element)
+                except ValueError:
+                    pass
+            print(f"Upload time: {upload_time}")
 
             # Resource size
             size_html = row_follow.xpath('.//td[@class="rowfollow"][3]/text()[normalize-space()]')
@@ -177,7 +178,8 @@ class PtGetData:
             pt_id = extract_id(download_link, "id")
             # Details link
             details_link = row_follow.xpath('.//table[@class="torrentname"]//a[@href]/@href')[0]
-            print(f"PT_ID == {pt_id} Title: {first_title} Secondary title: {second_title} Download link: /{download_link} Details link: /{details_link}")
+            print(
+                f"PT_ID == {pt_id} Download link: /{download_link} Details link: /{details_link}")
             entry = PtWebsiteData(
                 pt_id=pt_id,
                 source_name=section_name,
@@ -244,8 +246,9 @@
 
 
     def opt(self):
+        toml_file = 'PT/pt_config.toml'
         try:
-            with open(self.toml_file, 'r', encoding='utf-8') as file:
+            with open(toml_file, 'r', encoding='utf-8') as file:
                 config_data = toml.load(file)
                 # Iterate over each section
                 for section_name, section_data in config_data.items():
@@ -255,6 +258,20 @@ def opt(self):
                     # Pull the data
                     self.get_data(section_name, section_data)
         except FileNotFoundError:
-            print(f"Error: The file '{self.toml_file}' was not found.")
+            print(f"Error: The file '{toml_file}' was not found.")
         except toml.TomlDecodeError as e:
             print(f"Error decoding TOML: {e}")
+
+
+if __name__ == '__main__':
+    toml_file = 'pt_config.toml'
+    with open(toml_file, 'r', encoding='utf-8') as file:
+        config_data = toml.load(file)
+        # Iterate over each section
+        for section_name, section_data in config_data.items():
+            print(f"Processing section: {section_name} --- {section_data.get('url')}")
+            url, cookie, flag = section_data.get('url'), section_data.get('cookie'), section_data.get('flag')
+            if flag != 1 and cookie is not None and len(cookie.strip()) > 0:
+                # Pull the data
+                PtGetData().get_data(section_name, section_data)
+            break
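Note on the upload-time change in PT/pt_get_data.py above: the rewritten loop relies on dateutil raising ValueError for strings that are not dates, so non-date span titles are skipped, and when several titles parse, the last one wins. A minimal standalone sketch of the same pattern, returning the first parseable value instead (the helper name first_parseable_datetime is illustrative, not part of this repo):

from dateutil import parser


def first_parseable_datetime(candidates):
    # Return the first candidate dateutil can parse as a datetime, else None.
    # Site labels such as '置顶促销' raise ParserError (a ValueError subclass)
    # and are simply skipped.
    for text in candidates:
        try:
            return parser.parse(text)
        except (ValueError, OverflowError):
            continue
    return None


print(first_parseable_datetime(['置顶促销', '中文字幕', '2021-02-02 13:26:26']))
# -> 2021-02-02 13:26:26

Returning on the first successful parse also sidesteps the overwrite behavior of the loop in the diff, where a later parseable title silently replaces an earlier one.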
diff --git a/PT/test.py b/PT/test.py
index 03c645c..e281344 100644
--- a/PT/test.py
+++ b/PT/test.py
@@ -1,4 +1,5 @@
 import time
+from datetime import datetime
 
 import pandas as pd
 import requests
@@ -7,7 +8,7 @@ from lxml import html as lhtml
 from urllib.parse import urlparse, parse_qs
 
 from qnloft_db_model.PtWebsiteData import PtWebsiteData
-
+from dateutil import parser
 
 def extract_id(url, field) -> bytes:
     parsed_url = urlparse(url)
@@ -89,14 +90,22 @@ data = {col: [] for col in columns}
 
 df = pd.DataFrame(data)
 
-for i in range(0,10):
-    # Build one row of data
-    row_data = {'pt_id': i}
+def is_date(s):
+    try:
+        datetime.strptime(s, '%Y-%m-%d %H:%M:%S')
+        return True
+    except ValueError:
+        return False
 
-    # Append the row to the DataFrame
-    df = df.append(row_data, ignore_index=True)
-print(df)
-"""
+
+my_list = ['置顶促销', '国语配音', '中文字幕', '2021-02-02 13:26:26', '2021-02-02', '2021-02-02 13:26']
+for item in my_list:
+    try:
+        parsed_date = parser.parse(item)
+        print(parsed_date)
+    except ValueError:
+        pass
+
+"""
 Primary key id, PT resource id, source name, primary title, secondary title, category id, category name,
 torrent status, status time remaining, seeding status, comment count, upload time, resource size,
 seeder count, download count, completion count, publisher, Douban rating, IMDB rating, download link, details link
diff --git a/qnloft_db/sqlite_db_main.py b/qnloft_db/sqlite_db_main.py
index 9872571..eb9aa86 100644
--- a/qnloft_db/sqlite_db_main.py
+++ b/qnloft_db/sqlite_db_main.py
@@ -22,7 +22,7 @@ class SqliteDbMain(DbMain):
         elif 'macos' in sys_platform.lower():
             __engine = f"/Users/renmeng/Documents/sqlite_db/{self.database_name}"
         else:
-            __engine = f"{self.database_name}"
+            __engine = f"../sqlite_db/{self.database_name}"
         return __engine
 
     def __create_sqlite_engine(self):
diff --git a/qnloft_db_model/PtWebsiteData.py b/qnloft_db_model/PtWebsiteData.py
index 1ea1a07..0d3b829 100644
--- a/qnloft_db_model/PtWebsiteData.py
+++ b/qnloft_db_model/PtWebsiteData.py
@@ -5,7 +5,7 @@ from sqlalchemy import Column, Integer, String, Float, UniqueConstraint
 
 class PtWebsiteData(declarative_base()):
     __tablename__ = 'pt_website_data'
-    id = Column(Integer, primary_key=True)
+    id = Column(Integer, primary_key=True, autoincrement=True)
     # PT resource id
    pt_id = Column(Integer, nullable=False)
     # Source name
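Note on the primary-key change in qnloft_db_model/PtWebsiteData.py: for a single-column Integer primary key, SQLAlchemy already autoincrements by default, so autoincrement=True mainly documents intent. A minimal self-contained sketch, assuming SQLAlchemy 1.4+ (the Demo model and in-memory engine are illustrative only, not part of this repo), showing that rows inserted without an id get one assigned:

from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()


class Demo(Base):
    __tablename__ = 'demo'
    id = Column(Integer, primary_key=True, autoincrement=True)
    name = Column(String)


engine = create_engine('sqlite:///:memory:')
Base.metadata.create_all(engine)

with Session(engine) as session:
    session.add(Demo(name='first'))   # no id supplied; SQLite assigns 1
    session.add(Demo(name='second'))  # and 2 here
    session.commit()
    print([(row.id, row.name) for row in session.query(Demo).all()])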