提交一些代码

This commit is contained in:
rm 2024-01-17 18:22:04 +08:00
parent f3e2a05e34
commit cba3feaf4a
4 changed files with 52 additions and 26 deletions

View File

@ -21,7 +21,7 @@ from urllib.parse import urlparse, parse_qs
from qnloft_db.sqlite_db_main import SqliteDbMain from qnloft_db.sqlite_db_main import SqliteDbMain
from qnloft_db_model.PtWebsiteData import PtWebsiteData from qnloft_db_model.PtWebsiteData import PtWebsiteData
from dateutil import parser
def extract_id(url, field): def extract_id(url, field):
parsed_url = urlparse(url) parsed_url = urlparse(url)
@ -49,7 +49,6 @@ class PtGetData:
def __init__(self): def __init__(self):
logger.add("../log/PtGetData_{time:YYYY-MM-DD}.log", rotation="1 day", level="INFO") logger.add("../log/PtGetData_{time:YYYY-MM-DD}.log", rotation="1 day", level="INFO")
logger.add(sys.stderr, level="INFO") logger.add(sys.stderr, level="INFO")
self.toml_file = 'PT/pt_config.toml'
self.torrents_uri = "/torrents.php?sort=0&type=desc" self.torrents_uri = "/torrents.php?sort=0&type=desc"
self.headers = { self.headers = {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
@ -81,10 +80,10 @@ class PtGetData:
# 解析网页内容 # 解析网页内容
self.get_common_analysis(section_name, doc_html) self.get_common_analysis(section_name, doc_html)
# 获取分页 # 获取分页
pages = self.get_common_total_page(doc_html) # pages = self.get_common_total_page(doc_html)
for i in range(0, pages): # for i in range(0, pages):
time.sleep(2) # time.sleep(2)
self.get_data_by_page(section_name, section_data, i) # self.get_data_by_page(section_name, section_data, i)
# 数据入库 # 数据入库
except Exception as e: except Exception as e:
print(f"页面无法解析,请知晓!!!{e}") print(f"页面无法解析,请知晓!!!{e}")
@ -112,7 +111,6 @@ class PtGetData:
# print(f"html内容{html_content}") # print(f"html内容{html_content}")
# 一级标题 # 一级标题
first_title = row_follow.xpath('.//table[@class="torrentname"]//a[@title]/@title')[0] first_title = row_follow.xpath('.//table[@class="torrentname"]//a[@title]/@title')[0]
second_title_s = row_follow.xpath( second_title_s = row_follow.xpath(
'.//table[@class="torrentname"]//td[@class="embedded"]/text()[normalize-space()]' './/table[@class="torrentname"]//td[@class="embedded"]/text()[normalize-space()]'
'| .//table[@class="torrentname"]//td[@class="embedded"]//font[@title]/text()') '| .//table[@class="torrentname"]//td[@class="embedded"]//font[@title]/text()')
@ -121,7 +119,7 @@ class PtGetData:
for text in second_title_s: for text in second_title_s:
second_title = contains_alpha_or_chinese(text) if contains_alpha_or_chinese( second_title = contains_alpha_or_chinese(text) if contains_alpha_or_chinese(
text) is not None else None text) is not None else None
print(f"标题:{first_title} 二级标题:{second_title}")
type_id, type_name = "", "" type_id, type_name = "", ""
type_html = row_follow.xpath('.//td[contains(@class, "rowfollow")][1]//a[@href]') type_html = row_follow.xpath('.//td[contains(@class, "rowfollow")][1]//a[@href]')
for td_element in type_html: for td_element in type_html:
@ -150,11 +148,14 @@ class PtGetData:
comment_count = row_follow.xpath('.//td[@class="rowfollow"][2]//a/text()[normalize-space()]')[0] comment_count = row_follow.xpath('.//td[@class="rowfollow"][2]//a/text()[normalize-space()]')[0]
print(f"评论数:{comment_count}") print(f"评论数:{comment_count}")
upload_time = row_follow.xpath('.//span[@title][parent::td]/@title') upload_time = ""
# for td_element in upload_time: upload_time_html = row_follow.xpath('.//span[@title][parent::td]/@title')
# html_content = lhtml.tostring(td_element, encoding='unicode') for td_element in upload_time_html:
# print(html_content) try:
print(f"资源上传时间:{upload_time[0]}") upload_time = parser.parse(td_element)
except ValueError:
pass
print(f"资源上传时间:{upload_time}")
# 资源大小 # 资源大小
size_html = row_follow.xpath('.//td[@class="rowfollow"][3]/text()[normalize-space()]') size_html = row_follow.xpath('.//td[@class="rowfollow"][3]/text()[normalize-space()]')
@ -177,7 +178,8 @@ class PtGetData:
pt_id = extract_id(download_link, "id") pt_id = extract_id(download_link, "id")
# 详情链接地址 # 详情链接地址
details_link = row_follow.xpath('.//table[@class="torrentname"]//a[@href]/@href')[0] details_link = row_follow.xpath('.//table[@class="torrentname"]//a[@href]/@href')[0]
print(f"PT_ID == {pt_id} 标题:{first_title} 二级标题:{second_title} 下载链接:/{download_link} 详情链接:/{details_link}") print(
f"PT_ID == {pt_id} 下载链接:/{download_link} 详情链接:/{details_link}")
entry = PtWebsiteData( entry = PtWebsiteData(
pt_id=pt_id, pt_id=pt_id,
source_name=section_name, source_name=section_name,
@ -244,8 +246,9 @@ class PtGetData:
def opt(self): def opt(self):
toml_file = 'PT/pt_config.toml'
try: try:
with open(self.toml_file, 'r', encoding='utf-8') as file: with open(toml_file, 'r', encoding='utf-8') as file:
config_data = toml.load(file) config_data = toml.load(file)
# 迭代每个 section # 迭代每个 section
for section_name, section_data in config_data.items(): for section_name, section_data in config_data.items():
@ -255,6 +258,20 @@ def opt(self):
# 拉取数据 # 拉取数据
self.get_data(section_name, section_data) self.get_data(section_name, section_data)
except FileNotFoundError: except FileNotFoundError:
print(f"Error: The file '{self.toml_file}' was not found.") print(f"Error: The file '{toml_file}' was not found.")
except toml.TomlDecodeError as e: except toml.TomlDecodeError as e:
print(f"Error decoding TOML: {e}") print(f"Error decoding TOML: {e}")
if __name__ == '__main__':
toml_file = 'pt_config.toml'
with open(toml_file, 'r', encoding='utf-8') as file:
config_data = toml.load(file)
# 迭代每个 section
for section_name, section_data in config_data.items():
print(f"Processing section: {section_name} --- {section_data.get('url')}")
url, cookie, flag = section_data.get('url'), section_data.get('cookie'), section_data.get('flag')
if flag != 1 and cookie is not None and len(cookie.strip()) > 0:
# 拉取数据
PtGetData().get_data(section_name, section_data)
break

View File

@ -1,4 +1,5 @@
import time import time
from datetime import datetime
import pandas as pd import pandas as pd
import requests import requests
@ -7,7 +8,7 @@ from lxml import html as lhtml
from urllib.parse import urlparse, parse_qs from urllib.parse import urlparse, parse_qs
from qnloft_db_model.PtWebsiteData import PtWebsiteData from qnloft_db_model.PtWebsiteData import PtWebsiteData
from dateutil import parser
def extract_id(url, field) -> bytes: def extract_id(url, field) -> bytes:
parsed_url = urlparse(url) parsed_url = urlparse(url)
@ -89,14 +90,22 @@ data = {col: [] for col in columns}
df = pd.DataFrame(data) df = pd.DataFrame(data)
for i in range(0,10): def is_date(s):
# 创建一行数据 try:
row_data = {'pt_id': i} datetime.strptime(s, '%Y-%m-%d %H:%M:%S')
return True
except ValueError:
return False
# 将一行数据添加到 DataFrame
df = df.append(row_data, ignore_index=True) my_list = ['置顶促销', '国语配音', '中文字幕', '2021-02-02 13:26:26','2021-02-02','2021-02-02 13:26']
print(df) for item in my_list:
""" try:
parsed_date = parser.parse(item)
print(parsed_date)
except ValueError:
pass
"""
主键id,pt资源id,来源名称,一级标题,二级标题,分类id分类名称 主键id,pt资源id,来源名称,一级标题,二级标题,分类id分类名称
种子状态,状态剩余时间,做种状态,评论数,资源上传时间,资源大小 种子状态,状态剩余时间,做种状态,评论数,资源上传时间,资源大小
做种数,下载数,完成数发布者豆瓣评分IMDB评分下载链接详情链接 做种数,下载数,完成数发布者豆瓣评分IMDB评分下载链接详情链接

View File

@ -22,7 +22,7 @@ class SqliteDbMain(DbMain):
elif 'macos' in sys_platform.lower(): elif 'macos' in sys_platform.lower():
__engine = f"/Users/renmeng/Documents/sqlite_db/{self.database_name}" __engine = f"/Users/renmeng/Documents/sqlite_db/{self.database_name}"
else: else:
__engine = f"{self.database_name}" __engine = f"../sqlite_db/{self.database_name}"
return __engine return __engine
def __create_sqlite_engine(self): def __create_sqlite_engine(self):

View File

@ -5,7 +5,7 @@ from sqlalchemy import Column, Integer, String, Float, UniqueConstraint
class PtWebsiteData(declarative_base()): class PtWebsiteData(declarative_base()):
__tablename__ = 'pt_website_data' __tablename__ = 'pt_website_data'
id = Column(Integer, primary_key=True) id = Column(Integer, primary_key=True, autoincrement=True)
# pt资源id # pt资源id
pt_id = Column(Integer, nullable=False) pt_id = Column(Integer, nullable=False)
# 来源名称 # 来源名称