提交一些代码
This commit is contained in:
parent
f3e2a05e34
commit
cba3feaf4a
|
@ -21,7 +21,7 @@ from urllib.parse import urlparse, parse_qs
|
|||
|
||||
from qnloft_db.sqlite_db_main import SqliteDbMain
|
||||
from qnloft_db_model.PtWebsiteData import PtWebsiteData
|
||||
|
||||
from dateutil import parser
|
||||
|
||||
def extract_id(url, field):
|
||||
parsed_url = urlparse(url)
|
||||
|
@ -49,7 +49,6 @@ class PtGetData:
|
|||
def __init__(self):
|
||||
logger.add("../log/PtGetData_{time:YYYY-MM-DD}.log", rotation="1 day", level="INFO")
|
||||
logger.add(sys.stderr, level="INFO")
|
||||
self.toml_file = 'PT/pt_config.toml'
|
||||
self.torrents_uri = "/torrents.php?sort=0&type=desc"
|
||||
self.headers = {
|
||||
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
|
||||
|
@ -81,10 +80,10 @@ class PtGetData:
|
|||
# 解析网页内容
|
||||
self.get_common_analysis(section_name, doc_html)
|
||||
# 获取分页
|
||||
pages = self.get_common_total_page(doc_html)
|
||||
for i in range(0, pages):
|
||||
time.sleep(2)
|
||||
self.get_data_by_page(section_name, section_data, i)
|
||||
# pages = self.get_common_total_page(doc_html)
|
||||
# for i in range(0, pages):
|
||||
# time.sleep(2)
|
||||
# self.get_data_by_page(section_name, section_data, i)
|
||||
# 数据入库
|
||||
except Exception as e:
|
||||
print(f"页面无法解析,请知晓!!!{e}")
|
||||
|
@ -112,7 +111,6 @@ class PtGetData:
|
|||
# print(f"html内容:{html_content}")
|
||||
# 一级标题
|
||||
first_title = row_follow.xpath('.//table[@class="torrentname"]//a[@title]/@title')[0]
|
||||
|
||||
second_title_s = row_follow.xpath(
|
||||
'.//table[@class="torrentname"]//td[@class="embedded"]/text()[normalize-space()]'
|
||||
'| .//table[@class="torrentname"]//td[@class="embedded"]//font[@title]/text()')
|
||||
|
@ -121,7 +119,7 @@ class PtGetData:
|
|||
for text in second_title_s:
|
||||
second_title = contains_alpha_or_chinese(text) if contains_alpha_or_chinese(
|
||||
text) is not None else None
|
||||
|
||||
print(f"标题:{first_title} 二级标题:{second_title}")
|
||||
type_id, type_name = "", ""
|
||||
type_html = row_follow.xpath('.//td[contains(@class, "rowfollow")][1]//a[@href]')
|
||||
for td_element in type_html:
|
||||
|
@ -150,11 +148,14 @@ class PtGetData:
|
|||
comment_count = row_follow.xpath('.//td[@class="rowfollow"][2]//a/text()[normalize-space()]')[0]
|
||||
print(f"评论数:{comment_count}")
|
||||
|
||||
upload_time = row_follow.xpath('.//span[@title][parent::td]/@title')
|
||||
# for td_element in upload_time:
|
||||
# html_content = lhtml.tostring(td_element, encoding='unicode')
|
||||
# print(html_content)
|
||||
print(f"资源上传时间:{upload_time[0]}")
|
||||
upload_time = ""
|
||||
upload_time_html = row_follow.xpath('.//span[@title][parent::td]/@title')
|
||||
for td_element in upload_time_html:
|
||||
try:
|
||||
upload_time = parser.parse(td_element)
|
||||
except ValueError:
|
||||
pass
|
||||
print(f"资源上传时间:{upload_time}")
|
||||
|
||||
# 资源大小
|
||||
size_html = row_follow.xpath('.//td[@class="rowfollow"][3]/text()[normalize-space()]')
|
||||
|
@ -177,7 +178,8 @@ class PtGetData:
|
|||
pt_id = extract_id(download_link, "id")
|
||||
# 详情链接地址
|
||||
details_link = row_follow.xpath('.//table[@class="torrentname"]//a[@href]/@href')[0]
|
||||
print(f"PT_ID == {pt_id} 标题:{first_title} 二级标题:{second_title} 下载链接:/{download_link} 详情链接:/{details_link}")
|
||||
print(
|
||||
f"PT_ID == {pt_id} 下载链接:/{download_link} 详情链接:/{details_link}")
|
||||
entry = PtWebsiteData(
|
||||
pt_id=pt_id,
|
||||
source_name=section_name,
|
||||
|
@ -244,8 +246,9 @@ class PtGetData:
|
|||
|
||||
|
||||
def opt(self):
|
||||
toml_file = 'PT/pt_config.toml'
|
||||
try:
|
||||
with open(self.toml_file, 'r', encoding='utf-8') as file:
|
||||
with open(toml_file, 'r', encoding='utf-8') as file:
|
||||
config_data = toml.load(file)
|
||||
# 迭代每个 section
|
||||
for section_name, section_data in config_data.items():
|
||||
|
@ -255,6 +258,20 @@ def opt(self):
|
|||
# 拉取数据
|
||||
self.get_data(section_name, section_data)
|
||||
except FileNotFoundError:
|
||||
print(f"Error: The file '{self.toml_file}' was not found.")
|
||||
print(f"Error: The file '{toml_file}' was not found.")
|
||||
except toml.TomlDecodeError as e:
|
||||
print(f"Error decoding TOML: {e}")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
toml_file = 'pt_config.toml'
|
||||
with open(toml_file, 'r', encoding='utf-8') as file:
|
||||
config_data = toml.load(file)
|
||||
# 迭代每个 section
|
||||
for section_name, section_data in config_data.items():
|
||||
print(f"Processing section: {section_name} --- {section_data.get('url')}")
|
||||
url, cookie, flag = section_data.get('url'), section_data.get('cookie'), section_data.get('flag')
|
||||
if flag != 1 and cookie is not None and len(cookie.strip()) > 0:
|
||||
# 拉取数据
|
||||
PtGetData().get_data(section_name, section_data)
|
||||
break
|
||||
|
|
25
PT/test.py
25
PT/test.py
|
@ -1,4 +1,5 @@
|
|||
import time
|
||||
from datetime import datetime
|
||||
|
||||
import pandas as pd
|
||||
import requests
|
||||
|
@ -7,7 +8,7 @@ from lxml import html as lhtml
|
|||
from urllib.parse import urlparse, parse_qs
|
||||
|
||||
from qnloft_db_model.PtWebsiteData import PtWebsiteData
|
||||
|
||||
from dateutil import parser
|
||||
|
||||
def extract_id(url, field) -> bytes:
|
||||
parsed_url = urlparse(url)
|
||||
|
@ -89,14 +90,22 @@ data = {col: [] for col in columns}
|
|||
df = pd.DataFrame(data)
|
||||
|
||||
|
||||
for i in range(0,10):
|
||||
# 创建一行数据
|
||||
row_data = {'pt_id': i}
|
||||
def is_date(s):
    """Return True if *s* is a ``YYYY-MM-DD HH:MM:SS`` timestamp string.

    Args:
        s: Candidate value, normally a string.

    Returns:
        True when ``datetime.strptime`` accepts *s* with the format
        ``'%Y-%m-%d %H:%M:%S'``; False for any other string and for
        non-string values such as ``None``.
    """
    try:
        datetime.strptime(s, '%Y-%m-%d %H:%M:%S')
    except (TypeError, ValueError):
        # ValueError: a string that does not match the format.
        # TypeError: strptime was handed a non-string; a predicate should
        # answer False for that rather than raise.
        return False
    return True
|
||||
|
||||
# 将一行数据添加到 DataFrame
|
||||
df = df.append(row_data, ignore_index=True)
|
||||
print(df)
|
||||
"""
|
||||
|
||||
# Sample cell texts: three label strings followed by three timestamps of
# decreasing precision.
my_list = [
    '置顶促销',
    '国语配音',
    '中文字幕',
    '2021-02-02 13:26:26',
    '2021-02-02',
    '2021-02-02 13:26',
]

# Print every entry that parser.parse accepts; an entry that raises
# ValueError is skipped without any output.
for item in my_list:
    try:
        parsed_date = parser.parse(item)
    except ValueError:
        continue
    print(parsed_date)
|
||||
"""
|
||||
主键id,pt资源id,来源名称,一级标题,二级标题,分类id,分类名称
|
||||
种子状态,状态剩余时间,做种状态,评论数,资源上传时间,资源大小,
|
||||
做种数,下载数,完成数,发布者,豆瓣评分,IMDB评分,下载链接,详情链接
|
||||
|
|
|
@ -22,7 +22,7 @@ class SqliteDbMain(DbMain):
|
|||
elif 'macos' in sys_platform.lower():
|
||||
__engine = f"/Users/renmeng/Documents/sqlite_db/{self.database_name}"
|
||||
else:
|
||||
__engine = f"{self.database_name}"
|
||||
__engine = f"../sqlite_db/{self.database_name}"
|
||||
return __engine
|
||||
|
||||
def __create_sqlite_engine(self):
|
||||
|
|
|
@ -5,7 +5,7 @@ from sqlalchemy import Column, Integer, String, Float, UniqueConstraint
|
|||
class PtWebsiteData(declarative_base()):
|
||||
__tablename__ = 'pt_website_data'
|
||||
|
||||
id = Column(Integer, primary_key=True)
|
||||
id = Column(Integer, primary_key=True, autoincrement=True)
|
||||
# pt资源id
|
||||
pt_id = Column(Integer, nullable=False)
|
||||
# 来源名称
|
||||
|
|
Loading…
Reference in New Issue