Commit some code

parent f3e2a05e34
commit cba3feaf4a
@@ -21,7 +21,7 @@ from urllib.parse import urlparse, parse_qs
 from qnloft_db.sqlite_db_main import SqliteDbMain
 from qnloft_db_model.PtWebsiteData import PtWebsiteData
+from dateutil import parser


 def extract_id(url, field):
     parsed_url = urlparse(url)
@@ -49,7 +49,6 @@ class PtGetData:
     def __init__(self):
         logger.add("../log/PtGetData_{time:YYYY-MM-DD}.log", rotation="1 day", level="INFO")
         logger.add(sys.stderr, level="INFO")
-        self.toml_file = 'PT/pt_config.toml'
         self.torrents_uri = "/torrents.php?sort=0&type=desc"
         self.headers = {
             'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
@@ -81,10 +80,10 @@ class PtGetData:
             # Parse the page content
             self.get_common_analysis(section_name, doc_html)
             # Get pagination
-            pages = self.get_common_total_page(doc_html)
-            for i in range(0, pages):
-                time.sleep(2)
-                self.get_data_by_page(section_name, section_data, i)
+            # pages = self.get_common_total_page(doc_html)
+            # for i in range(0, pages):
+            #     time.sleep(2)
+            #     self.get_data_by_page(section_name, section_data, i)
             # Store the data in the database
         except Exception as e:
             print(f"Page could not be parsed, please be aware!!! {e}")
@@ -112,7 +111,6 @@ class PtGetData:
             # print(f"HTML content: {html_content}")
             # First-level title
             first_title = row_follow.xpath('.//table[@class="torrentname"]//a[@title]/@title')[0]
-
             second_title_s = row_follow.xpath(
                 './/table[@class="torrentname"]//td[@class="embedded"]/text()[normalize-space()]'
                 '| .//table[@class="torrentname"]//td[@class="embedded"]//font[@title]/text()')
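Note on the title extraction above: the first XPath takes the @title attribute of the torrent link, the second collects any non-blank text (or font@title text) inside the embedded cell. A self-contained sketch against invented row markup (the real HTML comes from the site's /torrents.php page):

from lxml import html as lhtml

# Invented markup standing in for one row_follow element
snippet = """
<table>
  <tr><td>
    <table class="torrentname">
      <tr><td class="embedded">
        <a title="Some.Movie.2023.1080p">Some.Movie.2023.1080p</a>
        A secondary description line
      </td></tr>
    </table>
  </td></tr>
</table>
"""

row_follow = lhtml.fromstring(snippet)
first_title = row_follow.xpath('.//table[@class="torrentname"]//a[@title]/@title')[0]
second_title_s = row_follow.xpath(
    './/table[@class="torrentname"]//td[@class="embedded"]/text()[normalize-space()]'
    '| .//table[@class="torrentname"]//td[@class="embedded"]//font[@title]/text()')
print(first_title)                          # Some.Movie.2023.1080p
print([t.strip() for t in second_title_s])  # ['A secondary description line']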
@@ -121,7 +119,7 @@ class PtGetData:
             for text in second_title_s:
                 second_title = contains_alpha_or_chinese(text) if contains_alpha_or_chinese(
                     text) is not None else None
-
+            print(f"Title: {first_title}  Secondary title: {second_title}")
             type_id, type_name = "", ""
             type_html = row_follow.xpath('.//td[contains(@class, "rowfollow")][1]//a[@href]')
             for td_element in type_html:
@@ -150,11 +148,14 @@ class PtGetData:
             comment_count = row_follow.xpath('.//td[@class="rowfollow"][2]//a/text()[normalize-space()]')[0]
             print(f"Comment count: {comment_count}")

-            upload_time = row_follow.xpath('.//span[@title][parent::td]/@title')
-            # for td_element in upload_time:
-            #     html_content = lhtml.tostring(td_element, encoding='unicode')
-            #     print(html_content)
-            print(f"Resource upload time: {upload_time[0]}")
+            upload_time = ""
+            upload_time_html = row_follow.xpath('.//span[@title][parent::td]/@title')
+            for td_element in upload_time_html:
+                try:
+                    upload_time = parser.parse(td_element)
+                except ValueError:
+                    pass
+            print(f"Resource upload time: {upload_time}")

             # Resource size
             size_html = row_follow.xpath('.//td[@class="rowfollow"][3]/text()[normalize-space()]')
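This rewritten block is the substance of the commit: instead of printing the raw @title attribute, each candidate string is run through dateutil, and non-date attributes are skipped. The same pattern in isolation (sample values invented; dateutil's ParserError subclasses ValueError, so the except clause catches it):

from dateutil import parser

upload_time = ""
for td_element in ["2021-02-02 13:26:26", "not a timestamp"]:
    try:
        upload_time = parser.parse(td_element)  # datetime(2021, 2, 2, 13, 26, 26)
    except ValueError:
        pass  # attribute was not a timestamp; keep the last good value
print(f"Resource upload time: {upload_time}")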
@@ -177,7 +178,8 @@ class PtGetData:
             pt_id = extract_id(download_link, "id")
             # Details page link
             details_link = row_follow.xpath('.//table[@class="torrentname"]//a[@href]/@href')[0]
-            print(f"PT_ID == {pt_id} Title: {first_title} Secondary title: {second_title} Download link: /{download_link} Details link: /{details_link}")
+            print(
+                f"PT_ID == {pt_id} Download link: /{download_link} Details link: /{details_link}")
             entry = PtWebsiteData(
                 pt_id=pt_id,
                 source_name=section_name,
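extract_id appears in this hunk only at its call site. Given the urlparse/parse_qs imports at the top of the file, a plausible implementation is a query-string lookup; this is an assumption, not the committed code:

from urllib.parse import urlparse, parse_qs

def extract_id(url, field):
    # Assumed behaviour: pull one query parameter (e.g. "id") out of a link
    parsed_url = urlparse(url)
    values = parse_qs(parsed_url.query).get(field)
    return values[0] if values else None

print(extract_id("download.php?id=12345&passkey=abc", "id"))  # 12345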
@@ -244,8 +246,9 @@ class PtGetData:


     def opt(self):
+        toml_file = 'PT/pt_config.toml'
         try:
-            with open(self.toml_file, 'r', encoding='utf-8') as file:
+            with open(toml_file, 'r', encoding='utf-8') as file:
                 config_data = toml.load(file)
                 # Iterate over each section
                 for section_name, section_data in config_data.items():
|
@ -255,6 +258,20 @@ def opt(self):
|
||||||
# 拉取数据
|
# 拉取数据
|
||||||
self.get_data(section_name, section_data)
|
self.get_data(section_name, section_data)
|
||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
print(f"Error: The file '{self.toml_file}' was not found.")
|
print(f"Error: The file '{toml_file}' was not found.")
|
||||||
except toml.TomlDecodeError as e:
|
except toml.TomlDecodeError as e:
|
||||||
print(f"Error decoding TOML: {e}")
|
print(f"Error decoding TOML: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
toml_file = 'pt_config.toml'
|
||||||
|
with open(toml_file, 'r', encoding='utf-8') as file:
|
||||||
|
config_data = toml.load(file)
|
||||||
|
# 迭代每个 section
|
||||||
|
for section_name, section_data in config_data.items():
|
||||||
|
print(f"Processing section: {section_name} --- {section_data.get('url')}")
|
||||||
|
url, cookie, flag = section_data.get('url'), section_data.get('cookie'), section_data.get('flag')
|
||||||
|
if flag != 1 and cookie is not None and len(cookie.strip()) > 0:
|
||||||
|
# 拉取数据
|
||||||
|
PtGetData().get_data(section_name, section_data)
|
||||||
|
break
|
||||||
|
|
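The new __main__ block expects every top-level table in pt_config.toml to carry url, cookie, and flag keys, and only processes a section when flag != 1 and the cookie is non-empty. A hypothetical config matching that shape (site names and values invented):

[site_a]
url = "https://pt.example.com"
cookie = "uid=1; pass=abcdef"
flag = 0

[site_b]
url = "https://other.example.com"
cookie = ""
flag = 1

With this file only site_a is pulled, and the trailing break stops the loop after the first qualifying section, which reads like a debugging aid.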
PT/test.py (23 changes)
@@ -1,4 +1,5 @@
 import time
+from datetime import datetime

 import pandas as pd
 import requests
@@ -7,7 +8,7 @@ from lxml import html as lhtml
 from urllib.parse import urlparse, parse_qs

 from qnloft_db_model.PtWebsiteData import PtWebsiteData
+from dateutil import parser


 def extract_id(url, field) -> bytes:
     parsed_url = urlparse(url)
@@ -89,13 +90,21 @@ data = {col: [] for col in columns}
 df = pd.DataFrame(data)


-for i in range(0,10):
-    # Create one row of data
-    row_data = {'pt_id': i}
-
-    # Append the row to the DataFrame
-    df = df.append(row_data, ignore_index=True)
-print(df)
+def is_date(s):
+    try:
+        datetime.strptime(s, '%Y-%m-%d %H:%M:%S')
+        return True
+    except ValueError:
+        return False
+
+
+my_list = ['置顶促销', '国语配音', '中文字幕', '2021-02-02 13:26:26', '2021-02-02', '2021-02-02 13:26']
+for item in my_list:
+    try:
+        parsed_date = parser.parse(item)
+        print(parsed_date)
+    except ValueError:
+        pass
 """
 Primary key id, pt resource id, source name, first-level title, secondary title, category id, category name
 Torrent status, status time remaining, seeding status, comment count, resource upload time, resource size,
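The replaced pandas scaffolding gives way to a small date-parsing experiment: is_date pins one exact layout via datetime.strptime, while the loop below it shows that dateutil handles all three timestamp shapes in my_list and raises on the label strings. A dateutil-based variant of the same check (a sketch, not part of the commit; ParserError subclasses ValueError):

from dateutil import parser

def is_date(s):
    try:
        parser.parse(s)  # accepts '2021-02-02', '2021-02-02 13:26', ...
        return True
    except ValueError:
        return False

print(is_date('2021-02-02 13:26'))  # True
print(is_date('中文字幕'))            # False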
@@ -22,7 +22,7 @@ class SqliteDbMain(DbMain):
         elif 'macos' in sys_platform.lower():
             __engine = f"/Users/renmeng/Documents/sqlite_db/{self.database_name}"
         else:
-            __engine = f"{self.database_name}"
+            __engine = f"../sqlite_db/{self.database_name}"
         return __engine

     def __create_sqlite_engine(self):
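The fallback branch now resolves the database file to a ../sqlite_db directory relative to the current working directory, which breaks if the process is launched from elsewhere. One way to make it start-directory independent (a sketch; how sys_platform is populated in the class is assumed here to be platform.platform()):

import platform
from pathlib import Path

def sqlite_engine_path(database_name):
    sys_platform = platform.platform()  # e.g. 'macOS-14.2-arm64-...'
    if 'macos' in sys_platform.lower():
        return f"/Users/renmeng/Documents/sqlite_db/{database_name}"
    # Anchor the fallback to this file's location rather than the CWD
    base = Path(__file__).resolve().parent.parent / "sqlite_db"
    base.mkdir(parents=True, exist_ok=True)
    return str(base / database_name)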
@@ -5,7 +5,7 @@ from sqlalchemy import Column, Integer, String, Float, UniqueConstraint
 class PtWebsiteData(declarative_base()):
     __tablename__ = 'pt_website_data'

-    id = Column(Integer, primary_key=True)
+    id = Column(Integer, primary_key=True, autoincrement=True)
     # pt resource id
     pt_id = Column(Integer, nullable=False)
     # Source name
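On the id column change: with SQLite, SQLAlchemy already treats the first Integer primary key as auto-incrementing, so autoincrement=True mainly makes the intent explicit. A minimal runnable model in the committed shape (demo table name invented; the real model's remaining columns are omitted):

from sqlalchemy import Column, Integer, create_engine
from sqlalchemy.orm import declarative_base, Session

Base = declarative_base()

class PtWebsiteDataDemo(Base):
    __tablename__ = 'pt_website_data_demo'
    id = Column(Integer, primary_key=True, autoincrement=True)
    pt_id = Column(Integer, nullable=False)

engine = create_engine("sqlite:///:memory:")
Base.metadata.create_all(engine)
with Session(engine) as session:
    session.add(PtWebsiteDataDemo(pt_id=42))
    session.commit()  # id is assigned by SQLite automatically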