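更新一些代码 (Update code: detect a first-time initialization run by counting existing rows for the section before parsing; add optional `sort_column` / `ascending` parameters to `pandas_query_by_condition`)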

This commit is contained in:
rm 2024-01-18 16:32:27 +08:00
parent 90f88a6dd6
commit 2b13729f19
3 changed files with 153 additions and 126 deletions

View File

@ -12,6 +12,8 @@ import random
import sys
import time
from sqlalchemy import func
from qnloft_db import db_config as config
import requests
import toml
@ -67,6 +69,7 @@ class PtGetData:
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
}
self.db_main = SqliteDbMain(config.pt_website_db)
self.if_pass = False
def get_data(self, section_name, section_data):
# Visible part: count the stored rows for this section to decide whether this is
# the first (initialization) run, then parse the fetched HTML.
res_txt = f"开始对 [{section_name}] 进行操作...,抓取数据:"
@ -78,6 +81,15 @@ class PtGetData:
if len(html) == 0:
return
try:
# Query the database for rows with source_name == section_name; if any exist, this is not an initialization run
count = self.db_main.pandas_query_by_condition(
model=func.count(PtWebsiteData.id),
query_condition=PtWebsiteData.source_name == section_name,
)
# If no such row exists, this is the initialization run
# NOTE(review): 'count_1' is presumably the auto-generated label of the unnamed count() column — confirm
res = int(count['count_1'].iloc[0])
if res == 0:
self.if_pass = True
doc_html = lhtml.fromstring(html)
# Parse the page content
self.get_common_analysis(section_name, doc_html)
@ -211,22 +223,20 @@ class PtGetData:
download_link=f'/{download_link}',
details_link=f'/{details_link}'
)
if self.if_pass is False:
# 如果包含置顶,出现错误不管
if "置顶" in html_content:
self.insert_entry(True, entry)
else:
# todo 这里的逻辑明天补全
# 取数据库中查询一下是否存在source_name=section_name的数据如果存在则不是初始化
# 如果不存在,则是初始化数据
pass
self.if_pass = True
self.insert_entry(self.if_pass, entry)
def insert_entry(self, if_pass, entry):
# Store `entry` through self.db_main.insert_entry.
# When if_pass is true, an exception from the insert is swallowed so storing can continue
if if_pass:
try:
self.db_main.insert_entry(entry)
except Exception as e:
# During the first initialization load, new rows may appear before the load has finished, so for now the error is only logged and otherwise ignored
logger.error(f"if_pass == {if_pass} 出现错误:{e}")
logger.error(f"if_pass == {if_pass} 出现错误:{e}")
pass
else:
try:
@ -267,7 +277,6 @@ class PtGetData:
logger.error(f"{section_name} , 5次出现错误无法访问")
return ""
def opt(self):
toml_file = 'PT/pt_config.toml'
try:

View File

@ -3,13 +3,17 @@ from datetime import datetime
import pandas as pd
import requests
import sqlalchemy
import toml
from lxml import html as lhtml
from urllib.parse import urlparse, parse_qs
from sqlalchemy.orm import attributes
from sqlalchemy import func
from qnloft_db.sqlite_db_main import SqliteDbMain
from qnloft_db_model.PtWebsiteData import PtWebsiteData
from dateutil import parser
from qnloft_db import db_config as config
def extract_id(url, field) -> bytes:
parsed_url = urlparse(url)
query_params = parse_qs(parsed_url.query)
@ -89,22 +93,14 @@ data = {col: [] for col in columns}
# 创建空 DataFrame
df = pd.DataFrame(data)
def is_date(s):
    """Return True if *s* is a string in exactly ``YYYY-MM-DD HH:MM:SS`` form.

    Args:
        s: The candidate value, normally a string scraped from a page.

    Returns:
        True when ``datetime.strptime`` accepts *s* with the format
        ``'%Y-%m-%d %H:%M:%S'``; False for any other text and for
        non-string input such as ``None``.
    """
    try:
        datetime.strptime(s, '%Y-%m-%d %H:%M:%S')
    except (TypeError, ValueError):
        # ValueError: text does not match the format.
        # TypeError: strptime was handed a non-string (e.g. None); treat it
        # as "not a date" instead of letting the caller crash.
        return False
    return True
# Ad-hoc check: count the PtWebsiteData rows whose source_name is "1PTBar/壹PT" and print that number.
db_main = SqliteDbMain(config.pt_website_db)
count = db_main.pandas_query_by_condition(
model=func.count(PtWebsiteData.id),
query_condition=PtWebsiteData.source_name == "1PTBar/壹PT",
)
# NOTE(review): 'count_1' is presumably the label auto-generated for the unnamed count() column — confirm
print(int(count['count_1'].iloc[0]))
# Sample values: tag-like strings mixed with date strings of differing precision.
my_list = ['置顶促销', '国语配音', '中文字幕', '2021-02-02 13:26:26','2021-02-02','2021-02-02 13:26']
# Print every item that parser.parse accepts; items that raise ValueError are skipped.
for item in my_list:
try:
parsed_date = parser.parse(item)
print(parsed_date)
except ValueError:
pass
"""
主键id,pt资源id,来源名称,一级标题,二级标题,分类id分类名称
种子状态,状态剩余时间,做种状态,评论数,资源上传时间,资源大小

View File

@ -1,7 +1,7 @@
import traceback
import pandas
from sqlalchemy.orm import sessionmaker, scoped_session
from sqlalchemy.orm import sessionmaker, scoped_session, attributes
from sqlalchemy import *
from qnloft_db import db_config as config
@ -168,14 +168,29 @@ class DbMain:
self.session.close()
self.engine.dispose()
def pandas_query_by_condition(self, model, query_condition):
def pandas_query_by_condition(self, model, query_condition, sort_column=None, ascending=True):
# Query `model` filtered by `query_condition`, optionally ordered by `sort_column`
# (ascending unless `ascending` is false), and return
# pandas_query_by_sql(stmt=query.statement).reset_index().
# NOTE(review): two signature lines and two `query = ...` lines appear here; this looks like
# the old and new versions shown side by side in a diff view — confirm against the real file.
try:
# When the query needs several conditions, combine them first, e.g.:
# query_condition = and_(
# StockDaily.trade_date == '20230823',
# StockDaily.symbol == 'ABC'
# )
query = self.session.query(model).filter(query_condition).order_by(model.id)
print(model)
query = self.session.query(model).filter(query_condition)
# Apply ordering only when the caller supplied a sort column
if sort_column:
sort_attr = None
if isinstance(sort_column, attributes.InstrumentedAttribute):
# sort_column is already an instrumented attribute; use it directly
sort_attr = sort_column
elif isinstance(sort_column, str):
# sort_column is a name; look it up on the model (None when the model has no such attribute)
sort_attr = getattr(model, sort_column, None)
if sort_attr:
if ascending:
query = query.order_by(sort_attr)
else:
query = query.order_by(sort_attr.desc())
return self.pandas_query_by_sql(stmt=query.statement).reset_index()
except Exception as e:
trace = traceback.extract_tb(e.__traceback__)
@ -185,6 +200,7 @@ class DbMain:
finally:
self.session.close()
# ===================== delete 方法=============================
def delete_by_id(self, model, db_id):
try:
@ -199,6 +215,7 @@ class DbMain:
finally:
self.session.close()
def delete_by_condition(self, model, delete_condition):
try:
# 使用 delete() 方法删除符合条件的记录
@ -220,6 +237,7 @@ class DbMain:
finally:
self.session.close()
def delete_all_table(self, model): # 清空表数据
try:
self.session.query(model).delete()
@ -232,6 +250,7 @@ class DbMain:
finally:
self.session.close()
# ===================== 其它 方法=============================
def has_table(self, entries):
if isinstance(entries, list):
@ -241,6 +260,7 @@ class DbMain:
# 检查表是否存在,如果不存在则创建
return self.inspector.has_table(table_name)
def execute_sql(self, s):
try:
sql_text = text(s)
@ -253,6 +273,7 @@ class DbMain:
finally:
self.session.close()
def execute_sql_to_pandas(self, s):
try:
sql_text = text(s)
@ -266,5 +287,6 @@ class DbMain:
finally:
self.session.close()
def close(self):
# Close the session held in self.session.
self.session.close()