提交更新

This commit is contained in:
rm 2024-01-18 01:49:34 +08:00
parent cba3feaf4a
commit 90f88a6dd6
3 changed files with 46 additions and 21 deletions

1
.gitignore vendored
View File

@ -9,3 +9,4 @@
.settings/org.eclipse.wst.jsdt.core.prefs
.settings/org.eclipse.wst.server.core.prefs
log/
*.pyc

View File

@ -8,6 +8,7 @@
数据如何展示呢
"""
import random
import sys
import time
@ -23,6 +24,7 @@ from qnloft_db.sqlite_db_main import SqliteDbMain
from qnloft_db_model.PtWebsiteData import PtWebsiteData
from dateutil import parser
def extract_id(url, field):
parsed_url = urlparse(url)
query_params = parse_qs(parsed_url.query)
@ -68,7 +70,7 @@ class PtGetData:
def get_data(self, section_name, section_data):
res_txt = f"开始对 [{section_name}] 进行操作...,抓取数据:"
print(res_txt)
logger.info(res_txt)
url, cookie = section_data.get('url'), section_data.get('cookie')
if cookie is not None and len(cookie.strip()) > 0:
self.headers["cookie"] = cookie
@ -80,17 +82,20 @@ class PtGetData:
# 解析网页内容
self.get_common_analysis(section_name, doc_html)
# 获取分页
# pages = self.get_common_total_page(doc_html)
# for i in range(0, pages):
# time.sleep(2)
# self.get_data_by_page(section_name, section_data, i)
# 数据入库
pages = self.get_common_total_page(doc_html)
for i in range(0, pages):
sleep_time = random.uniform(1, 3)
logger.info(
f"总共 【{pages}】 页,开始抓取第 【{i}】 页数据,还剩 【{pages - i}】 页,不过要休眠 {sleep_time}")
time.sleep(sleep_time)
self.get_data_by_page(section_name, section_data, i)
except Exception as e:
print(f"页面无法解析,请知晓!!!{e}")
logger.error(f"页面无法解析,请知晓!!!{e}")
return
def get_data_by_page(self, section_name, section_data, page_num=0):
if page_num > 1:
html = self.get_website_html(uri=f"{self.torrents_uri}&incldead=1&spstate=0&page={page_num}",
if page_num >= 1:
html = self.get_website_html(uri=f"{self.torrents_uri}&page={page_num}",
section_name=section_name, section_data=section_data)
if len(html) == 0:
return
@ -103,11 +108,10 @@ class PtGetData:
return int(pages_str) if pages_str.isdigit() else 0
def get_common_analysis(self, section_name, doc_html):
entries = []
# 使用lxml解析HTML
row_follow_tables = doc_html.xpath('//table[@class="torrents"]//tr[position() > 1]')
for row_follow in row_follow_tables:
# html_content = lhtml.tostring(row_follow, encoding='unicode')
html_content = lhtml.tostring(row_follow, encoding='unicode')
# print(f"html内容{html_content}")
# 一级标题
first_title = row_follow.xpath('.//table[@class="torrentname"]//a[@title]/@title')[0]
@ -180,6 +184,11 @@ class PtGetData:
details_link = row_follow.xpath('.//table[@class="torrentname"]//a[@href]/@href')[0]
print(
f"PT_ID == {pt_id} 下载链接:/{download_link} 详情链接:/{details_link}")
# douban_rating = doc.xpath('')
# print(f"豆瓣评分:/{douban_rating[0]}")
# imdb_rating = doc.xpath('')
# print(f"imdb_rating/{imdb_rating[0]}")
entry = PtWebsiteData(
pt_id=pt_id,
source_name=section_name,
@ -202,15 +211,29 @@ class PtGetData:
download_link=f'/{download_link}',
details_link=f'/{details_link}'
)
entries.append(entry)
self.db_main.insert_all_entry(entries)
# 如果包含置顶,出现错误不管
if "置顶" in html_content:
self.insert_entry(True, entry)
else:
# todo 这里的逻辑明天补全
# 取数据库中查询一下是否存在source_name=section_name的数据如果存在则不是初始化
# 如果不存在,则是初始化数据
pass
# break
# douban_rating = doc.xpath('')
# print(f"豆瓣评分:/{douban_rating[0]}")
# imdb_rating = doc.xpath('')
# print(f"imdb_rating/{imdb_rating[0]}")
def insert_entry(self, if_pass, entry):
    """Persist one entry through ``self.db_main.insert_entry``.

    Args:
        if_pass: When truthy the insert is best-effort: a storage failure
            is logged and swallowed. When falsy the failure is logged and
            the original exception is re-raised to the caller.
        entry: Record handed unchanged to ``self.db_main.insert_entry``.

    Raises:
        Exception: Whatever ``self.db_main.insert_entry`` raised, and only
            when ``if_pass`` is falsy.
    """
    try:
        self.db_main.insert_entry(entry)
    except Exception as e:
        if if_pass:
            # Best-effort mode: during the first data initialisation new
            # rows may show up before the initial load has finished being
            # stored, so for now the failure is only logged.
            logger.error(f"if_pass == {if_pass} 是出现错误:{e}")
            return
        logger.error(f"数据存储失败,原因:{e}")
        raise
def get_type(self, section_name, section_data):
res_txt = f"开始对 [{section_name}] 进行操作...,抓取网站分类:"
@ -258,9 +281,9 @@ def opt(self):
# 拉取数据
self.get_data(section_name, section_data)
except FileNotFoundError:
print(f"Error: The file '{toml_file}' was not found.")
logger.error(f"Error: The file '{toml_file}' was not found.")
except toml.TomlDecodeError as e:
print(f"Error decoding TOML: {e}")
logger.error(f"Error decoding TOML: {e}")
if __name__ == '__main__':

View File

@ -43,6 +43,7 @@ class DbMain:
trace = traceback.extract_tb(e.__traceback__)
for filename, lineno, funcname, source in trace:
print(f"在文件 {filename} 的第 {lineno} 行发生错误")
raise
finally:
self.session.close()