提交更新
This commit is contained in:
parent
cba3feaf4a
commit
90f88a6dd6
|
@ -9,3 +9,4 @@
|
|||
.settings/org.eclipse.wst.jsdt.core.prefs
|
||||
.settings/org.eclipse.wst.server.core.prefs
|
||||
log/
|
||||
*.pyc
|
||||
|
|
|
@ -8,6 +8,7 @@
|
|||
|
||||
数据如何展示呢??
|
||||
"""
|
||||
import random
|
||||
import sys
|
||||
import time
|
||||
|
||||
|
@ -23,6 +24,7 @@ from qnloft_db.sqlite_db_main import SqliteDbMain
|
|||
from qnloft_db_model.PtWebsiteData import PtWebsiteData
|
||||
from dateutil import parser
|
||||
|
||||
|
||||
def extract_id(url, field):
|
||||
parsed_url = urlparse(url)
|
||||
query_params = parse_qs(parsed_url.query)
|
||||
|
@ -68,7 +70,7 @@ class PtGetData:
|
|||
|
||||
def get_data(self, section_name, section_data):
|
||||
res_txt = f"开始对 [{section_name}] 进行操作...,抓取数据:"
|
||||
print(res_txt)
|
||||
logger.info(res_txt)
|
||||
url, cookie = section_data.get('url'), section_data.get('cookie')
|
||||
if cookie is not None and len(cookie.strip()) > 0:
|
||||
self.headers["cookie"] = cookie
|
||||
|
@ -80,17 +82,20 @@ class PtGetData:
|
|||
# 解析网页内容
|
||||
self.get_common_analysis(section_name, doc_html)
|
||||
# 获取分页
|
||||
# pages = self.get_common_total_page(doc_html)
|
||||
# for i in range(0, pages):
|
||||
# time.sleep(2)
|
||||
# self.get_data_by_page(section_name, section_data, i)
|
||||
# 数据入库
|
||||
pages = self.get_common_total_page(doc_html)
|
||||
for i in range(0, pages):
|
||||
sleep_time = random.uniform(1, 3)
|
||||
logger.info(
|
||||
f"总共 【{pages}】 页,开始抓取第 【{i}】 页数据,还剩 【{pages - i}】 页,不过要休眠 {sleep_time} 秒")
|
||||
time.sleep(sleep_time)
|
||||
self.get_data_by_page(section_name, section_data, i)
|
||||
except Exception as e:
|
||||
print(f"页面无法解析,请知晓!!!{e}")
|
||||
logger.error(f"页面无法解析,请知晓!!!{e}")
|
||||
return
|
||||
|
||||
def get_data_by_page(self, section_name, section_data, page_num=0):
|
||||
if page_num > 1:
|
||||
html = self.get_website_html(uri=f"{self.torrents_uri}&incldead=1&spstate=0&page={page_num}",
|
||||
if page_num >= 1:
|
||||
html = self.get_website_html(uri=f"{self.torrents_uri}&page={page_num}",
|
||||
section_name=section_name, section_data=section_data)
|
||||
if len(html) == 0:
|
||||
return
|
||||
|
@ -103,11 +108,10 @@ class PtGetData:
|
|||
return int(pages_str) if pages_str.isdigit() else 0
|
||||
|
||||
def get_common_analysis(self, section_name, doc_html):
|
||||
entries = []
|
||||
# 使用lxml解析HTML
|
||||
row_follow_tables = doc_html.xpath('//table[@class="torrents"]//tr[position() > 1]')
|
||||
for row_follow in row_follow_tables:
|
||||
# html_content = lhtml.tostring(row_follow, encoding='unicode')
|
||||
html_content = lhtml.tostring(row_follow, encoding='unicode')
|
||||
# print(f"html内容:{html_content}")
|
||||
# 一级标题
|
||||
first_title = row_follow.xpath('.//table[@class="torrentname"]//a[@title]/@title')[0]
|
||||
|
@ -180,6 +184,11 @@ class PtGetData:
|
|||
details_link = row_follow.xpath('.//table[@class="torrentname"]//a[@href]/@href')[0]
|
||||
print(
|
||||
f"PT_ID == {pt_id} 下载链接:/{download_link} 详情链接:/{details_link}")
|
||||
# douban_rating = doc.xpath('')
|
||||
# print(f"豆瓣评分:/{douban_rating[0]}")
|
||||
|
||||
# imdb_rating = doc.xpath('')
|
||||
# print(f"imdb_rating:/{imdb_rating[0]}")
|
||||
entry = PtWebsiteData(
|
||||
pt_id=pt_id,
|
||||
source_name=section_name,
|
||||
|
@ -202,15 +211,29 @@ class PtGetData:
|
|||
download_link=f'/{download_link}',
|
||||
details_link=f'/{details_link}'
|
||||
)
|
||||
entries.append(entry)
|
||||
self.db_main.insert_all_entry(entries)
|
||||
# 如果包含置顶,出现错误不管
|
||||
if "置顶" in html_content:
|
||||
self.insert_entry(True, entry)
|
||||
else:
|
||||
# todo 这里的逻辑明天补全
|
||||
# 取数据库中查询一下,是否存在source_name=section_name的数据,如果存在,则不是初始化
|
||||
# 如果不存在,则是初始化数据
|
||||
pass
|
||||
|
||||
# break
|
||||
# douban_rating = doc.xpath('')
|
||||
# print(f"豆瓣评分:/{douban_rating[0]}")
|
||||
|
||||
# imdb_rating = doc.xpath('')
|
||||
# print(f"imdb_rating:/{imdb_rating[0]}")
|
||||
def insert_entry(self, if_pass, entry):
    """Store a single entry through ``self.db_main.insert_entry``.

    Args:
        if_pass: When truthy, a failed insert is logged and then ignored.
            When falsy, the failure is logged and re-raised to the caller.
        entry: The record passed unchanged to ``self.db_main.insert_entry``.

    Raises:
        Exception: Whatever ``self.db_main.insert_entry`` raised, but only
            when ``if_pass`` is falsy.
    """
    try:
        self.db_main.insert_entry(entry)
    except Exception as e:
        if if_pass:
            # During the first initialisation run new rows may appear before
            # the initial load has finished, so a failed insert is tolerated
            # here: log it and carry on.
            logger.error(f"if_pass == {if_pass} 是出现错误:{e}")
            return
        logger.error(f"数据存储失败,原因:{e}")
        raise
|
||||
|
||||
def get_type(self, section_name, section_data):
|
||||
res_txt = f"开始对 [{section_name}] 进行操作...,抓取网站分类:"
|
||||
|
@ -258,9 +281,9 @@ def opt(self):
|
|||
# 拉取数据
|
||||
self.get_data(section_name, section_data)
|
||||
except FileNotFoundError:
|
||||
print(f"Error: The file '{toml_file}' was not found.")
|
||||
logger.error(f"Error: The file '{toml_file}' was not found.")
|
||||
except toml.TomlDecodeError as e:
|
||||
print(f"Error decoding TOML: {e}")
|
||||
logger.error(f"Error decoding TOML: {e}")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
|
@ -43,6 +43,7 @@ class DbMain:
|
|||
trace = traceback.extract_tb(e.__traceback__)
|
||||
for filename, lineno, funcname, source in trace:
|
||||
print(f"在文件 {filename} 的第 {lineno} 行发生错误")
|
||||
raise
|
||||
finally:
|
||||
self.session.close()
|
||||
|
||||
|
|
Loading…
Reference in New Issue