Commit update
This commit is contained in:
parent cba3feaf4a
commit 90f88a6dd6
@@ -9,3 +9,4 @@
 .settings/org.eclipse.wst.jsdt.core.prefs
 .settings/org.eclipse.wst.server.core.prefs
 log/
+*.pyc
@@ -8,6 +8,7 @@
 
 数据如何展示呢??
 """
+import random
 import sys
 import time
 
@@ -23,6 +24,7 @@ from qnloft_db.sqlite_db_main import SqliteDbMain
 from qnloft_db_model.PtWebsiteData import PtWebsiteData
 from dateutil import parser
 
+
 def extract_id(url, field):
     parsed_url = urlparse(url)
     query_params = parse_qs(parsed_url.query)
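Only the first two lines of extract_id appear in this hunk. For context, a minimal sketch of how such a helper can finish the job, assuming the id sits in a query-string parameter whose name the caller passes as `field` (the body below these two lines is not shown in the diff and is an assumption):

from urllib.parse import urlparse, parse_qs

def extract_id(url, field):
    # Split the URL and expand the query string into a dict of lists,
    # e.g. "details.php?id=123&hit=1" -> {"id": ["123"], "hit": ["1"]}
    parsed_url = urlparse(url)
    query_params = parse_qs(parsed_url.query)
    # Assumed continuation: return the first value for the requested
    # parameter, or None when it is absent.
    values = query_params.get(field)
    return values[0] if values else None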
@@ -68,7 +70,7 @@ class PtGetData:
 
     def get_data(self, section_name, section_data):
         res_txt = f"开始对 [{section_name}] 进行操作...,抓取数据:"
-        print(res_txt)
+        logger.info(res_txt)
         url, cookie = section_data.get('url'), section_data.get('cookie')
         if cookie is not None and len(cookie.strip()) > 0:
             self.headers["cookie"] = cookie
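The print-to-logger replacements in this and the following hunks rely on a module-level `logger` whose definition is outside the diff. Any logger exposing .info() and .error() fits the call sites; if the project uses loguru, which matches this calling style, the setup is a one-liner. A sketch, with the sink path as a placeholder (writing under log/ would line up with the log/ entry in the ignore-file hunk at the top):

# Assumption: loguru; any module-level logger with .info()/.error() works.
from loguru import logger

# Optional file sink with rotation; path and size are placeholders.
logger.add("log/pt_get_data.log", rotation="10 MB")
logger.info("logger ready")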
@@ -80,17 +82,20 @@ class PtGetData:
             # 解析网页内容
             self.get_common_analysis(section_name, doc_html)
             # 获取分页
-            # pages = self.get_common_total_page(doc_html)
-            # for i in range(0, pages):
-            # time.sleep(2)
-            # self.get_data_by_page(section_name, section_data, i)
-            # 数据入库
+            pages = self.get_common_total_page(doc_html)
+            for i in range(0, pages):
+                sleep_time = random.uniform(1, 3)
+                logger.info(
+                    f"总共 【{pages}】 页,开始抓取第 【{i}】 页数据,还剩 【{pages - i}】 页,不过要休眠 {sleep_time} 秒")
+                time.sleep(sleep_time)
+                self.get_data_by_page(section_name, section_data, i)
         except Exception as e:
-            print(f"页面无法解析,请知晓!!!{e}")
+            logger.error(f"页面无法解析,请知晓!!!{e}")
+            return
 
     def get_data_by_page(self, section_name, section_data, page_num=0):
-        if page_num > 1:
-            html = self.get_website_html(uri=f"{self.torrents_uri}&incldead=1&spstate=0&page={page_num}",
+        if page_num >= 1:
+            html = self.get_website_html(uri=f"{self.torrents_uri}&page={page_num}",
                                          section_name=section_name, section_data=section_data)
             if len(html) == 0:
                 return
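This hunk turns the previously commented-out pagination into live code: fetch the page count once, then crawl each page with a randomized 1-3 second pause instead of the old fixed time.sleep(2). The companion change from page_num > 1 to page_num >= 1 appears to close a gap where page 1 was never fetched. The delay pattern in isolation (a sketch; fetch_page stands in for get_data_by_page):

import random
import time

def crawl_pages(pages, fetch_page):
    for i in range(0, pages):
        # A jittered delay is less bot-like than a constant interval.
        sleep_time = random.uniform(1, 3)
        time.sleep(sleep_time)
        fetch_page(i)

crawl_pages(3, lambda i: print(f"fetching page {i}"))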
@@ -103,11 +108,10 @@ class PtGetData:
         return int(pages_str) if pages_str.isdigit() else 0
 
     def get_common_analysis(self, section_name, doc_html):
-        entries = []
         # 使用lxml解析HTML
         row_follow_tables = doc_html.xpath('//table[@class="torrents"]//tr[position() > 1]')
         for row_follow in row_follow_tables:
-            # html_content = lhtml.tostring(row_follow, encoding='unicode')
+            html_content = lhtml.tostring(row_follow, encoding='unicode')
             # print(f"html内容:{html_content}")
             # 一级标题
             first_title = row_follow.xpath('.//table[@class="torrentname"]//a[@title]/@title')[0]
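Un-commenting html_content here is not cosmetic: the 置顶 (sticky) check added further down does a substring test against this serialized row. A self-contained sketch of the same lxml pattern on a trimmed-down table:

import lxml.html as lhtml

doc_html = lhtml.fromstring("""
<table class="torrents">
  <tr><th>header row</th></tr>
  <tr><td><table class="torrentname"><tr><td>
    <a title="Demo.Movie.2023" href="details.php?id=1">Demo</a>
  </td></tr></table></td></tr>
</table>
""")

# position() > 1 skips the header row of the outer torrents table.
for row_follow in doc_html.xpath('//table[@class="torrents"]//tr[position() > 1]'):
    # Serialize the row back to markup for substring checks such as "置顶" in html_content.
    html_content = lhtml.tostring(row_follow, encoding='unicode')
    first_title = row_follow.xpath('.//table[@class="torrentname"]//a[@title]/@title')[0]
    print(first_title)  # -> Demo.Movie.2023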
@@ -180,6 +184,11 @@ class PtGetData:
             details_link = row_follow.xpath('.//table[@class="torrentname"]//a[@href]/@href')[0]
             print(
                 f"PT_ID == {pt_id} 下载链接:/{download_link} 详情链接:/{details_link}")
+            # douban_rating = doc.xpath('')
+            # print(f"豆瓣评分:/{douban_rating[0]}")
+
+            # imdb_rating = doc.xpath('')
+            # print(f"imdb_rating:/{imdb_rating[0]}")
             entry = PtWebsiteData(
                 pt_id=pt_id,
                 source_name=section_name,
@@ -202,15 +211,29 @@ class PtGetData:
                 download_link=f'/{download_link}',
                 details_link=f'/{details_link}'
             )
-            entries.append(entry)
-        self.db_main.insert_all_entry(entries)
-
-            # break
-            # douban_rating = doc.xpath('')
-            # print(f"豆瓣评分:/{douban_rating[0]}")
-
-            # imdb_rating = doc.xpath('')
-            # print(f"imdb_rating:/{imdb_rating[0]}")
+            # 如果包含置顶,出现错误不管
+            if "置顶" in html_content:
+                self.insert_entry(True, entry)
+            else:
+                # todo 这里的逻辑明天补全
+                # 取数据库中查询一下,是否存在source_name=section_name的数据,如果存在,则不是初始化
+                # 如果不存在,则是初始化数据
+                pass
+
+    def insert_entry(self, if_pass, entry):
+        if if_pass:
+            try:
+                self.db_main.insert_entry(entry)
+            except Exception as e:
+                # 第一次初始化数据的时候,为了防止数据没入库完成,出现新增数据,这里先设置成pass
+                logger.error(f"if_pass == {if_pass} 是出现错误:{e}")
+                pass
+        else:
+            try:
+                self.db_main.insert_entry(entry)
+            except Exception as e:
+                logger.error(f"数据存储失败,原因:{e}")
+                raise
 
     def get_type(self, section_name, section_data):
         res_txt = f"开始对 [{section_name}] 进行操作...,抓取网站分类:"
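The batch path (collect into entries, one insert_all_entry call) is replaced by per-row inserts with two error policies: sticky (置顶) rows swallow insert errors, since they reappear on every crawl, while normal rows log and re-raise. The else branch is still a pass with a todo: query whether any row with source_name == section_name already exists, to tell a first-time initialization from an incremental crawl. Assuming db_main wraps a SQLAlchemy session (the DbMain hunk below shows self.session) and using the PtWebsiteData model imported earlier, that check could look like this sketch; the helper itself is hypothetical:

from qnloft_db_model.PtWebsiteData import PtWebsiteData  # as imported above

# Sketch of the todo left open in this hunk; session and model names
# come from the surrounding code, the helper itself is assumed.
def is_initialized(session, section_name):
    # Any existing row for this source means we are past initialization.
    return (
        session.query(PtWebsiteData)
        .filter(PtWebsiteData.source_name == section_name)
        .first()
        is not None
    )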
@@ -258,9 +281,9 @@ def opt(self):
             # 拉取数据
             self.get_data(section_name, section_data)
     except FileNotFoundError:
-        print(f"Error: The file '{toml_file}' was not found.")
+        logger.error(f"Error: The file '{toml_file}' was not found.")
     except toml.TomlDecodeError as e:
-        print(f"Error decoding TOML: {e}")
+        logger.error(f"Error decoding TOML: {e}")
 
 
 if __name__ == '__main__':
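The error types here imply the configuration is read with the toml package: FileNotFoundError from opening the file, toml.TomlDecodeError from parsing it. A minimal sketch of the surrounding load in opt(), with the file name as a placeholder since the real path is not in the diff:

import toml
from loguru import logger  # assumption, as noted earlier

toml_file = "config.toml"  # placeholder path
try:
    config = toml.load(toml_file)
    # Each top-level table is one site section, matching section_data.get('url') above.
    for section_name, section_data in config.items():
        logger.info(f"section [{section_name}] -> {section_data.get('url')}")
except FileNotFoundError:
    logger.error(f"Error: The file '{toml_file}' was not found.")
except toml.TomlDecodeError as e:
    logger.error(f"Error decoding TOML: {e}")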
@@ -43,6 +43,7 @@ class DbMain:
             trace = traceback.extract_tb(e.__traceback__)
             for filename, lineno, funcname, source in trace:
                 print(f"在文件 {filename} 的第 {lineno} 行发生错误")
+            raise
         finally:
             self.session.close()
 
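Before this change, the except block in DbMain printed the traceback frames and then fell through to finally, so callers never saw the failure; the added raise propagates it after logging, which is what lets the new insert_entry catch and decide per row. The pattern in isolation (a sketch; run_and_close is a hypothetical wrapper):

import traceback

def run_and_close(session, action):
    try:
        action()
    except Exception as e:
        # Print a readable location for every frame in the traceback...
        trace = traceback.extract_tb(e.__traceback__)
        for filename, lineno, funcname, source in trace:
            print(f"在文件 {filename} 的第 {lineno} 行发生错误")
        # ...then re-raise instead of swallowing the error.
        raise
    finally:
        session.close()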