From 90f88a6dd624684333ac40cbba098306fe7cbeda Mon Sep 17 00:00:00 2001 From: rm Date: Thu, 18 Jan 2024 01:49:34 +0800 Subject: [PATCH] =?UTF-8?q?=E6=8F=90=E4=BA=A4=E6=9B=B4=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 1 + PT/pt_get_data.py | 65 ++++++++++++++++++++++++++++++-------------- qnloft_db/db_main.py | 1 + 3 files changed, 46 insertions(+), 21 deletions(-) diff --git a/.gitignore b/.gitignore index 1f7dd81..c2a07f5 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,4 @@ .settings/org.eclipse.wst.jsdt.core.prefs .settings/org.eclipse.wst.server.core.prefs log/ +*.pyc diff --git a/PT/pt_get_data.py b/PT/pt_get_data.py index c4283dc..e41284c 100644 --- a/PT/pt_get_data.py +++ b/PT/pt_get_data.py @@ -8,6 +8,7 @@ 数据如何展示呢?? """ +import random import sys import time @@ -23,6 +24,7 @@ from qnloft_db.sqlite_db_main import SqliteDbMain from qnloft_db_model.PtWebsiteData import PtWebsiteData from dateutil import parser + def extract_id(url, field): parsed_url = urlparse(url) query_params = parse_qs(parsed_url.query) @@ -68,7 +70,7 @@ class PtGetData: def get_data(self, section_name, section_data): res_txt = f"开始对 [{section_name}] 进行操作...,抓取数据:" - print(res_txt) + logger.info(res_txt) url, cookie = section_data.get('url'), section_data.get('cookie') if cookie is not None and len(cookie.strip()) > 0: self.headers["cookie"] = cookie @@ -80,17 +82,20 @@ class PtGetData: # 解析网页内容 self.get_common_analysis(section_name, doc_html) # 获取分页 - # pages = self.get_common_total_page(doc_html) - # for i in range(0, pages): - # time.sleep(2) - # self.get_data_by_page(section_name, section_data, i) - # 数据入库 + pages = self.get_common_total_page(doc_html) + for i in range(0, pages): + sleep_time = random.uniform(1, 3) + logger.info( + f"总共 【{pages}】 页,开始抓取第 【{i}】 页数据,还剩 【{pages - i}】 页,不过要休眠 {sleep_time} 秒") + time.sleep(sleep_time) + self.get_data_by_page(section_name, section_data, i) except Exception as e: - print(f"页面无法解析,请知晓!!!{e}") + logger.error(f"页面无法解析,请知晓!!!{e}") + return def get_data_by_page(self, section_name, section_data, page_num=0): - if page_num > 1: - html = self.get_website_html(uri=f"{self.torrents_uri}&incldead=1&spstate=0&page={page_num}", + if page_num >= 1: + html = self.get_website_html(uri=f"{self.torrents_uri}&page={page_num}", section_name=section_name, section_data=section_data) if len(html) == 0: return @@ -103,11 +108,10 @@ class PtGetData: return int(pages_str) if pages_str.isdigit() else 0 def get_common_analysis(self, section_name, doc_html): - entries = [] # 使用lxml解析HTML row_follow_tables = doc_html.xpath('//table[@class="torrents"]//tr[position() > 1]') for row_follow in row_follow_tables: - # html_content = lhtml.tostring(row_follow, encoding='unicode') + html_content = lhtml.tostring(row_follow, encoding='unicode') # print(f"html内容:{html_content}") # 一级标题 first_title = row_follow.xpath('.//table[@class="torrentname"]//a[@title]/@title')[0] @@ -180,6 +184,11 @@ class PtGetData: details_link = row_follow.xpath('.//table[@class="torrentname"]//a[@href]/@href')[0] print( f"PT_ID == {pt_id} 下载链接:/{download_link} 详情链接:/{details_link}") + # douban_rating = doc.xpath('') + # print(f"豆瓣评分:/{douban_rating[0]}") + + # imdb_rating = doc.xpath('') + # print(f"imdb_rating:/{imdb_rating[0]}") entry = PtWebsiteData( pt_id=pt_id, source_name=section_name, @@ -202,15 +211,29 @@ class PtGetData: download_link=f'/{download_link}', details_link=f'/{details_link}' ) - entries.append(entry) - self.db_main.insert_all_entry(entries) + # 如果包含置顶,出现错误不管 + if "置顶" in html_content: + self.insert_entry(True, entry) + else: + # todo 这里的逻辑明天补全 + # 取数据库中查询一下,是否存在source_name=section_name的数据,如果存在,则不是初始化 + # 如果不存在,则是初始化数据 + pass - # break - # douban_rating = doc.xpath('') - # print(f"豆瓣评分:/{douban_rating[0]}") - - # imdb_rating = doc.xpath('') - # print(f"imdb_rating:/{imdb_rating[0]}") + def insert_entry(self, if_pass, entry): + if if_pass: + try: + self.db_main.insert_entry(entry) + except Exception as e: + # 第一次初始化数据的时候,为了防止数据没入库完成,出现新增数据,这里先设置成pass + logger.error(f"if_pass == {if_pass} 是出现错误:{e}") + pass + else: + try: + self.db_main.insert_entry(entry) + except Exception as e: + logger.error(f"数据存储失败,原因:{e}") + raise def get_type(self, section_name, section_data): res_txt = f"开始对 [{section_name}] 进行操作...,抓取网站分类:" @@ -258,9 +281,9 @@ def opt(self): # 拉取数据 self.get_data(section_name, section_data) except FileNotFoundError: - print(f"Error: The file '{toml_file}' was not found.") + logger.error(f"Error: The file '{toml_file}' was not found.") except toml.TomlDecodeError as e: - print(f"Error decoding TOML: {e}") + logger.error(f"Error decoding TOML: {e}") if __name__ == '__main__': diff --git a/qnloft_db/db_main.py b/qnloft_db/db_main.py index a02a552..2ea12d2 100644 --- a/qnloft_db/db_main.py +++ b/qnloft_db/db_main.py @@ -43,6 +43,7 @@ class DbMain: trace = traceback.extract_tb(e.__traceback__) for filename, lineno, funcname, source in trace: print(f"在文件 {filename} 的第 {lineno} 行发生错误") + raise finally: self.session.close()