提交更新

2024-01-18 01:49:34 +08:00 · 2024-01-18 01:49:34 +08:00 · 90f88a6dd6
parent cba3feaf4a
commit 90f88a6dd6
3 changed files with 46 additions and 21 deletions
--- a/.gitignore
+++ b/.gitignore
@ -9,3 +9,4 @@
 .settings/org.eclipse.wst.jsdt.core.prefs
 .settings/org.eclipse.wst.server.core.prefs
 log/
+*.pyc
--- a/PT/pt_get_data.py
+++ b/PT/pt_get_data.py
@ -8,6 +8,7 @@

 数据如何展示呢？？
 """
+import random
 import sys
 import time

@ -23,6 +24,7 @@ from qnloft_db.sqlite_db_main import SqliteDbMain
 from qnloft_db_model.PtWebsiteData import PtWebsiteData
 from dateutil import parser

+
 def extract_id(url, field):
 	parsed_url = urlparse(url)
 	query_params = parse_qs(parsed_url.query)
@ -68,7 +70,7 @@ class PtGetData:

 	def get_data(self, section_name, section_data):
 		res_txt = f"开始对 [{section_name}] 进行操作...，抓取数据："
-		print(res_txt)
+		logger.info(res_txt)
 		url, cookie = section_data.get('url'), section_data.get('cookie')
 		if cookie is not None and len(cookie.strip()) > 0:
 			self.headers["cookie"] = cookie
@ -80,17 +82,20 @@ class PtGetData:
 				# 解析网页内容
 				self.get_common_analysis(section_name, doc_html)
 				# 获取分页
-				# pages = self.get_common_total_page(doc_html)
-				# for i in range(0, pages):
-				# 	time.sleep(2)
-				# 	self.get_data_by_page(section_name, section_data, i)
-			# 数据入库
+				pages = self.get_common_total_page(doc_html)
+				for i in range(0, pages):
+					sleep_time = random.uniform(1, 3)
+					logger.info(
+						f"总共 【{pages}】 页，开始抓取第 【{i}】 页数据，还剩 【{pages - i}】 页，不过要休眠 {sleep_time} 秒")
+					time.sleep(sleep_time)
+					self.get_data_by_page(section_name, section_data, i)
 			except Exception as e:
-				print(f"页面无法解析，请知晓！！！{e}")
+				logger.error(f"页面无法解析，请知晓！！！{e}")
+				return

 	def get_data_by_page(self, section_name, section_data, page_num=0):
-		if page_num > 1:
-			html = self.get_website_html(uri=f"{self.torrents_uri}&incldead=1&spstate=0&page={page_num}",
+		if page_num >= 1:
+			html = self.get_website_html(uri=f"{self.torrents_uri}&page={page_num}",
 										 section_name=section_name, section_data=section_data)
 			if len(html) == 0:
 				return
@ -103,11 +108,10 @@ class PtGetData:
 		return int(pages_str) if pages_str.isdigit() else 0

 	def get_common_analysis(self, section_name, doc_html):
-		entries = []
 		# 使用lxml解析HTML
 		row_follow_tables = doc_html.xpath('//table[@class="torrents"]//tr[position() > 1]')
 		for row_follow in row_follow_tables:
-			# html_content = lhtml.tostring(row_follow, encoding='unicode')
+			html_content = lhtml.tostring(row_follow, encoding='unicode')
 			# print(f"html内容：{html_content}")
 			# 一级标题
 			first_title = row_follow.xpath('.//table[@class="torrentname"]//a[@title]/@title')[0]
@ -180,6 +184,11 @@ class PtGetData:
 			details_link = row_follow.xpath('.//table[@class="torrentname"]//a[@href]/@href')[0]
 			print(
 				f"PT_ID == {pt_id} 下载链接：/{download_link} 详情链接：/{details_link}")
+			# douban_rating = doc.xpath('')
+			# print(f"豆瓣评分：/{douban_rating[0]}")
+
+			# imdb_rating = doc.xpath('')
+			# print(f"imdb_rating：/{imdb_rating[0]}")
 			entry = PtWebsiteData(
 				pt_id=pt_id,
 				source_name=section_name,
@ -202,15 +211,29 @@ class PtGetData:
 				download_link=f'/{download_link}',
 				details_link=f'/{details_link}'
 			)
-			entries.append(entry)
-		self.db_main.insert_all_entry(entries)
+			# 如果包含置顶，出现错误不管
+			if "置顶" in html_content:
+				self.insert_entry(True, entry)
+			else:
+				# todo 这里的逻辑明天补全
+				# 取数据库中查询一下，是否存在source_name=section_name的数据，如果存在，则不是初始化
+				# 如果不存在，则是初始化数据
+				pass

-	# break
-	# douban_rating = doc.xpath('')
-	# print(f"豆瓣评分：/{douban_rating[0]}")
-
-	# imdb_rating = doc.xpath('')
-	# print(f"imdb_rating：/{imdb_rating[0]}")
+	def insert_entry(self, if_pass, entry):
+		"""Persist one entry through ``self.db_main.insert_entry``.
+
+		:param if_pass: when truthy, a failed insert is logged and swallowed;
+			when falsy, the failure is logged and the exception is re-raised.
+		:param entry: the record object handed to ``self.db_main.insert_entry``.
+		:raises Exception: whatever the insert raised, only when ``if_pass`` is falsy.
+		"""
+		try:
+			self.db_main.insert_entry(entry)
+		except Exception as e:
+			if if_pass:
+				# During the first initialisation run new rows can show up before the
+				# initial import has finished, so a failed insert is tolerated for now.
+				logger.error(f"if_pass == {if_pass} 是出现错误：{e}")
+				return
+			logger.error(f"数据存储失败，原因：{e}")
+			raise

 	def get_type(self, section_name, section_data):
 		res_txt = f"开始对 [{section_name}] 进行操作...，抓取网站分类："
@ -258,9 +281,9 @@ def opt(self):
 					# 拉取数据
 					self.get_data(section_name, section_data)
 	except FileNotFoundError:
-		print(f"Error: The file '{toml_file}' was not found.")
+		logger.error(f"Error: The file '{toml_file}' was not found.")
 	except toml.TomlDecodeError as e:
-		print(f"Error decoding TOML: {e}")
+		logger.error(f"Error decoding TOML: {e}")


 if __name__ == '__main__':
--- a/qnloft_db/db_main.py
+++ b/qnloft_db/db_main.py
@ -43,6 +43,7 @@ class DbMain:
 			trace = traceback.extract_tb(e.__traceback__)
 			for filename, lineno, funcname, source in trace:
 				print(f"在文件 {filename} 的第 {lineno} 行发生错误")
+			raise
 		finally:
 			self.session.close()