qnloft-stock/fp/新闻资讯.py

125 lines
3.9 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from utils.comm import *
from snownlp import SnowNLP
import jieba
import jieba.posseg as pseg
import jieba.analyse
class News:
    """Fetch news items and run simple Chinese-text analysis on them.

    The fetchers call into the ``ak`` module (CCTV news, CLS telegraph and
    per-stock news); the text helpers are thin wrappers around jieba and
    SnowNLP.
    """

    def __init__(self, trade_date=None):
        """Remember the trade date as a ``YYYYMMDD`` string.

        Args:
            trade_date: Object with ``strftime`` (normally a ``datetime``).
                ``None`` means "now", resolved on every call instead of once
                when the class body is first executed.
        """
        if trade_date is None:
            trade_date = datetime.now()
        self.trade_date = trade_date.strftime('%Y%m%d')

    @staticmethod
    def preprocess(text):
        """Segment ``text`` into a list of words with ``jieba.lcut``."""
        return jieba.lcut(text)

    @staticmethod
    def sentiment_analysis(text):
        """Return the SnowNLP sentiment score of ``text``.

        The score lies in the range 0-1: the closer to 1 the more positive,
        the closer to 0 the more negative.
        """
        return SnowNLP(text).sentiments

    @staticmethod
    def extract_keywords(text, top_k=5):
        """Return the ``top_k`` keywords of ``text`` (five by default)."""
        return jieba.analyse.extract_tags(
            text, topK=top_k, withWeight=False, allowPOS=()
        )

    @staticmethod
    def named_entity_recognition(text):
        """Return ``(word, flag)`` pairs for the named entities in ``text``.

        Only person names (``nr``), place names (``ns``), organisation names
        (``nt``) and other proper nouns (``nz``) are kept.
        """
        entity_flags = ('nr', 'ns', 'nt', 'nz')
        return [
            (word, flag)
            for word, flag in pseg.cut(text)
            if flag in entity_flags
        ]

    def result(self, news):
        """Print segmentation, sentiment, keywords and entities for ``news``."""
        print("原文:", news)
        preprocessed_news = self.preprocess(news)
        sentiment_score = self.sentiment_analysis(news)
        keywords = self.extract_keywords(news, top_k=3)
        entities = self.named_entity_recognition(news)
        print("分词结果:", preprocessed_news)
        print("情感分析得分:", round(sentiment_score, 2))
        print("关键词提取:", keywords)
        print("命名实体识别:", entities)

    def news_cctv(self):
        """Fetch CCTV news for ``self.trade_date``.

        Makes up to five attempts, pausing one second after each failure.

        Returns:
            Whatever ``ak.news_cctv`` returns, or ``None`` when every attempt
            raised.
        """
        for _ in range(5):
            try:
                # TODO: sentiment analysis of the titles is not wired in yet.
                return ak.news_cctv(date=self.trade_date)
            except Exception as e:
                # Show the exception itself; the traceback object's repr
                # carries no usable information.
                print(f"{self.trade_date} 日的新闻咨询拉取发生错误!{e!r}")
                time.sleep(1)
        print("中央新闻 5次出现错误请关注")
        return None

    def news_cls(self):
        """Fetch the CLS telegraph feed.

        Makes up to five attempts, pausing one second after each failure
        (same policy as ``news_cctv``).

        Returns:
            Whatever ``ak.stock_telegraph_cls`` returns, or ``None`` when
            every attempt raised.
        """
        for _ in range(5):
            try:
                # TODO: sentiment analysis of the titles is not wired in yet.
                return ak.stock_telegraph_cls()
            except Exception as e:
                print(f"{self.trade_date} 财联社-电报 咨询拉取发生错误!{e!r}")
                time.sleep(1)
        print("财联社-电报 5次出现错误请关注")
        return None

    @staticmethod
    def _parse_filter_date(date_time):
        """Turn a filter value into a ``datetime.date``.

        Accepts ``None`` (today), a ``datetime``, or anything whose ``str()``
        is formatted as ``YYYYMMDD`` or ``YYYY-MM-DD``.
        """
        if date_time is None:
            return datetime.now().date()
        if isinstance(date_time, datetime):
            return date_time.date()
        text = str(date_time)
        date_format = '%Y-%m-%d' if '-' in text else '%Y%m%d'
        return datetime.strptime(text, date_format).date()

    @staticmethod
    def news_stock_by_code(symbol, date_time=None):
        """Return the news of one stock published on a given day.

        Args:
            symbol: Stock code forwarded to ``ak.stock_news_em``.
            date_time: Day to keep, as ``YYYYMMDD`` or ``YYYY-MM-DD`` (a
                ``datetime`` is accepted too). ``None`` selects today.

        Returns:
            The rows whose publication date equals the requested day.

        Raises:
            ValueError: If ``date_time`` matches neither supported format.
        """
        stock_news_em_df = ak.stock_news_em(symbol=symbol)
        # Convert the publication-time column to datetimes so it can be
        # compared by calendar day.
        stock_news_em_df['发布时间'] = pd.to_datetime(stock_news_em_df['发布时间'])
        # The comparison target must be a date object: ``.dt.date`` yields
        # ``datetime.date`` values, which never compare equal to a string.
        target_date = News._parse_filter_date(date_time)
        today_news = stock_news_em_df[
            stock_news_em_df['发布时间'].dt.date == target_date
        ]
        print(today_news['新闻标题'])
        print(today_news["新闻内容"])
        return today_news

    def html_page_data(self):
        """Render the CCTV and CLS headlines as HTML table rows.

        A feed that could not be fetched (``None``) contributes an empty
        string instead of raising. Titles are HTML-escaped because they come
        from external feeds.

        Returns:
            Tuple ``(cctv_content, cls_content)`` of ``<tr>`` markup strings.
        """
        # Function-scope import so the file's import block stays untouched.
        from html import escape

        cctv_df = self.news_cctv()
        cls_df = self.news_cls()

        cctv_rows = []
        if cctv_df is not None:
            for _, row in cctv_df.iterrows():
                title = escape(str(row["title"]))
                cctv_rows.append(
                    f'<tr><th scope="row">{title}</th>'
                    f'<td>开发中...</td>'
                    '</tr>'
                )

        cls_rows = []
        if cls_df is not None:
            for _, row in cls_df.iterrows():
                title = row["标题"]
                # Skip missing or effectively empty titles.
                if not isinstance(title, str) or len(title) <= 1:
                    continue
                cls_rows.append(
                    f'<tr><th scope="row">{escape(title)}</th>'
                    f'<td>开发中...</td>'
                    '</tr>'
                )

        return ''.join(cctv_rows), ''.join(cls_rows)
if __name__ == '__main__':
    # Manual smoke run: fetch and print the CCTV news for the default date.
    # NOTE(review): date_obj is parsed but never handed to News(), so the
    # default date is used — confirm whether News(date_obj) was intended.
    date_obj = datetime.strptime('20231019', "%Y%m%d")
    cctv_news = News().news_cctv()
    print(cctv_news)