qnloft-stock/fp/新闻资讯.py

125 lines
3.9 KiB
Python
Raw Permalink Normal View History

2023-11-24 02:49:22 +00:00
from utils.comm import *
from snownlp import SnowNLP
import jieba
import jieba.posseg as pseg
import jieba.analyse
class News:
    """Fetch Chinese financial news feeds and run simple NLP helpers on text.

    News is pulled through the ``ak`` module (presumably akshare, brought in
    by ``from utils.comm import *`` -- confirm); the text helpers use jieba
    and SnowNLP.
    """

    # Retry policy shared by the network-backed fetchers.
    _MAX_RETRIES = 5
    _RETRY_DELAY_SECONDS = 1

    def __init__(self, trade_date=None):
        """Store the day of interest as a ``YYYYMMDD`` string.

        Args:
            trade_date: Object with ``strftime`` (e.g. ``datetime``) naming
                the day to query. Defaults to the current time, resolved on
                every call.
        """
        # A ``datetime.now()`` default would be evaluated once, when the class
        # body runs, and every later instance would silently reuse that day.
        if trade_date is None:
            trade_date = datetime.now()
        self.trade_date = trade_date.strftime('%Y%m%d')

    @staticmethod
    def preprocess(text: str) -> list:
        """Segment ``text`` into a list of words with jieba."""
        return jieba.lcut(text)

    @staticmethod
    def sentiment_analysis(text: str) -> float:
        """Return the SnowNLP sentiment score of ``text``.

        The score lies in 0-1: closer to 1 is positive, closer to 0 negative.
        """
        return SnowNLP(text).sentiments

    @staticmethod
    def extract_keywords(text: str, top_k: int = 5) -> list:
        """Return the ``top_k`` keywords of ``text`` (no weights, no POS filter)."""
        return jieba.analyse.extract_tags(
            text, topK=top_k, withWeight=False, allowPOS=()
        )

    @staticmethod
    def named_entity_recognition(text: str) -> list:
        """Return ``(word, flag)`` pairs for words tagged as named entities.

        Kept flags: ``nr`` (person), ``ns`` (place), ``nt`` (organisation),
        ``nz`` (other proper noun).
        """
        entity_flags = ('nr', 'ns', 'nt', 'nz')
        return [
            (word, flag)
            for word, flag in pseg.cut(text)
            if flag in entity_flags
        ]

    def result(self, news):
        """Print segmentation, sentiment, keywords and entities for ``news``."""
        print("原文:", news)
        preprocessed_news = self.preprocess(news)
        sentiment_score = self.sentiment_analysis(news)
        keywords = self.extract_keywords(news, top_k=3)
        entities = self.named_entity_recognition(news)
        print("分词结果:", preprocessed_news)
        print("情感分析得分:", round(sentiment_score, 2))
        print("关键词提取:", keywords)
        print("命名实体识别:", entities)

    def news_cctv(self):
        """Fetch CCTV news for ``self.trade_date``, retrying on failure.

        Returns:
            Whatever ``ak.news_cctv`` returns (a DataFrame, judging by how
            ``html_page_data`` uses it), or ``None`` when every attempt failed.
        """
        # TODO: per-title sentiment analysis (``self.result``) is not wired in yet.
        for _ in range(self._MAX_RETRIES):
            try:
                return ak.news_cctv(date=self.trade_date)
            except Exception as e:
                # Show the exception itself; ``e.__traceback__`` only renders
                # as an opaque "<traceback object at 0x...>".
                print(f"{self.trade_date} 日的新闻咨询拉取发生错误!{e!r}")
                time.sleep(self._RETRY_DELAY_SECONDS)
        print(f"中央新闻 {self._MAX_RETRIES}次出现错误请关注")
        return None

    def news_cls(self):
        """Fetch the 财联社 telegraph feed, retrying on failure.

        Returns:
            Whatever ``ak.stock_telegraph_cls`` returns, or ``None`` when
            every attempt failed.
        """
        # TODO: per-title sentiment analysis (``self.result``) is not wired in yet.
        for _ in range(self._MAX_RETRIES):
            try:
                return ak.stock_telegraph_cls()
            except Exception as e:
                print(f"{self.trade_date} 财联社-电报 咨询拉取发生错误!{e!r}")
                # Back off between attempts, matching ``news_cctv``.
                time.sleep(self._RETRY_DELAY_SECONDS)
        print(f"财联社-电报 {self._MAX_RETRIES}次出现错误请关注")
        return None

    @staticmethod
    def _resolve_date(date_time=None):
        """Turn ``date_time`` into a ``datetime.date``.

        Accepts ``None`` (today), a ``datetime``, or anything whose ``str()``
        is ``YYYYMMDD`` or ``YYYY-MM-DD``.

        Raises:
            ValueError: If the text matches neither format.
        """
        if date_time is None:
            return datetime.now().date()
        if isinstance(date_time, datetime):
            return date_time.date()
        text = str(date_time)
        fmt = '%Y-%m-%d' if '-' in text else '%Y%m%d'
        return datetime.strptime(text, fmt).date()

    @staticmethod
    def news_stock_by_code(symbol, date_time=None):
        """Return the news of one stock published on a given day.

        Args:
            symbol: Stock code passed to ``ak.stock_news_em``.
            date_time: Day to keep, as ``YYYYMMDD`` / ``YYYY-MM-DD`` text or a
                ``datetime``. Defaults to today.

        Returns:
            The rows of the news table whose ``发布时间`` falls on that day.
        """
        stock_news_em_df = ak.stock_news_em(symbol=symbol)
        # Parse the publish-time column so it can be compared by calendar day.
        stock_news_em_df['发布时间'] = pd.to_datetime(stock_news_em_df['发布时间'])
        # Compare date objects with a date object; comparing against a
        # formatted string never matches and would always yield no rows.
        target_date = News._resolve_date(date_time)
        on_target_day = stock_news_em_df['发布时间'].dt.date == target_date
        today_news = stock_news_em_df[on_target_day]
        print(today_news['新闻标题'])
        print(today_news["新闻内容"])
        return today_news

    @staticmethod
    def _title_rows(df, column, min_length=0):
        """Render one HTML ``<tr>`` per title found in ``df[column]``.

        Args:
            df: Table to read, or ``None`` (rendered as an empty string).
            column: Name of the title column.
            min_length: When positive, skip titles that are not strings or
                are shorter than this many characters.
        """
        from html import escape

        if df is None:
            return ''
        rows = []
        for _, row in df.iterrows():
            title = row[column]
            if min_length and (
                not isinstance(title, str) or len(title) < min_length
            ):
                continue
            # Titles come from an external feed: escape them so markup
            # characters cannot break or inject into the page.
            safe_title = escape(f'{title}', quote=False)
            rows.append(
                f'<tr><th scope="row">{safe_title}</th>'
                f'<td>开发中...</td>'
                '</tr>'
            )
        return ''.join(rows)

    def html_page_data(self):
        """Build the HTML table rows for the CCTV and 财联社 feeds.

        Returns:
            ``(cctv_content, cls_content)``: two strings of concatenated
            ``<tr>`` elements. A feed that could not be fetched contributes
            an empty string.
        """
        cctv_content = self._title_rows(self.news_cctv(), "title")
        # 财联社 rows need a title longer than one character.
        cls_content = self._title_rows(self.news_cls(), "标题", min_length=2)
        return cctv_content, cls_content
if __name__ == '__main__':
    # Manual smoke run: print today's CCTV news.
    # To query a specific day instead, pass it in, e.g.
    #   News(datetime.strptime('20231019', "%Y%m%d")).news_cctv()
    print(News().news_cctv())