2024-10-19 16:43:25 +00:00
|
|
|
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
|
|
|
|
# 1. 不得用于任何商业用途。
|
|
|
|
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
|
|
|
|
# 3. 不得进行大规模爬取或对平台造成运营干扰。
|
|
|
|
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
|
|
|
|
# 5. 不得用于任何非法或不当的用途。
|
|
|
|
#
|
|
|
|
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
|
|
|
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
|
|
|
|
|
|
|
|
2024-06-12 07:33:39 +00:00
|
|
|
import asyncio
|
|
|
|
import json
|
2024-08-09 10:12:15 +00:00
|
|
|
import logging
|
2024-06-16 16:25:48 +00:00
|
|
|
from collections import Counter
|
|
|
|
|
|
|
|
import aiofiles
|
|
|
|
import jieba
|
2024-06-12 07:33:39 +00:00
|
|
|
import matplotlib.pyplot as plt
|
2024-06-16 16:25:48 +00:00
|
|
|
from wordcloud import WordCloud
|
|
|
|
|
2024-06-12 07:33:39 +00:00
|
|
|
import config
|
|
|
|
from tools import utils
|
|
|
|
|
|
|
|
plot_lock = asyncio.Lock()
|
|
|
|
|
|
|
|
class AsyncWordCloudGenerator:
|
|
|
|
def __init__(self):
|
2024-08-09 10:12:15 +00:00
|
|
|
logging.getLogger('jieba').setLevel(logging.WARNING)
|
2024-06-12 07:33:39 +00:00
|
|
|
self.stop_words_file = config.STOP_WORDS_FILE
|
|
|
|
self.lock = asyncio.Lock()
|
|
|
|
self.stop_words = self.load_stop_words()
|
|
|
|
self.custom_words = config.CUSTOM_WORDS
|
|
|
|
for word, group in self.custom_words.items():
|
|
|
|
jieba.add_word(word)
|
|
|
|
|
|
|
|
def load_stop_words(self):
|
|
|
|
with open(self.stop_words_file, 'r', encoding='utf-8') as f:
|
|
|
|
return set(f.read().strip().split('\n'))
|
|
|
|
|
|
|
|
async def generate_word_frequency_and_cloud(self, data, save_words_prefix):
|
|
|
|
all_text = ' '.join(item['content'] for item in data)
|
2024-08-06 07:24:23 +00:00
|
|
|
words = [word for word in jieba.lcut(all_text) if word not in self.stop_words and len(word.strip()) > 0]
|
2024-06-12 07:33:39 +00:00
|
|
|
word_freq = Counter(words)
|
|
|
|
|
|
|
|
# Save word frequency to file
|
|
|
|
freq_file = f"{save_words_prefix}_word_freq.json"
|
|
|
|
async with aiofiles.open(freq_file, 'w', encoding='utf-8') as file:
|
|
|
|
await file.write(json.dumps(word_freq, ensure_ascii=False, indent=4))
|
|
|
|
|
|
|
|
# Try to acquire the plot lock without waiting
|
|
|
|
if plot_lock.locked():
|
|
|
|
utils.logger.info("Skipping word cloud generation as the lock is held.")
|
|
|
|
return
|
|
|
|
|
|
|
|
await self.generate_word_cloud(word_freq, save_words_prefix)
|
|
|
|
|
|
|
|
async def generate_word_cloud(self, word_freq, save_words_prefix):
|
|
|
|
await plot_lock.acquire()
|
|
|
|
top_20_word_freq = {word: freq for word, freq in
|
|
|
|
sorted(word_freq.items(), key=lambda item: item[1], reverse=True)[:20]}
|
|
|
|
wordcloud = WordCloud(
|
|
|
|
font_path=config.FONT_PATH,
|
|
|
|
width=800,
|
|
|
|
height=400,
|
|
|
|
background_color='white',
|
|
|
|
max_words=200,
|
|
|
|
stopwords=self.stop_words,
|
|
|
|
colormap='viridis',
|
|
|
|
contour_color='steelblue',
|
|
|
|
contour_width=1
|
|
|
|
).generate_from_frequencies(top_20_word_freq)
|
|
|
|
|
|
|
|
# Save word cloud image
|
|
|
|
plt.figure(figsize=(10, 5), facecolor='white')
|
|
|
|
plt.imshow(wordcloud, interpolation='bilinear')
|
|
|
|
|
|
|
|
plt.axis('off')
|
|
|
|
plt.tight_layout(pad=0)
|
|
|
|
plt.savefig(f"{save_words_prefix}_word_cloud.png", format='png', dpi=300)
|
|
|
|
plt.close()
|
|
|
|
|
|
|
|
plot_lock.release()
|