# MediaCrawler/tools/words.py
#
# 68 lines
# 2.4 KiB
# Python

import aiofiles
import asyncio
import jieba
from collections import Counter
from wordcloud import WordCloud
import json
import matplotlib.pyplot as plt
import config
from tools import utils
# Module-wide lock guarding matplotlib's global pyplot state: only one word
# cloud is rendered at a time, and callers skip rendering while it is held.
plot_lock = asyncio.Lock()


class AsyncWordCloudGenerator:
    """Build word-frequency JSON files and word-cloud images from text items.

    Stop words are read from ``config.STOP_WORDS_FILE`` and every key of
    ``config.CUSTOM_WORDS`` is registered with jieba when the instance is
    created.
    """

    def __init__(self):
        self.stop_words_file = config.STOP_WORDS_FILE
        self.lock = asyncio.Lock()
        self.stop_words = self.load_stop_words()
        self.custom_words = config.CUSTOM_WORDS
        # Only the keys are registered with the tokenizer; the mapped values
        # are not used here.
        for word in self.custom_words:
            jieba.add_word(word)

    def load_stop_words(self) -> set:
        """Return the stop words in ``self.stop_words_file`` as a set.

        The file is read as UTF-8, surrounding whitespace is stripped from
        the whole content, and each remaining line becomes one entry.
        """
        with open(self.stop_words_file, 'r', encoding='utf-8') as f:
            return set(f.read().strip().split('\n'))

    async def generate_word_frequency_and_cloud(self, data, save_words_prefix: str) -> None:
        """Write word frequencies for ``data`` and, if possible, a word cloud.

        Args:
            data: Iterable of mappings; the ``'content'`` value of each item
                is joined with spaces and tokenized with ``jieba.lcut``.
            save_words_prefix: Path prefix for the generated files. The
                frequency table goes to ``<prefix>_word_freq.json``.

        The word cloud is skipped (with an info log) when another rendering
        currently holds ``plot_lock``; the frequency file is written either
        way.
        """
        all_text = ' '.join(item['content'] for item in data)
        words = [word for word in jieba.lcut(all_text) if word not in self.stop_words]
        word_freq = Counter(words)

        # Save word frequency to file
        freq_file = f"{save_words_prefix}_word_freq.json"
        async with aiofiles.open(freq_file, 'w', encoding='utf-8') as file:
            await file.write(json.dumps(word_freq, ensure_ascii=False, indent=4))

        # Do not queue behind an in-progress rendering: skip instead of waiting.
        if plot_lock.locked():
            utils.logger.info("Skipping word cloud generation as the lock is held.")
            return

        await self.generate_word_cloud(word_freq, save_words_prefix)

    async def generate_word_cloud(self, word_freq, save_words_prefix: str) -> None:
        """Render the 20 most frequent words to ``<prefix>_word_cloud.png``.

        Args:
            word_freq: Mapping of word to count.
            save_words_prefix: Path prefix for the output image.

        ``plot_lock`` is held for the duration of the rendering and is always
        released, even when rendering or saving raises. Releasing on failure
        matters because ``generate_word_frequency_and_cloud`` skips word-cloud
        generation whenever the lock is held; a lock leaked by one failed
        render would disable all later renders. Exceptions from the word-cloud
        or plotting calls propagate to the caller.
        """
        async with plot_lock:
            ranked = sorted(word_freq.items(), key=lambda item: item[1], reverse=True)
            top_20_word_freq = dict(ranked[:20])

            wordcloud = WordCloud(
                font_path=config.FONT_PATH,
                width=800,
                height=400,
                background_color='white',
                max_words=200,
                stopwords=self.stop_words,
                colormap='viridis',
                contour_color='steelblue',
                contour_width=1,
            ).generate_from_frequencies(top_20_word_freq)

            # Save word cloud image
            fig = plt.figure(figsize=(10, 5), facecolor='white')
            try:
                plt.imshow(wordcloud, interpolation='bilinear')
                plt.axis('off')
                plt.tight_layout(pad=0)
                plt.savefig(f"{save_words_prefix}_word_cloud.png", format='png', dpi=300)
            finally:
                # Close this figure even on failure so it is not left open.
                plt.close(fig)