diff --git a/README.md b/README.md
index 443dabe..3931c32 100644
--- a/README.md
+++ b/README.md
@@ -17,13 +17,13 @@
## 功能列表
> 下面不支持的项目,相关的代码架构已经搭建好,只需要实现对应的方法即可,欢迎大家提交PR
-| 平台 | 关键词搜索 | 指定帖子ID爬取 | 二级评论 | 指定创作者主页 | 登录态缓存 | IP代理池 |
-|-----|-------|----------|-----|--------|-------|-------|
-| 小红书 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
-| 抖音 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
-| 快手 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ |
-| B 站 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
-| 微博 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ |
+| 平台 | 关键词搜索 | 指定帖子ID爬取 | 二级评论 | 指定创作者主页 | 登录态缓存 | IP代理池 | 生成评论词云图 |
+|-----|-------|----------|-----|--------|-------|-------|-------|
+| 小红书 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
+| 抖音 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
+| 快手 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ |
+| B 站 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ |
+| 微博 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ |
## 使用方法
@@ -186,4 +186,3 @@
-
diff --git a/config/base_config.py b/config/base_config.py
index ee55a87..4cd9844 100644
--- a/config/base_config.py
+++ b/config/base_config.py
@@ -95,3 +95,19 @@ DY_CREATOR_ID_LIST = [
"MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE",
# ........................
]
+
+# 词云相关
+# 是否开启生成评论词云图
+ENABLE_GET_WORDCLOUD = False
+# 自定义词语及其分组
+# 添加规则:xx:yy 其中 xx 为自定义添加的词组,yy 为将词组 xx 分到的组名。
+CUSTOM_WORDS = {
+ '零几': '年份', # 将“零几”识别为一个整体
+ '高频词': '专业术语' # 示例自定义词
+}
+
+# 停用(禁用)词文件路径
+STOP_WORDS_FILE = "./docs/hit_stopwords.txt"
+
+# 中文字体文件路径
+FONT_PATH= "./docs/STZHONGS.TTF"
\ No newline at end of file
diff --git a/docs/STZHONGS.TTF b/docs/STZHONGS.TTF
new file mode 100644
index 0000000..412dfcd
Binary files /dev/null and b/docs/STZHONGS.TTF differ
diff --git a/docs/hit_stopwords.txt b/docs/hit_stopwords.txt
new file mode 100644
index 0000000..1d1818e
--- /dev/null
+++ b/docs/hit_stopwords.txt
@@ -0,0 +1,768 @@
+\n
+———
+》),
+)÷(1-
+”,
+)、
+=(
+:
+→
+℃
+&
+*
+一一
+~~~~
+’
+.
+『
+.一
+./
+--
+』
+=″
+【
+[*]
+}>
+[⑤]]
+[①D]
+c]
+ng昉
+*
+//
+[
+]
+[②e]
+[②g]
+={
+}
+,也
+‘
+A
+[①⑥]
+[②B]
+[①a]
+[④a]
+[①③]
+[③h]
+③]
+1.
+--
+[②b]
+’‘
+×××
+[①⑧]
+0:2
+=[
+[⑤b]
+[②c]
+[④b]
+[②③]
+[③a]
+[④c]
+[①⑤]
+[①⑦]
+[①g]
+∈[
+[①⑨]
+[①④]
+[①c]
+[②f]
+[②⑧]
+[②①]
+[①C]
+[③c]
+[③g]
+[②⑤]
+[②②]
+一.
+[①h]
+.数
+[]
+[①B]
+数/
+[①i]
+[③e]
+[①①]
+[④d]
+[④e]
+[③b]
+[⑤a]
+[①A]
+[②⑧]
+[②⑦]
+[①d]
+[②j]
+〕〔
+][
+://
+′∈
+[②④
+[⑤e]
+12%
+b]
+...
+...................
+…………………………………………………③
+ZXFITL
+[③F]
+」
+[①o]
+]∧′=[
+∪φ∈
+′|
+{-
+②c
+}
+[③①]
+R.L.
+[①E]
+Ψ
+-[*]-
+↑
+.日
+[②d]
+[②
+[②⑦]
+[②②]
+[③e]
+[①i]
+[①B]
+[①h]
+[①d]
+[①g]
+[①②]
+[②a]
+f]
+[⑩]
+a]
+[①e]
+[②h]
+[②⑥]
+[③d]
+[②⑩]
+e]
+〉
+】
+元/吨
+[②⑩]
+2.3%
+5:0
+[①]
+::
+[②]
+[③]
+[④]
+[⑤]
+[⑥]
+[⑦]
+[⑧]
+[⑨]
+……
+——
+?
+、
+。
+“
+”
+《
+》
+!
+,
+:
+;
+?
+.
+,
+.
+'
+?
+·
+———
+──
+?
+—
+<
+>
+(
+)
+〔
+〕
+[
+]
+(
+)
+-
++
+~
+×
+/
+/
+①
+②
+③
+④
+⑤
+⑥
+⑦
+⑧
+⑨
+⑩
+Ⅲ
+В
+"
+;
+#
+@
+γ
+μ
+φ
+φ.
+×
+Δ
+■
+▲
+sub
+exp
+sup
+sub
+Lex
+#
+%
+&
+'
++
++ξ
+++
+-
+-β
+<
+<±
+<Δ
+<λ
+<φ
+<<
+=
+=
+=☆
+=-
+>
+>λ
+_
+~±
+~+
+[⑤f]
+[⑤d]
+[②i]
+≈
+[②G]
+[①f]
+LI
+㈧
+[-
+......
+〉
+[③⑩]
+第二
+一番
+一直
+一个
+一些
+许多
+种
+有的是
+也就是说
+末##末
+啊
+阿
+哎
+哎呀
+哎哟
+唉
+俺
+俺们
+按
+按照
+吧
+吧哒
+把
+罢了
+被
+本
+本着
+比
+比方
+比如
+鄙人
+彼
+彼此
+边
+别
+别的
+别说
+并
+并且
+不比
+不成
+不单
+不但
+不独
+不管
+不光
+不过
+不仅
+不拘
+不论
+不怕
+不然
+不如
+不特
+不惟
+不问
+不只
+朝
+朝着
+趁
+趁着
+乘
+冲
+除
+除此之外
+除非
+除了
+此
+此间
+此外
+从
+从而
+打
+待
+但
+但是
+当
+当着
+到
+得
+的
+的话
+等
+等等
+地
+第
+叮咚
+对
+对于
+多
+多少
+而
+而况
+而且
+而是
+而外
+而言
+而已
+尔后
+反过来
+反过来说
+反之
+非但
+非徒
+否则
+嘎
+嘎登
+该
+赶
+个
+各
+各个
+各位
+各种
+各自
+给
+根据
+跟
+故
+故此
+固然
+关于
+管
+归
+果然
+果真
+过
+哈
+哈哈
+呵
+和
+何
+何处
+何况
+何时
+嘿
+哼
+哼唷
+呼哧
+乎
+哗
+还是
+还有
+换句话说
+换言之
+或
+或是
+或者
+极了
+及
+及其
+及至
+即
+即便
+即或
+即令
+即若
+即使
+几
+几时
+己
+既
+既然
+既是
+继而
+加之
+假如
+假若
+假使
+鉴于
+将
+较
+较之
+叫
+接着
+结果
+借
+紧接着
+进而
+尽
+尽管
+经
+经过
+就
+就是
+就是说
+据
+具体地说
+具体说来
+开始
+开外
+靠
+咳
+可
+可见
+可是
+可以
+况且
+啦
+来
+来着
+离
+例如
+哩
+连
+连同
+两者
+了
+临
+另
+另外
+另一方面
+论
+嘛
+吗
+慢说
+漫说
+冒
+么
+每
+每当
+们
+莫若
+某
+某个
+某些
+拿
+哪
+哪边
+哪儿
+哪个
+哪里
+哪年
+哪怕
+哪天
+哪些
+哪样
+那
+那边
+那儿
+那个
+那会儿
+那里
+那么
+那么些
+那么样
+那时
+那些
+那样
+乃
+乃至
+呢
+能
+你
+你们
+您
+宁
+宁可
+宁肯
+宁愿
+哦
+呕
+啪达
+旁人
+呸
+凭
+凭借
+其
+其次
+其二
+其他
+其它
+其一
+其余
+其中
+起
+起见
+起见
+岂但
+恰恰相反
+前后
+前者
+且
+然而
+然后
+然则
+让
+人家
+任
+任何
+任凭
+如
+如此
+如果
+如何
+如其
+如若
+如上所述
+若
+若非
+若是
+啥
+上下
+尚且
+设若
+设使
+甚而
+甚么
+甚至
+省得
+时候
+什么
+什么样
+使得
+是
+是的
+首先
+谁
+谁知
+顺
+顺着
+似的
+虽
+虽然
+虽说
+虽则
+随
+随着
+所
+所以
+他
+他们
+他人
+它
+它们
+她
+她们
+倘
+倘或
+倘然
+倘若
+倘使
+腾
+替
+通过
+同
+同时
+哇
+万一
+往
+望
+为
+为何
+为了
+为什么
+为着
+喂
+嗡嗡
+我
+我们
+呜
+呜呼
+乌乎
+无论
+无宁
+毋宁
+嘻
+吓
+相对而言
+像
+向
+向着
+嘘
+呀
+焉
+沿
+沿着
+要
+要不
+要不然
+要不是
+要么
+要是
+也
+也罢
+也好
+一
+一般
+一旦
+一方面
+一来
+一切
+一样
+一则
+依
+依照
+矣
+以
+以便
+以及
+以免
+以至
+以至于
+以致
+抑或
+因
+因此
+因而
+因为
+哟
+用
+由
+由此可见
+由于
+有
+有的
+有关
+有些
+又
+于
+于是
+于是乎
+与
+与此同时
+与否
+与其
+越是
+云云
+哉
+再说
+再者
+在
+在下
+咱
+咱们
+则
+怎
+怎么
+怎么办
+怎么样
+怎样
+咋
+照
+照着
+者
+这
+这边
+这儿
+这个
+这会儿
+这就是说
+这里
+这么
+这么点儿
+这么些
+这么样
+这时
+这些
+这样
+正如
+吱
+之
+之类
+之所以
+之一
+只是
+只限
+只要
+只有
+至
+至于
+诸位
+着
+着呢
+自
+自从
+自个儿
+自各儿
+自己
+自家
+自身
+综上所述
+总的来看
+总的来说
+总的说来
+总而言之
+总之
+纵
+纵令
+纵然
+纵使
+遵照
+作为
+兮
+呃
+呗
+咚
+咦
+喏
+啐
+喔唷
+嗬
+嗯
+嗳
diff --git a/docs/常见问题.md b/docs/常见问题.md
index e01fd37..1b749d4 100644
--- a/docs/常见问题.md
+++ b/docs/常见问题.md
@@ -22,4 +22,10 @@ Q: 报错 `playwright._impl._api_types.TimeoutError: Timeout 30000ms exceeded.`<
A: 出现这种情况检查下开梯子没有
Q: 小红书扫码登录成功后如何手动验证?
-A: 打开 config/base_config.py 文件, 找到 HEADLESS 配置项, 将其设置为 False, 此时重启项目, 在浏览器中手动通过验证码
+A: 打开 config/base_config.py 文件, 找到 HEADLESS 配置项, 将其设置为 False, 此时重启项目, 在浏览器中手动通过验证码
+
+Q: 如何配置词云图的生成?
+A: 打开 config/base_config.py 文件, 找到 `ENABLE_GET_WORDCLOUD` 以及 `ENABLE_GET_COMMENTS` 两个配置项,将其都设为 True 即可使用该功能。
+
+Q: 如何给词云图添加禁用词和自定义词组?
+A: 打开 `docs/hit_stopwords.txt` 输入禁用词(注意一个词语一行)。打开 config/base_config.py 文件, 找到 `CUSTOM_WORDS` 按格式添加自定义词组即可。
diff --git a/docs/项目代码结构.md b/docs/项目代码结构.md
index baaa71f..ca076ed 100644
--- a/docs/项目代码结构.md
+++ b/docs/项目代码结构.md
@@ -29,7 +29,8 @@ MediaCrawler
│ ├── crawler_util.py # 爬虫相关的工具函数
│ ├── slider_util.py # 滑块相关的工具函数
│ ├── time_util.py # 时间相关的工具函数
-│ └── easing.py # 模拟滑动轨迹相关的函数
+│ ├── easing.py # 模拟滑动轨迹相关的函数
+│ └── words.py # 生成词云图相关的函数
├── db.py # DB ORM
├── main.py # 程序入口
├── var.py # 上下文变量定义
diff --git a/store/bilibili/bilibili_store_impl.py b/store/bilibili/bilibili_store_impl.py
index 018244d..7b93432 100644
--- a/store/bilibili/bilibili_store_impl.py
+++ b/store/bilibili/bilibili_store_impl.py
@@ -11,10 +11,11 @@ from typing import Dict
import aiofiles
+import config
from base.base_crawler import AbstractStore
from tools import utils
from var import crawler_type_var
-
+from tools import words
def calculate_number_of_files(file_store_path: str) -> int:
"""计算数据保存文件的前部分排序数字,支持每次运行代码不写到同一个文件中
@@ -130,12 +131,14 @@ class BiliDbStoreImplement(AbstractStore):
class BiliJsonStoreImplement(AbstractStore):
- json_store_path: str = "data/bilibili"
+ json_store_path: str = "data/bilibili/json"
+ words_store_path: str = "data/bilibili/words"
lock = asyncio.Lock()
file_count:int=calculate_number_of_files(json_store_path)
+ WordCloud = words.AsyncWordCloudGenerator()
- def make_save_file_name(self, store_type: str) -> str:
+ def make_save_file_name(self, store_type: str) -> (str,str):
"""
make save file name by store type
Args:
@@ -145,7 +148,10 @@ class BiliJsonStoreImplement(AbstractStore):
"""
- return f"{self.json_store_path}/{self.file_count}_{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json"
+ return (
+ f"{self.json_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json",
+ f"{self.words_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}"
+ )
async def save_data_to_json(self, save_item: Dict, store_type: str):
"""
@@ -158,7 +164,8 @@ class BiliJsonStoreImplement(AbstractStore):
"""
pathlib.Path(self.json_store_path).mkdir(parents=True, exist_ok=True)
- save_file_name = self.make_save_file_name(store_type=store_type)
+ pathlib.Path(self.words_store_path).mkdir(parents=True, exist_ok=True)
+ save_file_name,words_file_name_prefix = self.make_save_file_name(store_type=store_type)
save_data = []
async with self.lock:
@@ -170,6 +177,12 @@ class BiliJsonStoreImplement(AbstractStore):
async with aiofiles.open(save_file_name, 'w', encoding='utf-8') as file:
await file.write(json.dumps(save_data, ensure_ascii=False))
+ if config.ENABLE_GET_COMMENTS and config.ENABLE_GET_WORDCLOUD:
+ try:
+ await self.WordCloud.generate_word_frequency_and_cloud(save_data, words_file_name_prefix)
+ except:
+ pass
+
async def store_content(self, content_item: Dict):
"""
content JSON storage implementation
diff --git a/store/douyin/douyin_store_impl.py b/store/douyin/douyin_store_impl.py
index 6277b4c..a4672ee 100644
--- a/store/douyin/douyin_store_impl.py
+++ b/store/douyin/douyin_store_impl.py
@@ -12,8 +12,9 @@ from typing import Dict
import aiofiles
from base.base_crawler import AbstractStore
-from tools import utils
+from tools import utils,words
from var import crawler_type_var
+import config
def calculate_number_of_files(file_store_path: str) -> int:
@@ -162,11 +163,14 @@ class DouyinDbStoreImplement(AbstractStore):
await update_creator_by_user_id(user_id, creator)
class DouyinJsonStoreImplement(AbstractStore):
- json_store_path: str = "data/douyin"
+ json_store_path: str = "data/douyin/json"
+ words_store_path: str = "data/douyin/words"
+
lock = asyncio.Lock()
file_count: int = calculate_number_of_files(json_store_path)
+ WordCloud = words.AsyncWordCloudGenerator()
- def make_save_file_name(self, store_type: str) -> str:
+ def make_save_file_name(self, store_type: str) -> (str,str):
"""
make save file name by store type
Args:
@@ -176,8 +180,10 @@ class DouyinJsonStoreImplement(AbstractStore):
"""
- return f"{self.json_store_path}/{self.file_count}_{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json"
-
+ return (
+ f"{self.json_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json",
+ f"{self.words_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}"
+ )
async def save_data_to_json(self, save_item: Dict, store_type: str):
"""
Below is a simple way to save it in json format.
@@ -189,7 +195,8 @@ class DouyinJsonStoreImplement(AbstractStore):
"""
pathlib.Path(self.json_store_path).mkdir(parents=True, exist_ok=True)
- save_file_name = self.make_save_file_name(store_type=store_type)
+ pathlib.Path(self.words_store_path).mkdir(parents=True, exist_ok=True)
+ save_file_name,words_file_name_prefix = self.make_save_file_name(store_type=store_type)
save_data = []
async with self.lock:
@@ -201,6 +208,12 @@ class DouyinJsonStoreImplement(AbstractStore):
async with aiofiles.open(save_file_name, 'w', encoding='utf-8') as file:
await file.write(json.dumps(save_data, ensure_ascii=False))
+ if config.ENABLE_GET_COMMENTS and config.ENABLE_GET_WORDCLOUD:
+ try:
+ await self.WordCloud.generate_word_frequency_and_cloud(save_data, words_file_name_prefix)
+ except:
+ pass
+
async def store_content(self, content_item: Dict):
"""
content JSON storage implementation
diff --git a/store/kuaishou/kuaishou_store_impl.py b/store/kuaishou/kuaishou_store_impl.py
index 14b477a..4883daa 100644
--- a/store/kuaishou/kuaishou_store_impl.py
+++ b/store/kuaishou/kuaishou_store_impl.py
@@ -12,9 +12,9 @@ from typing import Dict
import aiofiles
from base.base_crawler import AbstractStore
-from tools import utils
+from tools import utils,words
from var import crawler_type_var
-
+import config
def calculate_number_of_files(file_store_path: str) -> int:
"""计算数据保存文件的前部分排序数字,支持每次运行代码不写到同一个文件中
@@ -131,12 +131,15 @@ class KuaishouDbStoreImplement(AbstractStore):
class KuaishouJsonStoreImplement(AbstractStore):
- json_store_path: str = "data/kuaishou"
+ json_store_path: str = "data/kuaishou/json"
+ words_store_path: str = "data/kuaishou/words"
lock = asyncio.Lock()
file_count:int=calculate_number_of_files(json_store_path)
+ WordCloud = words.AsyncWordCloudGenerator()
- def make_save_file_name(self, store_type: str) -> str:
+
+ def make_save_file_name(self, store_type: str) -> (str,str):
"""
make save file name by store type
Args:
@@ -146,8 +149,10 @@ class KuaishouJsonStoreImplement(AbstractStore):
"""
-
- return f"{self.json_store_path}/{self.file_count}_{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json"
+ return (
+ f"{self.json_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json",
+ f"{self.words_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}"
+ )
async def save_data_to_json(self, save_item: Dict, store_type: str):
"""
@@ -160,7 +165,8 @@ class KuaishouJsonStoreImplement(AbstractStore):
"""
pathlib.Path(self.json_store_path).mkdir(parents=True, exist_ok=True)
- save_file_name = self.make_save_file_name(store_type=store_type)
+ pathlib.Path(self.words_store_path).mkdir(parents=True, exist_ok=True)
+ save_file_name,words_file_name_prefix = self.make_save_file_name(store_type=store_type)
save_data = []
async with self.lock:
@@ -172,6 +178,12 @@ class KuaishouJsonStoreImplement(AbstractStore):
async with aiofiles.open(save_file_name, 'w', encoding='utf-8') as file:
await file.write(json.dumps(save_data, ensure_ascii=False))
+ if config.ENABLE_GET_COMMENTS and config.ENABLE_GET_WORDCLOUD:
+ try:
+ await self.WordCloud.generate_word_frequency_and_cloud(save_data, words_file_name_prefix)
+ except:
+ pass
+
async def store_content(self, content_item: Dict):
"""
content JSON storage implementation
diff --git a/store/weibo/weibo_store_impl.py b/store/weibo/weibo_store_impl.py
index 8bf09b4..fdd21d4 100644
--- a/store/weibo/weibo_store_impl.py
+++ b/store/weibo/weibo_store_impl.py
@@ -12,9 +12,9 @@ from typing import Dict
import aiofiles
from base.base_crawler import AbstractStore
-from tools import utils
+from tools import utils,words
from var import crawler_type_var
-
+import config
def calculate_number_of_files(file_store_path: str) -> int:
"""计算数据保存文件的前部分排序数字,支持每次运行代码不写到同一个文件中
@@ -132,12 +132,14 @@ class WeiboDbStoreImplement(AbstractStore):
class WeiboJsonStoreImplement(AbstractStore):
- json_store_path: str = "data/weibo"
+ json_store_path: str = "data/weibo/json"
+ words_store_path: str = "data/weibo/words"
lock = asyncio.Lock()
file_count:int=calculate_number_of_files(json_store_path)
+ WordCloud = words.AsyncWordCloudGenerator()
- def make_save_file_name(self, store_type: str) -> str:
+ def make_save_file_name(self, store_type: str) -> (str,str):
"""
make save file name by store type
Args:
@@ -147,7 +149,10 @@ class WeiboJsonStoreImplement(AbstractStore):
"""
- return f"{self.json_store_path}/{self.file_count}_{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json"
+ return (
+ f"{self.json_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json",
+ f"{self.words_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}"
+ )
async def save_data_to_json(self, save_item: Dict, store_type: str):
"""
@@ -160,7 +165,8 @@ class WeiboJsonStoreImplement(AbstractStore):
"""
pathlib.Path(self.json_store_path).mkdir(parents=True, exist_ok=True)
- save_file_name = self.make_save_file_name(store_type=store_type)
+ pathlib.Path(self.words_store_path).mkdir(parents=True, exist_ok=True)
+ save_file_name,words_file_name_prefix = self.make_save_file_name(store_type=store_type)
save_data = []
async with self.lock:
@@ -172,6 +178,12 @@ class WeiboJsonStoreImplement(AbstractStore):
async with aiofiles.open(save_file_name, 'w', encoding='utf-8') as file:
await file.write(json.dumps(save_data, ensure_ascii=False))
+ if config.ENABLE_GET_COMMENTS and config.ENABLE_GET_WORDCLOUD:
+ try:
+ await self.WordCloud.generate_word_frequency_and_cloud(save_data, words_file_name_prefix)
+ except:
+ pass
+
async def store_content(self, content_item: Dict):
"""
content JSON storage implementation
diff --git a/store/xhs/xhs_store_impl.py b/store/xhs/xhs_store_impl.py
index 63b5217..3204d0c 100644
--- a/store/xhs/xhs_store_impl.py
+++ b/store/xhs/xhs_store_impl.py
@@ -12,9 +12,9 @@ from typing import Dict
import aiofiles
from base.base_crawler import AbstractStore
-from tools import utils
+from tools import utils,words
from var import crawler_type_var
-
+import config
def calculate_number_of_files(file_store_path: str) -> int:
"""计算数据保存文件的前部分排序数字,支持每次运行代码不写到同一个文件中
@@ -161,11 +161,13 @@ class XhsDbStoreImplement(AbstractStore):
class XhsJsonStoreImplement(AbstractStore):
- json_store_path: str = "data/xhs"
+ json_store_path: str = "data/xhs/json"
+ words_store_path: str = "data/xhs/words"
lock = asyncio.Lock()
file_count:int=calculate_number_of_files(json_store_path)
+ WordCloud = words.AsyncWordCloudGenerator()
- def make_save_file_name(self, store_type: str) -> str:
+ def make_save_file_name(self, store_type: str) -> (str,str):
"""
make save file name by store type
Args:
@@ -175,7 +177,10 @@ class XhsJsonStoreImplement(AbstractStore):
"""
- return f"{self.json_store_path}/{self.file_count}_{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json"
+ return (
+ f"{self.json_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json",
+ f"{self.words_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}"
+ )
async def save_data_to_json(self, save_item: Dict, store_type: str):
"""
@@ -188,7 +193,8 @@ class XhsJsonStoreImplement(AbstractStore):
"""
pathlib.Path(self.json_store_path).mkdir(parents=True, exist_ok=True)
- save_file_name = self.make_save_file_name(store_type=store_type)
+ pathlib.Path(self.words_store_path).mkdir(parents=True, exist_ok=True)
+ save_file_name,words_file_name_prefix = self.make_save_file_name(store_type=store_type)
save_data = []
async with self.lock:
@@ -200,6 +206,11 @@ class XhsJsonStoreImplement(AbstractStore):
async with aiofiles.open(save_file_name, 'w', encoding='utf-8') as file:
await file.write(json.dumps(save_data, ensure_ascii=False))
+ if config.ENABLE_GET_COMMENTS and config.ENABLE_GET_WORDCLOUD:
+ try:
+ await self.WordCloud.generate_word_frequency_and_cloud(save_data, words_file_name_prefix)
+ except:
+ pass
async def store_content(self, content_item: Dict):
"""
content JSON storage implementation
diff --git a/tools/words.py b/tools/words.py
new file mode 100644
index 0000000..b7c2b00
--- /dev/null
+++ b/tools/words.py
@@ -0,0 +1,68 @@
+import aiofiles
+import asyncio
+import jieba
+from collections import Counter
+from wordcloud import WordCloud
+import json
+import matplotlib.pyplot as plt
+import config
+from tools import utils
+
+plot_lock = asyncio.Lock()
+
+class AsyncWordCloudGenerator:
+ def __init__(self):
+ self.stop_words_file = config.STOP_WORDS_FILE
+ self.lock = asyncio.Lock()
+ self.stop_words = self.load_stop_words()
+ self.custom_words = config.CUSTOM_WORDS
+ for word, group in self.custom_words.items():
+ jieba.add_word(word)
+
+ def load_stop_words(self):
+ with open(self.stop_words_file, 'r', encoding='utf-8') as f:
+ return set(f.read().strip().split('\n'))
+
+ async def generate_word_frequency_and_cloud(self, data, save_words_prefix):
+ all_text = ' '.join(item['content'] for item in data)
+ words = [word for word in jieba.lcut(all_text) if word not in self.stop_words]
+ word_freq = Counter(words)
+
+ # Save word frequency to file
+ freq_file = f"{save_words_prefix}_word_freq.json"
+ async with aiofiles.open(freq_file, 'w', encoding='utf-8') as file:
+ await file.write(json.dumps(word_freq, ensure_ascii=False, indent=4))
+
+ # Try to acquire the plot lock without waiting
+ if plot_lock.locked():
+ utils.logger.info("Skipping word cloud generation as the lock is held.")
+ return
+
+ await self.generate_word_cloud(word_freq, save_words_prefix)
+
+ async def generate_word_cloud(self, word_freq, save_words_prefix):
+ await plot_lock.acquire()
+ top_20_word_freq = {word: freq for word, freq in
+ sorted(word_freq.items(), key=lambda item: item[1], reverse=True)[:20]}
+ wordcloud = WordCloud(
+ font_path=config.FONT_PATH,
+ width=800,
+ height=400,
+ background_color='white',
+ max_words=200,
+ stopwords=self.stop_words,
+ colormap='viridis',
+ contour_color='steelblue',
+ contour_width=1
+ ).generate_from_frequencies(top_20_word_freq)
+
+ # Save word cloud image
+ plt.figure(figsize=(10, 5), facecolor='white')
+ plt.imshow(wordcloud, interpolation='bilinear')
+
+ plt.axis('off')
+ plt.tight_layout(pad=0)
+ plt.savefig(f"{save_words_prefix}_word_cloud.png", format='png', dpi=300)
+ plt.close()
+
+ plot_lock.release()
\ No newline at end of file