diff --git a/media_platform/bilibili/core.py b/media_platform/bilibili/core.py
index d5060f1..334a487 100644
--- a/media_platform/bilibili/core.py
+++ b/media_platform/bilibili/core.py
@@ -17,7 +17,7 @@ from base.base_crawler import AbstractCrawler
 from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
 from store import bilibili as bilibili_store
 from tools import utils
-from var import crawler_type_var
+from var import crawler_type_var, source_keyword_var

 from .client import BilibiliClient
 from .exception import DataFetchError
@@ -96,6 +96,7 @@ class BilibiliCrawler(AbstractCrawler):
         config.CRAWLER_MAX_NOTES_COUNT = bili_limit_count
         start_page = config.START_PAGE  # start page number
         for keyword in config.KEYWORDS.split(","):
+            source_keyword_var.set(keyword)
             utils.logger.info(
                 f"[BilibiliCrawler.search] Current search keyword: {keyword}")
             page = 1
diff --git a/media_platform/douyin/core.py b/media_platform/douyin/core.py
index 395d7c0..9c79371 100644
--- a/media_platform/douyin/core.py
+++ b/media_platform/douyin/core.py
@@ -12,7 +12,7 @@ from base.base_crawler import AbstractCrawler
 from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
 from store import douyin as douyin_store
 from tools import utils
-from var import crawler_type_var
+from var import crawler_type_var, source_keyword_var

 from .client import DOUYINClient
 from .exception import DataFetchError
@@ -80,6 +80,7 @@ class DouYinCrawler(AbstractCrawler):
         config.CRAWLER_MAX_NOTES_COUNT = dy_limit_count
         start_page = config.START_PAGE  # start page number
         for keyword in config.KEYWORDS.split(","):
+            source_keyword_var.set(keyword)
             utils.logger.info(f"[DouYinCrawler.search] Current keyword: {keyword}")
             aweme_list: List[str] = []
             page = 0
diff --git a/media_platform/kuaishou/core.py b/media_platform/kuaishou/core.py
index a5eda58..9ea90e0 100644
--- a/media_platform/kuaishou/core.py
+++ b/media_platform/kuaishou/core.py
@@ -13,7 +13,7 @@ from base.base_crawler import AbstractCrawler
 from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
 from store import kuaishou as kuaishou_store
 from tools import utils
-from var import comment_tasks_var, crawler_type_var
+from var import comment_tasks_var, crawler_type_var, source_keyword_var

 from .client import KuaiShouClient
 from .exception import DataFetchError
@@ -85,6 +85,7 @@ class KuaishouCrawler(AbstractCrawler):
         config.CRAWLER_MAX_NOTES_COUNT = ks_limit_count
         start_page = config.START_PAGE
         for keyword in config.KEYWORDS.split(","):
+            source_keyword_var.set(keyword)
             utils.logger.info(f"[KuaishouCrawler.search] Current search keyword: {keyword}")
             page = 1
             while (page - start_page + 1) * ks_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
diff --git a/media_platform/tieba/core.py b/media_platform/tieba/core.py
index c8b8764..2edc2da 100644
--- a/media_platform/tieba/core.py
+++ b/media_platform/tieba/core.py
@@ -14,7 +14,7 @@ from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
 from store import tieba as tieba_store
 from tools import utils
 from tools.crawler_util import format_proxy_info
-from var import crawler_type_var
+from var import crawler_type_var, source_keyword_var

 from .client import BaiduTieBaClient
 from .field import SearchNoteType, SearchSortType
@@ -74,6 +74,7 @@ class TieBaCrawler(AbstractCrawler):
         config.CRAWLER_MAX_NOTES_COUNT = tieba_limit_count
         start_page = config.START_PAGE
         for keyword in config.KEYWORDS.split(","):
+            source_keyword_var.set(keyword)
             utils.logger.info(f"[BaiduTieBaCrawler.search] Current search keyword: {keyword}")
             page = 1
             while (page - start_page + 1) * tieba_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
diff --git a/media_platform/weibo/core.py b/media_platform/weibo/core.py
index 7683385..47c4fff 100644
--- a/media_platform/weibo/core.py
+++ b/media_platform/weibo/core.py
@@ -18,7 +18,7 @@ from base.base_crawler import AbstractCrawler
 from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
 from store import weibo as weibo_store
 from tools import utils
-from var import crawler_type_var
+from var import crawler_type_var, source_keyword_var

 from .client import WeiboClient
 from .exception import DataFetchError
@@ -99,6 +99,7 @@ class WeiboCrawler(AbstractCrawler):
         config.CRAWLER_MAX_NOTES_COUNT = weibo_limit_count
         start_page = config.START_PAGE
         for keyword in config.KEYWORDS.split(","):
+            source_keyword_var.set(keyword)
             utils.logger.info(f"[WeiboCrawler.search] Current search keyword: {keyword}")
             page = 1
             while (page - start_page + 1) * weibo_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
diff --git a/media_platform/xhs/core.py b/media_platform/xhs/core.py
index 3c39938..f176240 100644
--- a/media_platform/xhs/core.py
+++ b/media_platform/xhs/core.py
@@ -12,7 +12,7 @@ from base.base_crawler import AbstractCrawler
 from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
 from store import xhs as xhs_store
 from tools import utils
-from var import crawler_type_var
+from var import crawler_type_var, source_keyword_var

 from .client import XiaoHongShuClient
 from .exception import DataFetchError
@@ -94,6 +94,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
         config.CRAWLER_MAX_NOTES_COUNT = xhs_limit_count
         start_page = config.START_PAGE
         for keyword in config.KEYWORDS.split(","):
+            source_keyword_var.set(keyword)
             utils.logger.info(f"[XiaoHongShuCrawler.search] Current search keyword: {keyword}")
             page = 1
             while (page - start_page + 1) * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
diff --git a/model/m_baidu_tieba.py b/model/m_baidu_tieba.py
index a32250f..2f7d1b8 100644
--- a/model/m_baidu_tieba.py
+++ b/model/m_baidu_tieba.py
@@ -21,6 +21,7 @@ class TiebaNote(BaseModel):
     total_replay_num: int = Field(default=0, description="回复总数")
     total_replay_page: int = Field(default=0, description="回复总页数")
     ip_location: Optional[str] = Field(default="", description="IP地理位置")
+    source_keyword: str = Field(default="", description="来源关键词")


 class TiebaComment(BaseModel):
diff --git a/schema/tables.sql b/schema/tables.sql
index 3fc72da..da9e855 100644
--- a/schema/tables.sql
+++ b/schema/tables.sql
@@ -397,4 +397,12 @@ CREATE TABLE tieba_comment
     KEY `idx_tieba_comment_comment_id` (`note_id`),
     KEY `idx_tieba_comment_note_id` (`note_id`),
     KEY `idx_tieba_comment_publish_time` (`publish_time`)
-) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='贴吧评论表';
\ No newline at end of file
+) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='贴吧评论表';
+
+-- 增加搜索来源关键字字段
+alter table bilibili_video add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
+alter table douyin_aweme add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
+alter table kuaishou_video add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
+alter table weibo_note add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
+alter table xhs_note add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
+alter table tieba_note add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
diff --git a/store/bilibili/__init__.py b/store/bilibili/__init__.py
index 1bc52cd..04bd9a5 100644
--- a/store/bilibili/__init__.py
+++ b/store/bilibili/__init__.py
@@ -6,6 +6,7 @@
 from typing import List

 import config
+from var import source_keyword_var

 from .bilibili_store_impl import *
 from .bilibilli_store_video import *
@@ -48,6 +49,7 @@ async def update_bilibili_video(video_item: Dict):
         "last_modify_ts": utils.get_current_timestamp(),
         "video_url": f"https://www.bilibili.com/video/av{video_id}",
         "video_cover_url": video_item_view.get("pic", ""),
+        "source_keyword": source_keyword_var.get(),
     }
     utils.logger.info(
         f"[store.bilibili.update_bilibili_video] bilibili video id:{video_id}, title:{save_content_item.get('title')}")
diff --git a/store/douyin/__init__.py b/store/douyin/__init__.py
index 6b8a753..d064a04 100644
--- a/store/douyin/__init__.py
+++ b/store/douyin/__init__.py
@@ -5,6 +5,7 @@
 from typing import List

 import config
+from var import source_keyword_var

 from .douyin_store_impl import *

@@ -48,7 +49,8 @@ async def update_douyin_aweme(aweme_item: Dict):
         "share_count": str(interact_info.get("share_count")),
         "ip_location": aweme_item.get("ip_label", ""),
         "last_modify_ts": utils.get_current_timestamp(),
-        "aweme_url": f"https://www.douyin.com/video/{aweme_id}"
+        "aweme_url": f"https://www.douyin.com/video/{aweme_id}",
+        "source_keyword": source_keyword_var.get(),
     }
     utils.logger.info(
         f"[store.douyin.update_douyin_aweme] douyin aweme id:{aweme_id}, title:{save_content_item.get('title')}")
diff --git a/store/kuaishou/__init__.py b/store/kuaishou/__init__.py
index cfdcd29..efac378 100644
--- a/store/kuaishou/__init__.py
+++ b/store/kuaishou/__init__.py
@@ -5,6 +5,7 @@
 from typing import List

 import config
+from var import source_keyword_var

 from .kuaishou_store_impl import *

@@ -46,6 +47,7 @@ async def update_kuaishou_video(video_item: Dict):
         "video_url": f"https://www.kuaishou.com/short-video/{video_id}",
         "video_cover_url": photo_info.get("coverUrl", ""),
         "video_play_url": photo_info.get("photoUrl", ""),
+        "source_keyword": source_keyword_var.get(),
     }
     utils.logger.info(
         f"[store.kuaishou.update_kuaishou_video] Kuaishou video id:{video_id}, title:{save_content_item.get('title')}")
diff --git a/store/tieba/__init__.py b/store/tieba/__init__.py
index 788a93d..8de8876 100644
--- a/store/tieba/__init__.py
+++ b/store/tieba/__init__.py
@@ -2,6 +2,7 @@
 from typing import List

 from model.m_baidu_tieba import TiebaComment, TiebaNote
+from var import source_keyword_var

 from . import tieba_store_impl
 from .tieba_store_impl import *
@@ -32,6 +33,7 @@ async def update_tieba_note(note_item: TiebaNote):

     Returns:

     """
+    note_item.source_keyword = source_keyword_var.get()
     save_note_item = note_item.model_dump()
     save_note_item.update({"last_modify_ts": utils.get_current_timestamp()})
     utils.logger.info(f"[store.tieba.update_tieba_note] tieba note: {save_note_item}")
diff --git a/store/weibo/__init__.py b/store/weibo/__init__.py
index e3751dc..2e86a5d 100644
--- a/store/weibo/__init__.py
+++ b/store/weibo/__init__.py
@@ -6,6 +6,7 @@
 import re
 from typing import List

+from var import source_keyword_var
 from .weibo_store_image import *
 from .weibo_store_impl import *

@@ -51,6 +52,8 @@ async def update_weibo_note(note_item: Dict):
         "gender": user_info.get("gender", ""),
         "profile_url": user_info.get("profile_url", ""),
         "avatar": user_info.get("profile_image_url", ""),
+
+        "source_keyword": source_keyword_var.get(),
     }
     utils.logger.info(
         f"[store.weibo.update_weibo_note] weibo note id:{note_id}, title:{save_content_item.get('content')[:24]} ...")
diff --git a/store/xhs/__init__.py b/store/xhs/__init__.py
index 81e74a4..eb58f2d 100644
--- a/store/xhs/__init__.py
+++ b/store/xhs/__init__.py
@@ -5,6 +5,7 @@
 from typing import List

 import config
+from var import source_keyword_var

 from . import xhs_store_impl
 from .xhs_store_image import *
@@ -78,6 +79,7 @@ async def update_xhs_note(note_item: Dict):
         "tag_list": ','.join([tag.get('name', '') for tag in tag_list if tag.get('type') == 'topic']),
         "last_modify_ts": utils.get_current_timestamp(),
         "note_url": f"https://www.xiaohongshu.com/explore/{note_id}?xsec_token={note_item.get('xsec_token')}&xsec_source=pc_search",
+        "source_keyword": source_keyword_var.get(),
     }
     utils.logger.info(f"[store.xhs.update_xhs_note] xhs note: {local_db_item}")
     await XhsStoreFactory.create_store().store_content(local_db_item)
diff --git a/var.py b/var.py
index 9107e52..40bac24 100644
--- a/var.py
+++ b/var.py
@@ -11,3 +11,4 @@ crawler_type_var: ContextVar[str] = ContextVar("crawler_type", default="")
 comment_tasks_var: ContextVar[List[Task]] = ContextVar("comment_tasks", default=[])
 media_crawler_db_var: ContextVar[AsyncMysqlDB] = ContextVar("media_crawler_db_var")
 db_conn_pool_var: ContextVar[aiomysql.Pool] = ContextVar("db_conn_pool_var")
+source_keyword_var: ContextVar[str] = ContextVar("source_keyword", default="")
\ No newline at end of file
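
Note on the pattern (not part of the patch): the change threads the current search keyword from each crawler's search loop to the store layer through a contextvars.ContextVar instead of an extra function parameter. Below is a minimal, runnable sketch of that mechanism using only the standard library; every name except the ContextVar pattern itself (update_note, search, the demo keywords) is illustrative, not taken from the repository.

import asyncio
from contextvars import ContextVar

# Mirrors var.py: one ContextVar shared by the search loop and the store layer.
source_keyword_var: ContextVar[str] = ContextVar("source_keyword", default="")

async def update_note(note_item: dict) -> None:
    # The store layer reads the keyword from the ambient context, the same way
    # the patched store/*/__init__.py modules call source_keyword_var.get().
    note_item["source_keyword"] = source_keyword_var.get()
    print(note_item)

async def search() -> None:
    for keyword in "python,asyncio".split(","):  # stand-in for config.KEYWORDS
        source_keyword_var.set(keyword)  # the line each crawler's search() gains
        await update_note({"note_id": "demo"})  # sees the keyword set above

asyncio.run(search())

A ContextVar rather than a plain module-level variable keeps this safe under asyncio: each task spawned with asyncio.create_task gets a copy of the context at creation time, so concurrent searches for different keywords would not overwrite each other's value, while code awaited within the same task (as in the crawlers here) sees the keyword set just before the call.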