feat: 增加搜索词来源渠道 (record the source search keyword for crawled content)
parent d3c5111985
commit c70bd9e071
@@ -17,7 +17,7 @@ from base.base_crawler import AbstractCrawler
 from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
 from store import bilibili as bilibili_store
 from tools import utils
-from var import crawler_type_var
+from var import crawler_type_var, source_keyword_var
 
 from .client import BilibiliClient
 from .exception import DataFetchError
@@ -96,6 +96,7 @@ class BilibiliCrawler(AbstractCrawler):
         config.CRAWLER_MAX_NOTES_COUNT = bili_limit_count
         start_page = config.START_PAGE  # start page number
         for keyword in config.KEYWORDS.split(","):
+            source_keyword_var.set(keyword)
             utils.logger.info(
                 f"[BilibiliCrawler.search] Current search keyword: {keyword}")
             page = 1

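The Bilibili hunks above, and each platform crawler below, make the same two-line change: import source_keyword_var and call source_keyword_var.set(keyword) at the top of the keyword loop, so the store layer can later read the originating keyword from the context instead of having it passed through every call. A minimal, self-contained sketch of that pattern (fetch_page and save_item are illustrative placeholders, not functions from this repo):

    import asyncio
    from contextvars import ContextVar
    from typing import Dict, List

    # Mirrors var.py: empty default, so crawls that never search by keyword store "".
    source_keyword_var: ContextVar[str] = ContextVar("source_keyword", default="")

    async def save_item(item: Dict) -> None:
        # The store layer reads the ContextVar instead of taking the keyword as a parameter.
        item["source_keyword"] = source_keyword_var.get()
        print(item)

    async def fetch_page(keyword: str) -> List[Dict]:
        # Stand-in for a real platform search API call.
        return [{"id": f"{keyword}-1"}, {"id": f"{keyword}-2"}]

    async def search(keywords: str) -> None:
        for keyword in keywords.split(","):
            source_keyword_var.set(keyword)  # the same call the crawlers add in this commit
            for item in await fetch_page(keyword):
                await save_item(item)

    asyncio.run(search("keyword-a,keyword-b"))

Each saved item ends up tagged with the keyword that produced it, which is what the new source_keyword column records.
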
@@ -12,7 +12,7 @@ from base.base_crawler import AbstractCrawler
 from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
 from store import douyin as douyin_store
 from tools import utils
-from var import crawler_type_var
+from var import crawler_type_var, source_keyword_var
 
 from .client import DOUYINClient
 from .exception import DataFetchError
@@ -80,6 +80,7 @@ class DouYinCrawler(AbstractCrawler):
         config.CRAWLER_MAX_NOTES_COUNT = dy_limit_count
         start_page = config.START_PAGE # start page number
         for keyword in config.KEYWORDS.split(","):
+            source_keyword_var.set(keyword)
             utils.logger.info(f"[DouYinCrawler.search] Current keyword: {keyword}")
             aweme_list: List[str] = []
             page = 0

@@ -13,7 +13,7 @@ from base.base_crawler import AbstractCrawler
 from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
 from store import kuaishou as kuaishou_store
 from tools import utils
-from var import comment_tasks_var, crawler_type_var
+from var import comment_tasks_var, crawler_type_var, source_keyword_var
 
 from .client import KuaiShouClient
 from .exception import DataFetchError
@@ -85,6 +85,7 @@ class KuaishouCrawler(AbstractCrawler):
         config.CRAWLER_MAX_NOTES_COUNT = ks_limit_count
         start_page = config.START_PAGE
         for keyword in config.KEYWORDS.split(","):
+            source_keyword_var.set(keyword)
             utils.logger.info(f"[KuaishouCrawler.search] Current search keyword: {keyword}")
             page = 1
             while (page - start_page + 1) * ks_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:

@@ -14,7 +14,7 @@ from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
 from store import tieba as tieba_store
 from tools import utils
 from tools.crawler_util import format_proxy_info
-from var import crawler_type_var
+from var import crawler_type_var, source_keyword_var
 
 from .client import BaiduTieBaClient
 from .field import SearchNoteType, SearchSortType
@@ -74,6 +74,7 @@ class TieBaCrawler(AbstractCrawler):
         config.CRAWLER_MAX_NOTES_COUNT = tieba_limit_count
         start_page = config.START_PAGE
         for keyword in config.KEYWORDS.split(","):
+            source_keyword_var.set(keyword)
             utils.logger.info(f"[BaiduTieBaCrawler.search] Current search keyword: {keyword}")
             page = 1
             while (page - start_page + 1) * tieba_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:

@@ -18,7 +18,7 @@ from base.base_crawler import AbstractCrawler
 from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
 from store import weibo as weibo_store
 from tools import utils
-from var import crawler_type_var
+from var import crawler_type_var, source_keyword_var
 
 from .client import WeiboClient
 from .exception import DataFetchError
@@ -99,6 +99,7 @@ class WeiboCrawler(AbstractCrawler):
         config.CRAWLER_MAX_NOTES_COUNT = weibo_limit_count
         start_page = config.START_PAGE
         for keyword in config.KEYWORDS.split(","):
+            source_keyword_var.set(keyword)
             utils.logger.info(f"[WeiboCrawler.search] Current search keyword: {keyword}")
             page = 1
             while (page - start_page + 1) * weibo_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:

@@ -12,7 +12,7 @@ from base.base_crawler import AbstractCrawler
 from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
 from store import xhs as xhs_store
 from tools import utils
-from var import crawler_type_var
+from var import crawler_type_var, source_keyword_var
 
 from .client import XiaoHongShuClient
 from .exception import DataFetchError
@@ -94,6 +94,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
         config.CRAWLER_MAX_NOTES_COUNT = xhs_limit_count
         start_page = config.START_PAGE
         for keyword in config.KEYWORDS.split(","):
+            source_keyword_var.set(keyword)
             utils.logger.info(f"[XiaoHongShuCrawler.search] Current search keyword: {keyword}")
             page = 1
             while (page - start_page + 1) * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:

@@ -21,6 +21,7 @@ class TiebaNote(BaseModel):
     total_replay_num: int = Field(default=0, description="回复总数")
     total_replay_page: int = Field(default=0, description="回复总页数")
     ip_location: Optional[str] = Field(default="", description="IP地理位置")
+    source_keyword: str = Field(default="", description="来源关键词")
 
 
 class TiebaComment(BaseModel):

@@ -397,4 +397,12 @@ CREATE TABLE tieba_comment
     KEY `idx_tieba_comment_comment_id` (`note_id`),
     KEY `idx_tieba_comment_note_id` (`note_id`),
     KEY `idx_tieba_comment_publish_time` (`publish_time`)
 ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='贴吧评论表';
+
+-- 增加搜索来源关键字字段
+alter table bilibili_video add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
+alter table douyin_aweme add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
+alter table kuaishou_video add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
+alter table weibo_note add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
+alter table xhs_note add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';
+alter table tieba_note add column `source_keyword` varchar(255) default '' comment '搜索来源关键字';

@@ -6,6 +6,7 @@
 from typing import List
 
 import config
+from var import source_keyword_var
 
 from .bilibili_store_impl import *
 from .bilibilli_store_video import *
@@ -48,6 +49,7 @@ async def update_bilibili_video(video_item: Dict):
         "last_modify_ts": utils.get_current_timestamp(),
         "video_url": f"https://www.bilibili.com/video/av{video_id}",
         "video_cover_url": video_item_view.get("pic", ""),
+        "source_keyword": source_keyword_var.get(),
     }
     utils.logger.info(
         f"[store.bilibili.update_bilibili_video] bilibili video id:{video_id}, title:{save_content_item.get('title')}")

@@ -5,6 +5,7 @@
 from typing import List
 
 import config
+from var import source_keyword_var
 
 from .douyin_store_impl import *
 
@@ -48,7 +49,8 @@ async def update_douyin_aweme(aweme_item: Dict):
         "share_count": str(interact_info.get("share_count")),
         "ip_location": aweme_item.get("ip_label", ""),
         "last_modify_ts": utils.get_current_timestamp(),
-        "aweme_url": f"https://www.douyin.com/video/{aweme_id}"
+        "aweme_url": f"https://www.douyin.com/video/{aweme_id}",
+        "source_keyword": source_keyword_var.get(),
     }
     utils.logger.info(
         f"[store.douyin.update_douyin_aweme] douyin aweme id:{aweme_id}, title:{save_content_item.get('title')}")

@@ -5,6 +5,7 @@
 from typing import List
 
 import config
+from var import source_keyword_var
 
 from .kuaishou_store_impl import *
 
@@ -46,6 +47,7 @@ async def update_kuaishou_video(video_item: Dict):
         "video_url": f"https://www.kuaishou.com/short-video/{video_id}",
         "video_cover_url": photo_info.get("coverUrl", ""),
         "video_play_url": photo_info.get("photoUrl", ""),
+        "source_keyword": source_keyword_var.get(),
     }
     utils.logger.info(
         f"[store.kuaishou.update_kuaishou_video] Kuaishou video id:{video_id}, title:{save_content_item.get('title')}")

@@ -2,6 +2,7 @@
 from typing import List
 
 from model.m_baidu_tieba import TiebaComment, TiebaNote
+from var import source_keyword_var
 
 from . import tieba_store_impl
 from .tieba_store_impl import *
@@ -32,6 +33,7 @@ async def update_tieba_note(note_item: TiebaNote):
     Returns:
 
     """
+    note_item.source_keyword = source_keyword_var.get()
     save_note_item = note_item.model_dump()
     save_note_item.update({"last_modify_ts": utils.get_current_timestamp()})
     utils.logger.info(f"[store.tieba.update_tieba_note] tieba note: {save_note_item}")

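Unlike the dict-based stores above, the Tieba store persists a Pydantic model, so here the keyword is assigned to the field added in the TiebaNote model before model_dump() is called. A reduced sketch of that flow, with a trimmed two-field model used purely for illustration (the real TiebaNote has many more fields):

    from contextvars import ContextVar

    from pydantic import BaseModel, Field

    source_keyword_var: ContextVar[str] = ContextVar("source_keyword", default="")

    class TiebaNote(BaseModel):
        # Trimmed to two fields for illustration only.
        note_id: str = Field(..., description="帖子ID")
        source_keyword: str = Field(default="", description="来源关键词")

    def dump_for_store(note: TiebaNote) -> dict:
        # Same move as update_tieba_note above: write the ContextVar value onto the
        # model, then let model_dump() carry it into the stored record.
        note.source_keyword = source_keyword_var.get()
        return note.model_dump()

    source_keyword_var.set("keyword-a")
    print(dump_for_store(TiebaNote(note_id="123")))
    # {'note_id': '123', 'source_keyword': 'keyword-a'}
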
@@ -6,6 +6,7 @@
 import re
 from typing import List
 
+from var import source_keyword_var
 from .weibo_store_image import *
 from .weibo_store_impl import *
 
@@ -51,6 +52,8 @@ async def update_weibo_note(note_item: Dict):
         "gender": user_info.get("gender", ""),
         "profile_url": user_info.get("profile_url", ""),
         "avatar": user_info.get("profile_image_url", ""),
+
+        "source_keyword": source_keyword_var.get(),
     }
     utils.logger.info(
         f"[store.weibo.update_weibo_note] weibo note id:{note_id}, title:{save_content_item.get('content')[:24]} ...")

@@ -5,6 +5,7 @@
 from typing import List
 
 import config
+from var import source_keyword_var
 
 from . import xhs_store_impl
 from .xhs_store_image import *
@@ -78,6 +79,7 @@ async def update_xhs_note(note_item: Dict):
         "tag_list": ','.join([tag.get('name', '') for tag in tag_list if tag.get('type') == 'topic']),
         "last_modify_ts": utils.get_current_timestamp(),
         "note_url": f"https://www.xiaohongshu.com/explore/{note_id}?xsec_token={note_item.get('xsec_token')}&xsec_source=pc_search",
+        "source_keyword": source_keyword_var.get(),
     }
     utils.logger.info(f"[store.xhs.update_xhs_note] xhs note: {local_db_item}")
     await XhsStoreFactory.create_store().store_content(local_db_item)

var.py
@@ -11,3 +11,4 @@ crawler_type_var: ContextVar[str] = ContextVar("crawler_type", default="")
 comment_tasks_var: ContextVar[List[Task]] = ContextVar("comment_tasks", default=[])
 media_crawler_db_var: ContextVar[AsyncMysqlDB] = ContextVar("media_crawler_db_var")
 db_conn_pool_var: ContextVar[aiomysql.Pool] = ContextVar("db_conn_pool_var")
+source_keyword_var: ContextVar[str] = ContextVar("source_keyword", default="")
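source_keyword_var is a ContextVar, so the value set in a search loop is visible to coroutines awaited afterwards in the same context, and each asyncio.create_task() runs in a copy of the context active at its creation time; the default="" also keeps get() from raising LookupError in crawls that never call set(). A small self-contained check of that behavior (names here are illustrative, not from the repo):

    import asyncio
    from contextvars import ContextVar

    source_keyword_var: ContextVar[str] = ContextVar("source_keyword", default="")

    async def store_note(note_id: str) -> None:
        # Each task sees the keyword that was current when the task was created.
        print(note_id, "->", repr(source_keyword_var.get()))

    async def main() -> None:
        print(repr(source_keyword_var.get()))  # '' -- the default, nothing set yet

        source_keyword_var.set("keyword-a")
        first = asyncio.create_task(store_note("note-1"))   # captures "keyword-a"

        source_keyword_var.set("keyword-b")
        second = asyncio.create_task(store_note("note-2"))  # captures "keyword-b"

        await asyncio.gather(first, second)

    asyncio.run(main())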