diff --git a/README.md b/README.md index 5755a15..7cd0d3c 100644 --- a/README.md +++ b/README.md @@ -23,13 +23,14 @@ ## 功能列表 | 平台 | 关键词搜索 | 指定帖子ID爬取 | 二级评论 | 指定创作者主页 | 登录态缓存 | IP代理池 | 生成评论词云图 | -|-----|-------|----------|-----|--------|-------|-------|-------| -| 小红书 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| 抖音 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| 快手 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| B 站 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| 微博 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| 贴吧 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +|-----|-------|---------|-----|--------|-------|-------|-------| +| 小红书 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| 抖音 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| 快手 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| B 站 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| 微博 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| 贴吧 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| 知乎 | ✅ | ❌ | ✅ | ❌ | ✅ | ✅ | ✅ | ## 使用方法 @@ -186,6 +187,7 @@ PS:如果打赏时请备注捐赠者,如有遗漏请联系我添加(有时 | 捐赠者 | 捐赠金额 | 捐赠日期 | |-------------|-------|------------| +| Urtb* | 100 元 | 2024-09-07 | | Tornado | 66 元 | 2024-09-04 | | srhedbj | 50 元 | 2024-08-20 | | *嘉 | 20 元 | 2024-08-15 | diff --git a/cmd_arg/arg.py b/cmd_arg/arg.py index 65819a1..412d076 100644 --- a/cmd_arg/arg.py +++ b/cmd_arg/arg.py @@ -7,8 +7,8 @@ from tools.utils import str2bool async def parse_cmd(): # 读取command arg parser = argparse.ArgumentParser(description='Media crawler program.') - parser.add_argument('--platform', type=str, help='Media platform select (xhs | dy | ks | bili | wb | tieba)', - choices=["xhs", "dy", "ks", "bili", "wb", "tieba"], default=config.PLATFORM) + parser.add_argument('--platform', type=str, help='Media platform select (xhs | dy | ks | bili | wb | tieba | zhihu)', + choices=["xhs", "dy", "ks", "bili", "wb", "tieba", "zhihu"], default=config.PLATFORM) parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)', choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE) parser.add_argument('--type', type=str, help='crawler type (search | detail | creator)', diff --git a/config/base_config.py b/config/base_config.py index 0a03af8..2cb0b6c 100644 --- a/config/base_config.py +++ b/config/base_config.py @@ -27,8 +27,8 @@ HEADLESS = False # 是否保存登录状态 SAVE_LOGIN_STATE = True -# 数据保存类型选项配置,支持三种类型:csv、db、json -SAVE_DATA_OPTION = "csv" # csv or db or json +# 数据保存类型选项配置,支持三种类型:csv、db、json, 最好保存到DB,有排重的功能。 +SAVE_DATA_OPTION = "json" # csv or db or json # 用户浏览器缓存的浏览器文件配置 USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name diff --git a/constant/zhihu.py b/constant/zhihu.py new file mode 100644 index 0000000..629b4b4 --- /dev/null +++ b/constant/zhihu.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- +ZHIHU_URL = "https://www.zhihu.com" + +ANSWER_NAME = "answer" +ARTICLE_NAME = "article" +VIDEO_NAME = "zvideo" \ No newline at end of file diff --git a/docs/常见问题.md b/docs/常见问题.md index 1b749d4..d2410fc 100644 --- a/docs/常见问题.md +++ b/docs/常见问题.md @@ -1,7 +1,7 @@ ## 常见程序运行出错问题 -Q: 爬取抖音报错: `execjs._exceptions.ProgramError: SyntaxError: 缺少 ';'`
-A: 该错误为缺少 nodejs 环境,这个错误可以通过安装 nodejs 环境来解决,版本为:`v16.8.0`
+Q: 爬取知乎报错: `execjs._exceptions.ProgramError: SyntaxError: 缺少 ';'`
+A: 该错误是由于缺少 nodejs 环境导致的,安装 nodejs 即可解决,版本需大于等于 `v16`<br>
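+可以用下面的小脚本确认 execjs 实际选用的 JS 运行时以及本机 Node 版本(示意代码,假设已安装 PyExecJS 且 `node` 在 PATH 中):
+```python
+import subprocess
+
+import execjs
+
+# execjs 会自动探测可用的 JS 运行时,预期输出类似 "Node.js (V8)"
+print(execjs.get().name)
+# 确认 Node 版本满足 v16 及以上
+print(subprocess.run(["node", "--version"], capture_output=True, text=True).stdout.strip())
+```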
Q: 使用Cookie爬取抖音报错: execjs._exceptions.ProgramError: TypeError: Cannot read property 'JS_MD5_NO_COMMON_JS' of null A: windows电脑去网站下载`https://nodejs.org/en/blog/release/v16.8.0` Windows 64-bit Installer 版本,一直下一步即可。 diff --git a/libs/zhihu.js b/libs/zhihu.js new file mode 100644 index 0000000..8048218 --- /dev/null +++ b/libs/zhihu.js @@ -0,0 +1,166 @@ +// copy from https://github.com/tiam-bloom/zhihuQuestionAnswer/blob/main/zhihuvmp.js thanks to tiam-bloom +// 仅供学习交流使用,严禁用于商业用途,也不要滥用,否则后果自负 +// modified by relakkes + +const crypto = require('crypto'); // 导入加密模块 + + +let init_str = "6fpLRqJO8M/c3jnYxFkUVC4ZIG12SiH=5v0mXDazWBTsuw7QetbKdoPyAl+hN9rgE"; +var h = { + zk: [1170614578, 1024848638, 1413669199, -343334464, -766094290, -1373058082, -143119608, -297228157, 1933479194, -971186181, -406453910, 460404854, -547427574, -1891326262, -1679095901, 2119585428, -2029270069, 2035090028, -1521520070, -5587175, -77751101, -2094365853, -1243052806, 1579901135, 1321810770, 456816404, -1391643889, -229302305, 330002838, -788960546, 363569021, -1947871109], + zb: [20, 223, 245, 7, 248, 2, 194, 209, 87, 6, 227, 253, 240, 128, 222, 91, 237, 9, 125, 157, 230, 93, 252, 205, 90, 79, 144, 199, 159, 197, 186, 167, 39, 37, 156, 198, 38, 42, 43, 168, 217, 153, 15, 103, 80, 189, 71, 191, 97, 84, 247, 95, 36, 69, 14, 35, 12, 171, 28, 114, 178, 148, 86, 182, 32, 83, 158, 109, 22, 255, 94, 238, 151, 85, 77, 124, 254, 18, 4, 26, 123, 176, 232, 193, 131, 172, 143, 142, 150, 30, 10, 146, 162, 62, 224, 218, 196, 229, 1, 192, 213, 27, 110, 56, 231, 180, 138, 107, 242, 187, 54, 120, 19, 44, 117, 228, 215, 203, 53, 239, 251, 127, 81, 11, 133, 96, 204, 132, 41, 115, 73, 55, 249, 147, 102, 48, 122, 145, 106, 118, 74, 190, 29, 16, 174, 5, 177, 129, 63, 113, 99, 31, 161, 76, 246, 34, 211, 13, 60, 68, 207, 160, 65, 111, 82, 165, 67, 169, 225, 57, 112, 244, 155, 51, 236, 200, 233, 58, 61, 47, 100, 137, 185, 64, 17, 70, 234, 163, 219, 108, 170, 166, 59, 149, 52, 105, 24, 212, 78, 173, 45, 0, 116, 226, 119, 136, 206, 135, 175, 195, 25, 92, 121, 208, 126, 139, 3, 75, 141, 21, 130, 98, 241, 40, 154, 66, 184, 49, 181, 46, 243, 88, 101, 183, 8, 23, 72, 188, 104, 179, 210, 134, 250, 201, 164, 89, 216, 202, 220, 50, 221, 152, 140, 33, 235, 214] + +}; + +function i(e, t, n) { + t[n] = 255 & e >>> 24, + t[n + 1] = 255 & e >>> 16, + t[n + 2] = 255 & e >>> 8, + t[n + 3] = 255 & e +} + +function Q(e, t) { + return (4294967295 & e) << t | e >>> 32 - t +} + +function B(e, t) { + return (255 & e[t]) << 24 | (255 & e[t + 1]) << 16 | (255 & e[t + 2]) << 8 | 255 & e[t + 3] +} + +function G(e) { + var t = new Array(4) + , n = new Array(4); + i(e, t, 0), + n[0] = h.zb[255 & t[0]], + n[1] = h.zb[255 & t[1]], + n[2] = h.zb[255 & t[2]], + n[3] = h.zb[255 & t[3]]; + + var r = B(n, 0); + return r ^ Q(r, 2) ^ Q(r, 10) ^ Q(r, 18) ^ Q(r, 24) +} + +function array_0_16_offset(e) { + var t = new Array(16) + , n = new Array(36); + n[0] = B(e, 0), + n[1] = B(e, 4), + n[2] = B(e, 8), + n[3] = B(e, 12); + for (var r = 0; r < 32; r++) { + var o = G(n[r + 1] ^ n[r + 2] ^ n[r + 3] ^ h.zk[r]); + n[r + 4] = n[r] ^ o + } + return i(n[35], t, 0), + i(n[34], t, 4), + i(n[33], t, 8), + i(n[32], t, 12), + t + +} + +function array_16_48_offset(e, t) { + for (var n = [], r = e.length, i = 0; 0 < r; r -= 16) { + for (var o = e.slice(16 * i, 16 * (i + 1)), a = new Array(16), c = 0; c < 16; c++) + a[c] = o[c] ^ t[c]; + t = array_0_16_offset(a), + n = n.concat(t), + i++ + } + return n +} + +function encode_0_16(array_0_16) { + let result = []; + let array_offset = [48, 53, 57, 
48, 53, 51, 102, 55, 100, 49, 53, 101, 48, 49, 100, 55]; + for (let i = 0; i < array_0_16.length; i++) { + let a = array_0_16[i] ^ array_offset[i], + b = a ^ 42; + result.push(b) + } + return array_0_16_offset(result) +} + +function encode(ar) { + let b = ar[1] << 8, + c = ar[0] | b, + d = ar[2] << 16, + e = c | d, + result_array = [], + x6 = 6; + result_array.push(e & 63); + while (result_array.length < 4) { + let a = e >>> x6; + result_array.push(a & 63); + x6 += 6; + } + return result_array +} + +function get_init_array(encode_md5) { + let init_array = [] + for (let i = 0; i < encode_md5.length; i++) { + init_array.push(encode_md5.charCodeAt(i)) + } + init_array.unshift(0) + init_array.unshift(Math.floor(Math.random() * 127)) + while (init_array.length < 48) { + init_array.push(14) + } + let array_0_16 = encode_0_16(init_array.slice(0, 16)), + array_16_48 = array_16_48_offset(init_array.slice(16, 48), array_0_16), + array_result = array_0_16.concat(array_16_48); + return array_result +} + +function get_zse_96(encode_md5) { + let result_array = [], + init_array = get_init_array(encode_md5), + result = ""; + for (let i = 47; i >= 0; i -= 4) { + init_array[i] ^= 58 + } + init_array.reverse() + for (let j = 3; j <= init_array.length; j += 3) { + let ar = init_array.slice(j - 3, j); + result_array = result_array.concat(encode(ar)) + } + for (let index = 0; index < result_array.length; index++) { + result += init_str.charAt(result_array[index]) + } + result = '2.0_' + result + return result +} + +/***********************relakkes modify*******************************************************/ + +/** + * 从cookies中提取dc0的值 + * @param cookies + * @returns {string} + */ +const extract_dc0_value_from_cookies = function (cookies) { + const t9 = RegExp("d_c0=([^;]+)") + const tt = t9.exec(cookies); + const dc0 = tt && tt[1] + return tt && tt[1] +} + +/** + * 获取zhihu sign value 对python暴漏的接口 + * @param url 请求的路由参数 + * @param cookies 请求的cookies,需要包含dc0这个key + * @returns {*} + */ +function get_sign(url, cookies) { + const ta = "101_3_3.0" + const dc0 = extract_dc0_value_from_cookies(cookies) + const tc = "3_2.0aR_sn77yn6O92wOB8hPZnQr0EMYxc4f18wNBUgpTQ6nxERFZfTY0-4Lm-h3_tufIwJS8gcxTgJS_AuPZNcXCTwxI78YxEM20s4PGDwN8gGcYAupMWufIoLVqr4gxrRPOI0cY7HL8qun9g93mFukyigcmebS_FwOYPRP0E4rZUrN9DDom3hnynAUMnAVPF_PhaueTFH9fQL39OCCqYTxfb0rfi9wfPhSM6vxGDJo_rBHpQGNmBBLqPJHK2_w8C9eTVMO9Z9NOrMtfhGH_DgpM-BNM1DOxScLG3gg1Hre1FCXKQcXKkrSL1r9GWDXMk8wqBLNmbRH96BtOFqVZ7UYG3gC8D9cMS7Y9UrHLVCLZPJO8_CL_6GNCOg_zhJS8PbXmGTcBpgxfkieOPhNfthtf2gC_qD3YOce8nCwG2uwBOqeMoML9NBC1xb9yk6SuJhHLK7SM6LVfCve_3vLKlqcL6TxL_UosDvHLxrHmWgxBQ8Xs" + const params_join_str = [ta, url, dc0, tc].join("+") + const params_md5_value = crypto.createHash('md5').update(params_join_str).digest('hex') + + return { + "x-zst-81": tc, + "x-zse-96": get_zse_96(params_md5_value), + } +} diff --git a/main.py b/main.py index 15aa36d..c84fcc8 100644 --- a/main.py +++ b/main.py @@ -11,6 +11,7 @@ from media_platform.kuaishou import KuaishouCrawler from media_platform.tieba import TieBaCrawler from media_platform.weibo import WeiboCrawler from media_platform.xhs import XiaoHongShuCrawler +from media_platform.zhihu import ZhihuCrawler class CrawlerFactory: @@ -20,7 +21,8 @@ class CrawlerFactory: "ks": KuaishouCrawler, "bili": BilibiliCrawler, "wb": WeiboCrawler, - "tieba": TieBaCrawler + "tieba": TieBaCrawler, + "zhihu": ZhihuCrawler } @staticmethod diff --git a/media_platform/zhihu/__init__.py b/media_platform/zhihu/__init__.py new file mode 100644 index 0000000..a16a066 --- 
/dev/null +++ b/media_platform/zhihu/__init__.py @@ -0,0 +1,2 @@ +# -*- coding: utf-8 -*- +from .core import ZhihuCrawler \ No newline at end of file diff --git a/media_platform/zhihu/client.py b/media_platform/zhihu/client.py new file mode 100644 index 0000000..ba29fbe --- /dev/null +++ b/media_platform/zhihu/client.py @@ -0,0 +1,319 @@ +# -*- coding: utf-8 -*- +import asyncio +import json +from typing import Any, Callable, Dict, List, Optional, Union +from urllib.parse import urlencode + +import httpx +from playwright.async_api import BrowserContext, Page +from tenacity import retry, stop_after_attempt, wait_fixed + +import config +from base.base_crawler import AbstractApiClient +from constant import zhihu as zhihu_constant +from model.m_zhihu import ZhihuComment, ZhihuContent +from tools import utils + +from .exception import DataFetchError, ForbiddenError +from .field import SearchSort, SearchTime, SearchType +from .help import ZhiHuJsonExtractor, sign + + +class ZhiHuClient(AbstractApiClient): + def __init__( + self, + timeout=10, + proxies=None, + *, + headers: Dict[str, str], + playwright_page: Page, + cookie_dict: Dict[str, str], + ): + self.proxies = proxies + self.timeout = timeout + self.default_headers = headers + self.cookie_dict = cookie_dict + self._extractor = ZhiHuJsonExtractor() + + async def _pre_headers(self, url: str) -> Dict: + """ + 请求头参数签名 + Args: + url: 请求的URL需要包含请求的参数 + Returns: + + """ + d_c0 = self.cookie_dict.get("d_c0") + if not d_c0: + raise Exception("d_c0 not found in cookies") + sign_res = sign(url, self.default_headers["cookie"]) + headers = self.default_headers.copy() + headers['x-zst-81'] = sign_res["x-zst-81"] + headers['x-zse-96'] = sign_res["x-zse-96"] + return headers + + @retry(stop=stop_after_attempt(3), wait=wait_fixed(1)) + async def request(self, method, url, **kwargs) -> Union[str, Any]: + """ + 封装httpx的公共请求方法,对请求响应做一些处理 + Args: + method: 请求方法 + url: 请求的URL + **kwargs: 其他请求参数,例如请求头、请求体等 + + Returns: + + """ + # return response.text + return_response = kwargs.pop('return_response', False) + + async with httpx.AsyncClient(proxies=self.proxies, ) as client: + response = await client.request( + method, url, timeout=self.timeout, + **kwargs + ) + + if response.status_code != 200: + utils.logger.error(f"[ZhiHuClient.request] Requset Url: {url}, Request error: {response.text}") + if response.status_code == 403: + raise ForbiddenError(response.text) + elif response.status_code == 404: # 如果一个content没有评论也是404 + return {} + + raise DataFetchError(response.text) + + if return_response: + return response.text + try: + data: Dict = response.json() + if data.get("error"): + utils.logger.error(f"[ZhiHuClient.request] Request error: {data}") + raise DataFetchError(data.get("error", {}).get("message")) + return data + except json.JSONDecodeError: + utils.logger.error(f"[ZhiHuClient.request] Request error: {response.text}") + raise DataFetchError(response.text) + + + async def get(self, uri: str, params=None) -> Dict: + """ + GET请求,对请求头签名 + Args: + uri: 请求路由 + params: 请求参数 + + Returns: + + """ + final_uri = uri + if isinstance(params, dict): + final_uri += '?' 
+ urlencode(params) + headers = await self._pre_headers(final_uri) + return await self.request(method="GET", url=zhihu_constant.ZHIHU_URL + final_uri, headers=headers) + + async def pong(self) -> bool: + """ + 用于检查登录态是否失效了 + Returns: + + """ + utils.logger.info("[ZhiHuClient.pong] Begin to pong zhihu...") + ping_flag = False + try: + res = await self.get_current_user_info() + if res.get("uid") and res.get("name"): + ping_flag = True + utils.logger.info("[ZhiHuClient.pong] Ping zhihu successfully") + else: + utils.logger.error(f"[ZhiHuClient.pong] Ping zhihu failed, response data: {res}") + except Exception as e: + utils.logger.error(f"[ZhiHuClient.pong] Ping zhihu failed: {e}, and try to login again...") + ping_flag = False + return ping_flag + + async def update_cookies(self, browser_context: BrowserContext): + """ + API客户端提供的更新cookies方法,一般情况下登录成功后会调用此方法 + Args: + browser_context: 浏览器上下文对象 + + Returns: + + """ + cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies()) + self.default_headers["cookie"] = cookie_str + self.cookie_dict = cookie_dict + + async def get_current_user_info(self) -> Dict: + """ + 获取当前登录用户信息 + Returns: + + """ + params = { + "include": "email,is_active,is_bind_phone" + } + return await self.get("/api/v4/me", params) + + async def get_note_by_keyword( + self, keyword: str, + page: int = 1, + page_size: int = 20, + sort: SearchSort = SearchSort.DEFAULT, + note_type: SearchType = SearchType.DEFAULT, + search_time: SearchTime = SearchTime.DEFAULT + ) -> List[ZhihuContent]: + """ + 根据关键词搜索 + Args: + keyword: 关键词 + page: 第几页 + page_size: 分页size + sort: 排序 + note_type: 搜索结果类型 + search_time: 搜索多久时间的结果 + + Returns: + + """ + uri = "/api/v4/search_v3" + params = { + "gk_version": "gz-gaokao", + "t": "general", + "q": keyword, + "correction": 1, + "offset": (page - 1) * page_size, + "limit": page_size, + "filter_fields": "", + "lc_idx": (page - 1) * page_size, + "show_all_topics": 0, + "search_source": "Filter", + "time_interval": search_time.value, + "sort": sort.value, + "vertical": note_type.value, + } + search_res = await self.get(uri, params) + utils.logger.info(f"[ZhiHuClient.get_note_by_keyword] Search result: {search_res}") + return self._extractor.extract_contents(search_res) + + async def get_root_comments(self, content_id: str, content_type: str, offset: str = "", limit: int = 10, + order_by: str = "sort") -> Dict: + """ + 获取内容的一级评论 + Args: + content_id: 内容ID + content_type: 内容类型(answer, article, zvideo) + offset: + limit: + order_by: + + Returns: + + """ + uri = f"/api/v4/{content_type}s/{content_id}/root_comments" + params = { + "order": order_by, + "offset": offset, + "limit": limit + } + return await self.get(uri, params) + + async def get_child_comments(self, root_comment_id: str, offset: str = "", limit: int = 10, + order_by: str = "sort") -> Dict: + """ + 获取一级评论下的子评论 + Args: + root_comment_id: + offset: + limit: + order_by: + + Returns: + + """ + uri = f"/api/v4/comment_v5/comment/{root_comment_id}/child_comment" + params = { + "order": order_by, + "offset": offset, + "limit": limit + } + return await self.get(uri, params) + + async def get_note_all_comments(self, content: ZhihuContent, crawl_interval: float = 1.0, + callback: Optional[Callable] = None) -> List[ZhihuComment]: + """ + 获取指定帖子下的所有一级评论,该方法会一直查找一个帖子下的所有评论信息 + Args: + content: 内容详情对象(问题|文章|视频) + crawl_interval: 爬取一次笔记的延迟单位(秒) + callback: 一次笔记爬取结束后 + + Returns: + + """ + result: List[ZhihuComment] = [] + is_end: bool = False + offset: str = "" + limit: int = 10 + while not 
is_end: + root_comment_res = await self.get_root_comments(content.content_id, content.content_type, offset, limit) + if not root_comment_res: + break + paging_info = root_comment_res.get("paging", {}) + is_end = paging_info.get("is_end") + offset = self._extractor.extract_offset(paging_info) + comments = self._extractor.extract_comments(content, root_comment_res.get("data")) + + if not comments: + break + + if callback: + await callback(comments) + + result.extend(comments) + await self.get_comments_all_sub_comments(content, comments, crawl_interval=crawl_interval, callback=callback) + await asyncio.sleep(crawl_interval) + return result + + async def get_comments_all_sub_comments(self, content: ZhihuContent, comments: List[ZhihuComment], crawl_interval: float = 1.0, + callback: Optional[Callable] = None) -> List[ZhihuComment]: + """ + 获取指定评论下的所有子评论 + Args: + content: 内容详情对象(问题|文章|视频) + comments: 评论列表 + crawl_interval: 爬取一次笔记的延迟单位(秒) + callback: 一次笔记爬取结束后 + + Returns: + + """ + if not config.ENABLE_GET_SUB_COMMENTS: + return [] + + all_sub_comments: List[ZhihuComment] = [] + for parment_comment in comments: + if parment_comment.sub_comment_count == 0: + continue + + is_end: bool = False + offset: str = "" + limit: int = 10 + while not is_end: + child_comment_res = await self.get_child_comments(parment_comment.comment_id, offset, limit) + if not child_comment_res: + break + paging_info = child_comment_res.get("paging", {}) + is_end = paging_info.get("is_end") + offset = self._extractor.extract_offset(paging_info) + sub_comments = self._extractor.extract_comments(content, child_comment_res.get("data")) + + if not sub_comments: + break + + if callback: + await callback(sub_comments) + + all_sub_comments.extend(sub_comments) + await asyncio.sleep(crawl_interval) + return all_sub_comments diff --git a/media_platform/zhihu/core.py b/media_platform/zhihu/core.py new file mode 100644 index 0000000..2593133 --- /dev/null +++ b/media_platform/zhihu/core.py @@ -0,0 +1,236 @@ +# -*- coding: utf-8 -*- +import asyncio +import os +import random +from asyncio import Task +from typing import Dict, List, Optional, Tuple + +from playwright.async_api import (BrowserContext, BrowserType, Page, + async_playwright) + +import config +from base.base_crawler import AbstractCrawler +from model.m_zhihu import ZhihuContent +from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool +from store import zhihu as zhihu_store +from tools import utils +from var import crawler_type_var, source_keyword_var + +from .client import ZhiHuClient +from .exception import DataFetchError +from .help import ZhiHuJsonExtractor +from .login import ZhiHuLogin + + +class ZhihuCrawler(AbstractCrawler): + context_page: Page + zhihu_client: ZhiHuClient + browser_context: BrowserContext + + def __init__(self) -> None: + self.index_url = "https://www.zhihu.com" + # self.user_agent = utils.get_user_agent() + self.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36" + self._extractor = ZhiHuJsonExtractor() + + async def start(self) -> None: + """ + Start the crawler + Returns: + + """ + playwright_proxy_format, httpx_proxy_format = None, None + if config.ENABLE_IP_PROXY: + ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True) + ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy() + playwright_proxy_format, httpx_proxy_format = self.format_proxy_info(ip_proxy_info) + + async with async_playwright() as playwright: + # Launch a 
browser context. + chromium = playwright.chromium + self.browser_context = await self.launch_browser( + chromium, + None, + self.user_agent, + headless=config.HEADLESS + ) + # stealth.min.js is a js script to prevent the website from detecting the crawler. + await self.browser_context.add_init_script(path="libs/stealth.min.js") + + self.context_page = await self.browser_context.new_page() + await self.context_page.goto(self.index_url, wait_until="domcontentloaded") + + # Create a client to interact with the zhihu website. + self.zhihu_client = await self.create_zhihu_client(httpx_proxy_format) + if not await self.zhihu_client.pong(): + login_obj = ZhiHuLogin( + login_type=config.LOGIN_TYPE, + login_phone="", # input your phone number + browser_context=self.browser_context, + context_page=self.context_page, + cookie_str=config.COOKIES + ) + await login_obj.begin() + await self.zhihu_client.update_cookies(browser_context=self.browser_context) + + crawler_type_var.set(config.CRAWLER_TYPE) + if config.CRAWLER_TYPE == "search": + # Search for notes and retrieve their comment information. + await self.search() + elif config.CRAWLER_TYPE == "detail": + # Get the information and comments of the specified post + raise NotImplementedError + elif config.CRAWLER_TYPE == "creator": + # Get creator's information and their notes and comments + raise NotImplementedError + else: + pass + + utils.logger.info("[ZhihuCrawler.start] Zhihu Crawler finished ...") + + async def search(self) -> None: + """Search for notes and retrieve their comment information.""" + utils.logger.info("[ZhihuCrawler.search] Begin search zhihu keywords") + zhihu_limit_count = 20 # zhihu limit page fixed value + if config.CRAWLER_MAX_NOTES_COUNT < zhihu_limit_count: + config.CRAWLER_MAX_NOTES_COUNT = zhihu_limit_count + start_page = config.START_PAGE + for keyword in config.KEYWORDS.split(","): + source_keyword_var.set(keyword) + utils.logger.info(f"[ZhihuCrawler.search] Current search keyword: {keyword}") + page = 1 + while (page - start_page + 1) * zhihu_limit_count <= config.CRAWLER_MAX_NOTES_COUNT: + if page < start_page: + utils.logger.info(f"[ZhihuCrawler.search] Skip page {page}") + page += 1 + continue + + try: + utils.logger.info(f"[ZhihuCrawler.search] search zhihu keyword: {keyword}, page: {page}") + content_list: List[ZhihuContent] = await self.zhihu_client.get_note_by_keyword( + keyword=keyword, + page=page, + ) + utils.logger.info(f"[ZhihuCrawler.search] Search contents :{content_list}") + if not content_list: + utils.logger.info("No more content!") + break + + page += 1 + for content in content_list: + await zhihu_store.update_zhihu_content(content) + + await self.batch_get_content_comments(content_list) + except DataFetchError: + utils.logger.error("[ZhihuCrawler.search] Search content error") + return + + async def batch_get_content_comments(self, content_list: List[ZhihuContent]): + """ + Batch get content comments + Args: + content_list: + + Returns: + + """ + if not config.ENABLE_GET_COMMENTS: + utils.logger.info(f"[ZhihuCrawler.batch_get_content_comments] Crawling comment mode is not enabled") + return + + semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) + task_list: List[Task] = [] + for content_item in content_list: + task = asyncio.create_task(self.get_comments(content_item, semaphore), name=content_item.content_id) + task_list.append(task) + await asyncio.gather(*task_list) + + async def get_comments(self, content_item: ZhihuContent, semaphore: asyncio.Semaphore): + """ + Get note comments with 
keyword filtering and quantity limitation + Args: + content_item: + semaphore: + + Returns: + + """ + async with semaphore: + utils.logger.info(f"[ZhihuCrawler.get_comments] Begin get note id comments {content_item.content_id}") + await self.zhihu_client.get_note_all_comments( + content=content_item, + crawl_interval=random.random(), + callback=zhihu_store.batch_update_zhihu_note_comments + ) + + @staticmethod + def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]: + """format proxy info for playwright and httpx""" + playwright_proxy = { + "server": f"{ip_proxy_info.protocol}{ip_proxy_info.ip}:{ip_proxy_info.port}", + "username": ip_proxy_info.user, + "password": ip_proxy_info.password, + } + httpx_proxy = { + f"{ip_proxy_info.protocol}": f"http://{ip_proxy_info.user}:{ip_proxy_info.password}@{ip_proxy_info.ip}:{ip_proxy_info.port}" + } + return playwright_proxy, httpx_proxy + + async def create_zhihu_client(self, httpx_proxy: Optional[str]) -> ZhiHuClient: + """Create zhihu client""" + utils.logger.info("[ZhihuCrawler.create_zhihu_client] Begin create zhihu API client ...") + cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies()) + zhihu_client_obj = ZhiHuClient( + proxies=httpx_proxy, + headers={ + 'accept': '*/*', + 'accept-language': 'zh-CN,zh;q=0.9', + 'cookie': cookie_str, + 'priority': 'u=1, i', + 'referer': 'https://www.zhihu.com/search?q=python&time_interval=a_year&type=content', + 'user-agent': self.user_agent, + 'x-api-version': '3.0.91', + 'x-app-za': 'OS=Web', + 'x-requested-with': 'fetch', + 'x-zse-93': '101_3_3.0', + }, + playwright_page=self.context_page, + cookie_dict=cookie_dict, + ) + return zhihu_client_obj + + async def launch_browser( + self, + chromium: BrowserType, + playwright_proxy: Optional[Dict], + user_agent: Optional[str], + headless: bool = True + ) -> BrowserContext: + """Launch browser and create browser context""" + utils.logger.info("[ZhihuCrawler.launch_browser] Begin create browser context ...") + if config.SAVE_LOGIN_STATE: + # feat issue #14 + # we will save login state to avoid login every time + user_data_dir = os.path.join(os.getcwd(), "browser_data", + config.USER_DATA_DIR % config.PLATFORM) # type: ignore + browser_context = await chromium.launch_persistent_context( + user_data_dir=user_data_dir, + accept_downloads=True, + headless=headless, + proxy=playwright_proxy, # type: ignore + viewport={"width": 1920, "height": 1080}, + user_agent=user_agent + ) + return browser_context + else: + browser = await chromium.launch(headless=headless, proxy=playwright_proxy) # type: ignore + browser_context = await browser.new_context( + viewport={"width": 1920, "height": 1080}, + user_agent=user_agent + ) + return browser_context + + async def close(self): + """Close browser context""" + await self.browser_context.close() + utils.logger.info("[ZhihuCrawler.close] Browser context closed ...") diff --git a/media_platform/zhihu/exception.py b/media_platform/zhihu/exception.py new file mode 100644 index 0000000..cb24176 --- /dev/null +++ b/media_platform/zhihu/exception.py @@ -0,0 +1,12 @@ +from httpx import RequestError + + +class DataFetchError(RequestError): + """something error when fetch""" + + +class IPBlockError(RequestError): + """fetch so fast that the server block us ip""" + +class ForbiddenError(RequestError): + """Forbidden""" \ No newline at end of file diff --git a/media_platform/zhihu/field.py b/media_platform/zhihu/field.py new file mode 100644 index 0000000..1403ff7 --- /dev/null 
+++ b/media_platform/zhihu/field.py @@ -0,0 +1,36 @@ +from enum import Enum +from typing import NamedTuple + +from constant import zhihu as zhihu_constant + + +class SearchTime(Enum): + """ + 搜索时间范围 + """ + DEFAULT = "" # 不限时间 + ONE_DAY = "a_day" # 一天内 + ONE_WEEK = "a_week" # 一周内 + ONE_MONTH = "a_month" # 一个月内 + THREE_MONTH = "three_months" # 三个月内 + HALF_YEAR = "half_a_year" # 半年内 + ONE_YEAR = "a_year" # 一年内 + + +class SearchType(Enum): + """ + 搜索结果类型 + """ + DEFAULT = "" # 不限类型 + ANSWER = zhihu_constant.ANSWER_NAME # 只看回答 + ARTICLE = zhihu_constant.ARTICLE_NAME # 只看文章 + VIDEO = zhihu_constant.VIDEO_NAME # 只看视频 + + +class SearchSort(Enum): + """ + 搜索结果排序 + """ + DEFAULT = "" # 综合排序 + UPVOTED_COUNT = "upvoted_count" # 最多赞同 + CREATE_TIME = "created_time" # 最新发布 diff --git a/media_platform/zhihu/help.py b/media_platform/zhihu/help.py new file mode 100644 index 0000000..e9e929a --- /dev/null +++ b/media_platform/zhihu/help.py @@ -0,0 +1,256 @@ +# -*- coding: utf-8 -*- +from typing import Dict, List +from urllib.parse import parse_qs, urlparse + +import execjs + +from constant import zhihu as zhihu_constant +from model.m_zhihu import ZhihuComment, ZhihuContent, ZhihuCreator +from tools.crawler_util import extract_text_from_html + +ZHIHU_SGIN_JS = None + + +def sign(url: str, cookies: str) -> Dict: + """ + zhihu sign algorithm + Args: + url: request url with query string + cookies: request cookies with d_c0 key + + Returns: + + """ + global ZHIHU_SGIN_JS + if not ZHIHU_SGIN_JS: + with open("libs/zhihu.js", "r") as f: + ZHIHU_SGIN_JS = execjs.compile(f.read()) + + return ZHIHU_SGIN_JS.call("get_sign", url, cookies) + + +class ZhiHuJsonExtractor: + def __init__(self): + pass + + def extract_contents(self, json_data: Dict) -> List[ZhihuContent]: + """ + extract zhihu contents + Args: + json_data: zhihu json data + + Returns: + + """ + if not json_data: + return [] + + result: List[ZhihuContent] = [] + search_result: List[Dict] = json_data.get("data", []) + search_result = [s_item for s_item in search_result if s_item.get("type") in ['search_result', 'zvideo']] + for sr_item in search_result: + sr_object: Dict = sr_item.get("object", {}) + if sr_object.get("type") == zhihu_constant.ANSWER_NAME: + result.append(self._extract_answer_content(sr_object)) + elif sr_object.get("type") == zhihu_constant.ARTICLE_NAME: + result.append(self._extract_article_content(sr_object)) + elif sr_object.get("type") == zhihu_constant.VIDEO_NAME: + result.append(self._extract_zvideo_content(sr_object)) + else: + continue + + return result + + def _extract_answer_content(self, answer: Dict) -> ZhihuContent: + """ + extract zhihu answer content + Args: + answer: zhihu answer + + Returns: + """ + res = ZhihuContent() + res.content_id = answer.get("id") + res.content_type = answer.get("type") + res.content_text = extract_text_from_html(answer.get("content")) + res.question_id = answer.get("question").get("id") + res.content_url = f"{zhihu_constant.ZHIHU_URL}/question/{res.question_id}/answer/{res.content_id}" + res.title = extract_text_from_html(answer.get("title")) + res.desc = extract_text_from_html(answer.get("description")) + res.created_time = answer.get("created_time") + res.updated_time = answer.get("updated_time") + res.voteup_count = answer.get("voteup_count") + res.comment_count = answer.get("comment_count") + + # extract author info + author_info = self._extract_author(answer.get("author")) + res.user_id = author_info.user_id + res.user_link = author_info.user_link + res.user_nickname = author_info.user_nickname + 
res.user_avatar = author_info.user_avatar + return res + + def _extract_article_content(self, article: Dict) -> ZhihuContent: + """ + extract zhihu article content + Args: + article: zhihu article + + Returns: + + """ + res = ZhihuContent() + res.content_id = article.get("id") + res.content_type = article.get("type") + res.content_text = extract_text_from_html(article.get("content")) + res.content_url = f"{zhihu_constant.ZHIHU_URL}/p/{res.content_id}" + res.title = extract_text_from_html(article.get("title")) + res.desc = extract_text_from_html(article.get("excerpt")) + res.created_time = article.get("created_time") + res.updated_time = article.get("updated_time") + res.voteup_count = article.get("voteup_count") + res.comment_count = article.get("comment_count") + + # extract author info + author_info = self._extract_author(article.get("author")) + res.user_id = author_info.user_id + res.user_link = author_info.user_link + res.user_nickname = author_info.user_nickname + res.user_avatar = author_info.user_avatar + return res + + def _extract_zvideo_content(self, zvideo: Dict) -> ZhihuContent: + """ + extract zhihu zvideo content + Args: + zvideo: + + Returns: + + """ + res = ZhihuContent() + res.content_id = zvideo.get("zvideo_id") + res.content_type = zvideo.get("type") + res.content_url = zvideo.get("video_url") + res.title = extract_text_from_html(zvideo.get("title")) + res.desc = extract_text_from_html(zvideo.get("description")) + res.created_time = zvideo.get("created_at") + res.voteup_count = zvideo.get("voteup_count") + res.comment_count = zvideo.get("comment_count") + + # extract author info + author_info = self._extract_author(zvideo.get("author")) + res.user_id = author_info.user_id + res.user_link = author_info.user_link + res.user_nickname = author_info.user_nickname + res.user_avatar = author_info.user_avatar + return res + + @staticmethod + def _extract_author(author: Dict) -> ZhihuCreator: + """ + extract zhihu author + Args: + author: + + Returns: + + """ + res = ZhihuCreator() + if not author: + return res + if not author.get("id"): + author = author.get("member") + res.user_id = author.get("id") + res.user_link = f"{zhihu_constant.ZHIHU_URL}/people/{author.get('url_token')}" + res.user_nickname = author.get("name") + res.user_avatar = author.get("avatar_url") + return res + + def extract_comments(self, page_content: ZhihuContent, comments: List[Dict]) -> List[ZhihuComment]: + """ + extract zhihu comments + Args: + page_content: zhihu content object + comments: zhihu comments + + Returns: + + """ + if not comments: + return [] + res: List[ZhihuComment] = [] + for comment in comments: + if comment.get("type") != "comment": + continue + res.append(self._extract_comment(page_content, comment)) + return res + + def _extract_comment(self, page_content: ZhihuContent, comment: Dict) -> ZhihuComment: + """ + extract zhihu comment + Args: + page_content: comment with content object + comment: zhihu comment + + Returns: + + """ + res = ZhihuComment() + res.comment_id = str(comment.get("id", "")) + res.parent_comment_id = comment.get("reply_comment_id") + res.content = extract_text_from_html(comment.get("content")) + res.publish_time = comment.get("created_time") + res.ip_location = self._extract_comment_ip_location(comment.get("comment_tag", [])) + res.sub_comment_count = comment.get("child_comment_count") + res.like_count = comment.get("like_count") if comment.get("like_count") else 0 + res.dislike_count = comment.get("dislike_count") if comment.get("dislike_count") else 0 + 
res.content_id = page_content.content_id + res.content_type = page_content.content_type + + # extract author info + author_info = self._extract_author(comment.get("author")) + res.user_id = author_info.user_id + res.user_link = author_info.user_link + res.user_nickname = author_info.user_nickname + res.user_avatar = author_info.user_avatar + return res + + @staticmethod + def _extract_comment_ip_location(comment_tags: List[Dict]) -> str: + """ + extract comment ip location + Args: + comment_tags: + + Returns: + + """ + if not comment_tags: + return "" + + for ct in comment_tags: + if ct.get("type") == "ip_info": + return ct.get("text") + + return "" + + @staticmethod + def extract_offset(paging_info: Dict) -> str: + """ + extract offset + Args: + paging_info: + + Returns: + + """ + # https://www.zhihu.com/api/v4/comment_v5/zvideos/1424368906836807681/root_comment?limit=10&offset=456770961_10125996085_0&order_by=score + next_url = paging_info.get("next") + if not next_url: + return "" + + parsed_url = urlparse(next_url) + query_params = parse_qs(parsed_url.query) + offset = query_params.get('offset', [""])[0] + return offset diff --git a/media_platform/zhihu/login.py b/media_platform/zhihu/login.py new file mode 100644 index 0000000..db9c8a7 --- /dev/null +++ b/media_platform/zhihu/login.py @@ -0,0 +1,104 @@ +# -*- coding: utf-8 -*- +import asyncio +import functools +import sys +from typing import Optional + +from playwright.async_api import BrowserContext, Page +from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt, + wait_fixed) + +import config +from base.base_crawler import AbstractLogin +from tools import utils + + +class ZhiHuLogin(AbstractLogin): + + def __init__(self, + login_type: str, + browser_context: BrowserContext, + context_page: Page, + login_phone: Optional[str] = "", + cookie_str: str = "" + ): + config.LOGIN_TYPE = login_type + self.browser_context = browser_context + self.context_page = context_page + self.login_phone = login_phone + self.cookie_str = cookie_str + + @retry(stop=stop_after_attempt(600), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False)) + async def check_login_state(self) -> bool: + """ + Check if the current login status is successful and return True otherwise return False + Returns: + + """ + current_cookie = await self.browser_context.cookies() + _, cookie_dict = utils.convert_cookies(current_cookie) + current_web_session = cookie_dict.get("z_c0") + if current_web_session: + return True + return False + + async def begin(self): + """Start login zhihu""" + utils.logger.info("[ZhiHu.begin] Begin login zhihu ...") + if config.LOGIN_TYPE == "qrcode": + await self.login_by_qrcode() + elif config.LOGIN_TYPE == "phone": + await self.login_by_mobile() + elif config.LOGIN_TYPE == "cookie": + await self.login_by_cookies() + else: + raise ValueError("[ZhiHu.begin]I nvalid Login Type Currently only supported qrcode or phone or cookies ...") + + async def login_by_mobile(self): + """Login zhihu by mobile""" + # todo implement login by mobile + + async def login_by_qrcode(self): + """login zhihu website and keep webdriver login state""" + utils.logger.info("[ZhiHu.login_by_qrcode] Begin login zhihu by qrcode ...") + qrcode_img_selector = "canvas.Qrcode-qrcode" + # find login qrcode + base64_qrcode_img = await utils.find_qrcode_img_from_canvas( + self.context_page, + canvas_selector=qrcode_img_selector + ) + if not base64_qrcode_img: + utils.logger.info("[ZhiHu.login_by_qrcode] login failed , have not found qrcode please 
check ....") + if not base64_qrcode_img: + sys.exit() + + # show login qrcode + # fix issue #12 + # we need to use partial function to call show_qrcode function and run in executor + # then current asyncio event loop will not be blocked + partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img) + asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode) + + utils.logger.info(f"[ZhiHu.login_by_qrcode] waiting for scan code login, remaining time is 120s") + try: + await self.check_login_state() + + except RetryError: + utils.logger.info("[ZhiHu.login_by_qrcode] Login zhihu failed by qrcode login method ...") + sys.exit() + + wait_redirect_seconds = 5 + utils.logger.info( + f"[ZhiHu.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...") + await asyncio.sleep(wait_redirect_seconds) + + async def login_by_cookies(self): + """login zhihu website by cookies""" + utils.logger.info("[ZhiHu.login_by_cookies] Begin login zhihu by cookie ...") + for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items(): + await self.browser_context.add_cookies([{ + 'name': key, + 'value': value, + 'domain': ".zhihu.com", + 'path': "/" + }]) diff --git a/model/m_zhihu.py b/model/m_zhihu.py new file mode 100644 index 0000000..0dc6b6a --- /dev/null +++ b/model/m_zhihu.py @@ -0,0 +1,63 @@ +# -*- coding: utf-8 -*- +from typing import Optional + +from pydantic import BaseModel, Field + + +class ZhihuContent(BaseModel): + """ + 知乎内容(回答、文章、视频) + """ + content_id: str = Field(default="", description="内容ID") + content_type: str = Field(default="", description="内容类型(article | answer | zvideo)") + content_text: str = Field(default="", description="内容文本, 如果是视频类型这里为空") + content_url: str = Field(default="", description="内容落地链接") + question_id: str = Field(default="", description="问题ID, type为answer时有值") + title: str = Field(default="", description="内容标题") + desc: str = Field(default="", description="内容描述") + created_time: int = Field(default="", description="创建时间") + updated_time: int = Field(default="", description="更新时间") + voteup_count: int = Field(default=0, description="赞同人数") + comment_count: int = Field(default=0, description="评论数量") + source_keyword: str = Field(default="", description="来源关键词") + + user_id: str = Field(default="", description="用户ID") + user_link: str = Field(default="", description="用户主页链接") + user_nickname: str = Field(default="", description="用户昵称") + user_avatar: str = Field(default="", description="用户头像地址") + + +class ZhihuComment(BaseModel): + """ + 知乎评论 + """ + + comment_id: str = Field(default="", description="评论ID") + parent_comment_id: str = Field(default="", description="父评论ID") + content: str = Field(default="", description="评论内容") + publish_time: int = Field(default=0, description="发布时间") + ip_location: Optional[str] = Field(default="", description="IP地理位置") + sub_comment_count: int = Field(default=0, description="子评论数") + like_count: int = Field(default=0, description="点赞数") + dislike_count: int = Field(default=0, description="踩数") + content_id: str = Field(default="", description="内容ID") + content_type: str = Field(default="", description="内容类型(article | answer | zvideo)") + + user_id: str = Field(default="", description="用户ID") + user_link: str = Field(default="", description="用户主页链接") + user_nickname: str = Field(default="", description="用户昵称") + user_avatar: str = Field(default="", description="用户头像地址") + + +class ZhihuCreator(BaseModel): + """ + 知乎创作者 + """ + user_id: str = 
Field(default="", description="用户ID") + user_link: str = Field(default="", description="用户主页链接") + user_nickname: str = Field(default="", description="用户昵称") + user_avatar: str = Field(default="", description="用户头像地址") + gender: str = Field(default="", description="用户性别") + ip_location: Optional[str] = Field(default="", description="IP地理位置") + follows: int = Field(default=0, description="关注数") + fans: int = Field(default=0, description="粉丝数") diff --git a/schema/tables.sql b/schema/tables.sql index d23e8d8..6e304f5 100644 --- a/schema/tables.sql +++ b/schema/tables.sql @@ -453,4 +453,72 @@ CREATE TABLE `tieba_creator` `fans` varchar(16) DEFAULT NULL COMMENT '粉丝数', `registration_duration` varchar(16) DEFAULT NULL COMMENT '吧龄', PRIMARY KEY (`id`) -) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='贴吧创作者'; \ No newline at end of file +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='贴吧创作者'; + + +CREATE TABLE `zhihu_content` ( + `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', + `content_id` varchar(64) NOT NULL COMMENT '内容ID', + `content_type` varchar(16) NOT NULL COMMENT '内容类型(article | answer | zvideo)', + `content_text` longtext COMMENT '内容文本, 如果是视频类型这里为空', + `content_url` varchar(255) NOT NULL COMMENT '内容落地链接', + `question_id` varchar(64) DEFAULT NULL COMMENT '问题ID, type为answer时有值', + `title` varchar(255) NOT NULL COMMENT '内容标题', + `desc` longtext COMMENT '内容描述', + `created_time` varchar(32) NOT NULL COMMENT '创建时间', + `updated_time` varchar(32) NOT NULL COMMENT '更新时间', + `voteup_count` int NOT NULL DEFAULT '0' COMMENT '赞同人数', + `comment_count` int NOT NULL DEFAULT '0' COMMENT '评论数量', + `source_keyword` varchar(64) DEFAULT NULL COMMENT '来源关键词', + `user_id` varchar(64) NOT NULL COMMENT '用户ID', + `user_link` varchar(255) NOT NULL COMMENT '用户主页链接', + `user_nickname` varchar(64) NOT NULL COMMENT '用户昵称', + `user_avatar` varchar(255) NOT NULL COMMENT '用户头像地址', + `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', + `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', + PRIMARY KEY (`id`), + KEY `idx_zhihu_content_content_id` (`content_id`), + KEY `idx_zhihu_content_created_time` (`created_time`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='知乎内容(回答、文章、视频)'; + + +CREATE TABLE `zhihu_comment` ( + `id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', + `comment_id` varchar(64) NOT NULL COMMENT '评论ID', + `parent_comment_id` varchar(64) DEFAULT NULL COMMENT '父评论ID', + `content` text NOT NULL COMMENT '评论内容', + `publish_time` varchar(32) NOT NULL COMMENT '发布时间', + `ip_location` varchar(64) DEFAULT NULL COMMENT 'IP地理位置', + `sub_comment_count` int NOT NULL DEFAULT '0' COMMENT '子评论数', + `like_count` int NOT NULL DEFAULT '0' COMMENT '点赞数', + `dislike_count` int NOT NULL DEFAULT '0' COMMENT '踩数', + `content_id` varchar(64) NOT NULL COMMENT '内容ID', + `content_type` varchar(16) NOT NULL COMMENT '内容类型(article | answer | zvideo)', + `user_id` varchar(64) NOT NULL COMMENT '用户ID', + `user_link` varchar(255) NOT NULL COMMENT '用户主页链接', + `user_nickname` varchar(64) NOT NULL COMMENT '用户昵称', + `user_avatar` varchar(255) NOT NULL COMMENT '用户头像地址', + `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', + `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', + PRIMARY KEY (`id`), + KEY `idx_zhihu_comment_comment_id` (`comment_id`), + KEY `idx_zhihu_comment_content_id` (`content_id`), + KEY `idx_zhihu_comment_publish_time` (`publish_time`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='知乎评论'; + + +CREATE TABLE `zhihu_creator` ( + 
`id` int NOT NULL AUTO_INCREMENT COMMENT '自增ID', + `user_id` varchar(64) NOT NULL COMMENT '用户ID', + `user_link` varchar(255) NOT NULL COMMENT '用户主页链接', + `user_nickname` varchar(64) NOT NULL COMMENT '用户昵称', + `user_avatar` varchar(255) NOT NULL COMMENT '用户头像地址', + `gender` varchar(16) DEFAULT NULL COMMENT '用户性别', + `ip_location` varchar(64) DEFAULT NULL COMMENT 'IP地理位置', + `follows` int NOT NULL DEFAULT '0' COMMENT '关注数', + `fans` int NOT NULL DEFAULT '0' COMMENT '粉丝数', + `add_ts` bigint NOT NULL COMMENT '记录添加时间戳', + `last_modify_ts` bigint NOT NULL COMMENT '记录最后修改时间戳', + PRIMARY KEY (`id`), + UNIQUE KEY `idx_zhihu_creator_user_id` (`user_id`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='知乎创作者'; diff --git a/store/zhihu/__init__.py b/store/zhihu/__init__.py new file mode 100644 index 0000000..fc944d7 --- /dev/null +++ b/store/zhihu/__init__.py @@ -0,0 +1,73 @@ +# -*- coding: utf-8 -*- +from typing import List + +import config +from base.base_crawler import AbstractStore +from model.m_zhihu import ZhihuComment, ZhihuContent +from store.zhihu.zhihu_store_impl import (ZhihuCsvStoreImplement, + ZhihuDbStoreImplement, + ZhihuJsonStoreImplement) +from tools import utils +from var import source_keyword_var + + +class ZhihuStoreFactory: + STORES = { + "csv": ZhihuCsvStoreImplement, + "db": ZhihuDbStoreImplement, + "json": ZhihuJsonStoreImplement + } + + @staticmethod + def create_store() -> AbstractStore: + store_class = ZhihuStoreFactory.STORES.get(config.SAVE_DATA_OPTION) + if not store_class: + raise ValueError("[ZhihuStoreFactory.create_store] Invalid save option only supported csv or db or json ...") + return store_class() + +async def update_zhihu_content(content_item: ZhihuContent): + """ + 更新知乎内容 + Args: + content_item: + + Returns: + + """ + content_item.source_keyword = source_keyword_var.get() + local_db_item = content_item.model_dump() + local_db_item.update({"last_modify_ts": utils.get_current_timestamp()}) + utils.logger.info(f"[store.zhihu.update_zhihu_content] zhihu content: {local_db_item}") + await ZhihuStoreFactory.create_store().store_content(local_db_item) + + + +async def batch_update_zhihu_note_comments(comments: List[ZhihuComment]): + """ + 批量更新知乎内容评论 + Args: + comments: + + Returns: + + """ + if not comments: + return + + for comment_item in comments: + await update_zhihu_content_comment(comment_item) + + +async def update_zhihu_content_comment(comment_item: ZhihuComment): + """ + 更新知乎内容评论 + Args: + comment_item: + + Returns: + + """ + local_db_item = comment_item.model_dump() + local_db_item.update({"last_modify_ts": utils.get_current_timestamp()}) + utils.logger.info(f"[store.zhihu.update_zhihu_note_comment] zhihu content comment:{local_db_item}") + await ZhihuStoreFactory.create_store().store_comment(local_db_item) diff --git a/store/zhihu/zhihu_store_impl.py b/store/zhihu/zhihu_store_impl.py new file mode 100644 index 0000000..15eec8a --- /dev/null +++ b/store/zhihu/zhihu_store_impl.py @@ -0,0 +1,245 @@ +# -*- coding: utf-8 -*- +import asyncio +import csv +import json +import os +import pathlib +from typing import Dict + +import aiofiles + +import config +from base.base_crawler import AbstractStore +from tools import utils, words +from var import crawler_type_var + + +def calculate_number_of_files(file_store_path: str) -> int: + """计算数据保存文件的前部分排序数字,支持每次运行代码不写到同一个文件中 + Args: + file_store_path; + Returns: + file nums + """ + if not os.path.exists(file_store_path): + return 1 + try: + return max([int(file_name.split("_")[0]) for file_name 
in os.listdir(file_store_path)]) + 1 + except ValueError: + return 1 + + +class ZhihuCsvStoreImplement(AbstractStore): + csv_store_path: str = "data/zhihu" + file_count: int = calculate_number_of_files(csv_store_path) + + def make_save_file_name(self, store_type: str) -> str: + """ + make save file name by store type + Args: + store_type: contents or comments + + Returns: eg: data/zhihu/search_comments_20240114.csv ... + + """ + return f"{self.csv_store_path}/{self.file_count}_{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.csv" + + async def save_data_to_csv(self, save_item: Dict, store_type: str): + """ + Below is a simple way to save it in CSV format. + Args: + save_item: save content dict info + store_type: Save type contains content and comments(contents | comments) + + Returns: no returns + + """ + pathlib.Path(self.csv_store_path).mkdir(parents=True, exist_ok=True) + save_file_name = self.make_save_file_name(store_type=store_type) + async with aiofiles.open(save_file_name, mode='a+', encoding="utf-8-sig", newline="") as f: + f.fileno() + writer = csv.writer(f) + if await f.tell() == 0: + await writer.writerow(save_item.keys()) + await writer.writerow(save_item.values()) + + async def store_content(self, content_item: Dict): + """ + Zhihu content CSV storage implementation + Args: + content_item: note item dict + + Returns: + + """ + await self.save_data_to_csv(save_item=content_item, store_type="contents") + + async def store_comment(self, comment_item: Dict): + """ + Zhihu comment CSV storage implementation + Args: + comment_item: comment item dict + + Returns: + + """ + await self.save_data_to_csv(save_item=comment_item, store_type="comments") + + async def store_creator(self, creator: Dict): + """ + Zhihu content CSV storage implementation + Args: + creator: creator dict + + Returns: + + """ + await self.save_data_to_csv(save_item=creator, store_type="creator") + + +class ZhihuDbStoreImplement(AbstractStore): + async def store_content(self, content_item: Dict): + """ + Zhihu content DB storage implementation + Args: + content_item: content item dict + + Returns: + + """ + from .zhihu_store_sql import (add_new_content, + query_content_by_content_id, + update_content_by_content_id) + note_id = content_item.get("note_id") + note_detail: Dict = await query_content_by_content_id(content_id=note_id) + if not note_detail: + content_item["add_ts"] = utils.get_current_timestamp() + await add_new_content(content_item) + else: + await update_content_by_content_id(note_id, content_item=content_item) + + async def store_comment(self, comment_item: Dict): + """ + Zhihu content DB storage implementation + Args: + comment_item: comment item dict + + Returns: + + """ + from .zhihu_store_sql import (add_new_comment, + query_comment_by_comment_id, + update_comment_by_comment_id) + comment_id = comment_item.get("comment_id") + comment_detail: Dict = await query_comment_by_comment_id(comment_id=comment_id) + if not comment_detail: + comment_item["add_ts"] = utils.get_current_timestamp() + await add_new_comment(comment_item) + else: + await update_comment_by_comment_id(comment_id, comment_item=comment_item) + + async def store_creator(self, creator: Dict): + """ + Zhihu content DB storage implementation + Args: + creator: creator dict + + Returns: + + """ + from .zhihu_store_sql import (add_new_creator, + query_creator_by_user_id, + update_creator_by_user_id) + user_id = creator.get("user_id") + user_detail: Dict = await query_creator_by_user_id(user_id) + if not user_detail: + 
creator["add_ts"] = utils.get_current_timestamp() + await add_new_creator(creator) + else: + await update_creator_by_user_id(user_id, creator) + + +class ZhihuJsonStoreImplement(AbstractStore): + json_store_path: str = "data/zhihu/json" + words_store_path: str = "data/zhihu/words" + lock = asyncio.Lock() + file_count: int = calculate_number_of_files(json_store_path) + WordCloud = words.AsyncWordCloudGenerator() + + def make_save_file_name(self, store_type: str) -> (str, str): + """ + make save file name by store type + Args: + store_type: Save type contains content and comments(contents | comments) + + Returns: + + """ + + return ( + f"{self.json_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}.json", + f"{self.words_store_path}/{crawler_type_var.get()}_{store_type}_{utils.get_current_date()}" + ) + + async def save_data_to_json(self, save_item: Dict, store_type: str): + """ + Below is a simple way to save it in json format. + Args: + save_item: save content dict info + store_type: Save type contains content and comments(contents | comments) + + Returns: + + """ + pathlib.Path(self.json_store_path).mkdir(parents=True, exist_ok=True) + pathlib.Path(self.words_store_path).mkdir(parents=True, exist_ok=True) + save_file_name, words_file_name_prefix = self.make_save_file_name(store_type=store_type) + save_data = [] + + async with self.lock: + if os.path.exists(save_file_name): + async with aiofiles.open(save_file_name, 'r', encoding='utf-8') as file: + save_data = json.loads(await file.read()) + + save_data.append(save_item) + async with aiofiles.open(save_file_name, 'w', encoding='utf-8') as file: + await file.write(json.dumps(save_data, ensure_ascii=False, indent=4)) + + if config.ENABLE_GET_COMMENTS and config.ENABLE_GET_WORDCLOUD: + try: + await self.WordCloud.generate_word_frequency_and_cloud(save_data, words_file_name_prefix) + except: + pass + + async def store_content(self, content_item: Dict): + """ + content JSON storage implementation + Args: + content_item: + + Returns: + + """ + await self.save_data_to_json(content_item, "contents") + + async def store_comment(self, comment_item: Dict): + """ + comment JSON storage implementatio + Args: + comment_item: + + Returns: + + """ + await self.save_data_to_json(comment_item, "comments") + + async def store_creator(self, creator: Dict): + """ + Zhihu content JSON storage implementation + Args: + creator: creator dict + + Returns: + + """ + await self.save_data_to_json(creator, "creator") diff --git a/store/zhihu/zhihu_store_sql.py b/store/zhihu/zhihu_store_sql.py new file mode 100644 index 0000000..7036d89 --- /dev/null +++ b/store/zhihu/zhihu_store_sql.py @@ -0,0 +1,144 @@ +# -*- coding: utf-8 -*- +from typing import Dict, List + +from db import AsyncMysqlDB +from var import media_crawler_db_var + + +async def query_content_by_content_id(content_id: str) -> Dict: + """ + 查询一条内容记录(zhihu的帖子 | 抖音的视频 | 微博 | 快手视频 ...) + Args: + content_id: + + Returns: + + """ + async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() + sql: str = f"select * from zhihu_content where content_id = '{content_id}'" + rows: List[Dict] = await async_db_conn.query(sql) + if len(rows) > 0: + return rows[0] + return dict() + + +async def add_new_content(content_item: Dict) -> int: + """ + 新增一条内容记录(zhihu的帖子 | 抖音的视频 | 微博 | 快手视频 ...) 
+ Args: + content_item: + + Returns: + + """ + async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() + last_row_id: int = await async_db_conn.item_to_table("zhihu_content", content_item) + return last_row_id + + +async def update_content_by_content_id(content_id: str, content_item: Dict) -> int: + """ + 更新一条记录(zhihu的帖子 | 抖音的视频 | 微博 | 快手视频 ...) + Args: + content_id: + content_item: + + Returns: + + """ + async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() + effect_row: int = await async_db_conn.update_table("zhihu_content", content_item, "content_id", content_id) + return effect_row + + + +async def query_comment_by_comment_id(comment_id: str) -> Dict: + """ + 查询一条评论内容 + Args: + comment_id: + + Returns: + + """ + async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() + sql: str = f"select * from zhihu_comment where comment_id = '{comment_id}'" + rows: List[Dict] = await async_db_conn.query(sql) + if len(rows) > 0: + return rows[0] + return dict() + + +async def add_new_comment(comment_item: Dict) -> int: + """ + 新增一条评论记录 + Args: + comment_item: + + Returns: + + """ + async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() + last_row_id: int = await async_db_conn.item_to_table("zhihu_comment", comment_item) + return last_row_id + + +async def update_comment_by_comment_id(comment_id: str, comment_item: Dict) -> int: + """ + 更新增一条评论记录 + Args: + comment_id: + comment_item: + + Returns: + + """ + async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() + effect_row: int = await async_db_conn.update_table("zhihu_comment", comment_item, "comment_id", comment_id) + return effect_row + + +async def query_creator_by_user_id(user_id: str) -> Dict: + """ + 查询一条创作者记录 + Args: + user_id: + + Returns: + + """ + async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() + sql: str = f"select * from zhihu_creator where user_id = '{user_id}'" + rows: List[Dict] = await async_db_conn.query(sql) + if len(rows) > 0: + return rows[0] + return dict() + + +async def add_new_creator(creator_item: Dict) -> int: + """ + 新增一条创作者信息 + Args: + creator_item: + + Returns: + + """ + async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() + last_row_id: int = await async_db_conn.item_to_table("zhihu_creator", creator_item) + return last_row_id + + +async def update_creator_by_user_id(user_id: str, creator_item: Dict) -> int: + """ + 更新一条创作者信息 + Args: + user_id: + creator_item: + + Returns: + + """ + async_db_conn: AsyncMysqlDB = media_crawler_db_var.get() + effect_row: int = await async_db_conn.update_table("zhihu_creator", creator_item, "user_id", user_id) + return effect_row \ No newline at end of file diff --git a/tools/crawler_util.py b/tools/crawler_util.py index 568b221..816f104 100644 --- a/tools/crawler_util.py +++ b/tools/crawler_util.py @@ -4,6 +4,7 @@ # @Desc : 爬虫相关的工具函数 import base64 +import json import random import re from io import BytesIO @@ -39,6 +40,28 @@ async def find_login_qrcode(page: Page, selector: str) -> str: return "" +async def find_qrcode_img_from_canvas(page: Page, canvas_selector: str) -> str: + """ + find qrcode image from canvas element + Args: + page: + canvas_selector: + + Returns: + + """ + + # 等待Canvas元素加载完成 + canvas = await page.wait_for_selector(canvas_selector) + + # 截取Canvas元素的截图 + screenshot = await canvas.screenshot() + + # 将截图转换为base64格式 + base64_image = base64.b64encode(screenshot).decode('utf-8') + return base64_image + + def show_qrcode(qr_code) -> None: # type: ignore """parse base64 encode qrcode image and show it""" if "," in qr_code: @@ -147,8 +170,12 @@ def 
format_proxy_info(ip_proxy_info) -> Tuple[Optional[Dict], Optional[Dict]]: } return playwright_proxy, httpx_proxy + def extract_text_from_html(html: str) -> str: """Extract text from HTML, removing all tags.""" + if not html: + return "" + # Remove script and style elements clean_html = re.sub(r'<(script|style)[^>]*>.*?</\1>', '', html, flags=re.DOTALL) # Remove all other tags
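上面为 `extract_text_from_html` 增加了空值保护:知乎搜索结果中部分字段可能为空(例如视频类型没有正文),传入 None 或空字符串时会直接返回空串。一个最小的使用示意(假设函数其余的去标签逻辑保持不变):

```python
from tools.crawler_util import extract_text_from_html

print(extract_text_from_html("<p>知乎<b>回答</b>内容</p>"))  # 预期输出: 知乎回答内容
print(extract_text_from_html(None))                          # 增加空值保护后返回 ""
```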