From 94b5030ef0ae1dacdc6b024440dca92a35a25112 Mon Sep 17 00:00:00 2001 From: Relakkes Date: Mon, 4 Dec 2023 00:02:00 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20B=E7=AB=99=E4=BA=8C=E7=BB=B4=E7=A0=81?= =?UTF-8?q?=E3=80=81Cookie=E7=99=BB=E5=BD=95=E5=AE=9E=E7=8E=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 14 +++---- media_platform/bilibili/login.py | 67 ++++++++++++++++++++++++++++++-- media_platform/kuaishou/login.py | 2 +- 3 files changed, 72 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 0350cc4..6ee6ad9 100644 --- a/README.md +++ b/README.md @@ -4,8 +4,8 @@ # 仓库描述 -**小红书爬虫**,**抖音爬虫**, **快手爬虫**...。 -目前能抓取小红书、抖音、快手的视频、图片、评论、点赞、转发等信息。 +**小红书爬虫**,**抖音爬虫**, **快手爬虫**, **B站爬虫**...。 +目前能抓取小红书、抖音、快手、B站的视频、图片、评论、点赞、转发等信息。 原理:利用[playwright](https://playwright.dev/)搭桥,保留登录成功后的上下文浏览器环境,通过执行JS表达式获取一些加密参数 通过使用此方式,免去了复现核心加密JS代码,逆向难度大大降低。 @@ -21,11 +21,11 @@ ## 功能列表 | 平台 | Cookie 登录 | 二维码登录 | 手机号登录 | 关键词搜索 | 指定视频/帖子 ID 爬取 | 登录状态缓存 | 数据保存 | IP 代理池 | 滑块验证码 | |:---:|:---------:|:-----:|:-----:|:-----:|:-------------:|:------:|:----:|:------:|:-----:| -| 小红书 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✕ | -| 抖音 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| 快手 | ✅ | ✅ | ✕ | ✅ | ✅ | ✅ | ✅ | ✅ | ✕ | -| B 站 | ✕ | ✕ | ✕ | ✕ | ✕ | ✕ | ✕ | ✕ | ✕ | -| 微博 | ✕ | ✕ | ✕ | ✕ | ✕ | ✕ | ✕ | ✕ | ✕ | +| 小红书 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✕ | +| 抖音 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| 快手 | ✅ | ✅ | ✕ | ✅ | ✅ | ✅ | ✅ | ✅ | ✕ | +| B 站 | ✅ | ✅ | ✕ | ✅ | ✕ | ✅ | ✅ | ✕ | ✕ | +| 微博 | ✕ | ✕ | ✕ | ✕ | ✕ | ✕ | ✕ | ✕ | ✕ | ## 使用方法 diff --git a/media_platform/bilibili/login.py b/media_platform/bilibili/login.py index 31b0421..25d8d68 100644 --- a/media_platform/bilibili/login.py +++ b/media_platform/bilibili/login.py @@ -34,13 +34,74 @@ class BilibiliLogin(AbstractLogin): self.cookie_str = cookie_str async def begin(self): - pass + """Start login Bilibili""" + utils.logger.info("Begin login Bilibili ...") + if self.login_type == 
"qrcode": + await self.login_by_qrcode() + elif self.login_type == "phone": + await self.login_by_mobile() + elif self.login_type == "cookie": + await self.login_by_cookies() + else: + raise ValueError("Invalid Login Type Currently only supported qrcode or phone or cookie ...") + + @retry(stop=stop_after_attempt(20), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False)) + async def check_login_state(self) -> bool: + """ + Check if the current login status is successful and return True otherwise return False + retry decorator will retry 20 times if the return value is False, and the retry interval is 1 second + if max retry times reached, raise RetryError + """ + current_cookie = await self.browser_context.cookies() + _, cookie_dict = utils.convert_cookies(current_cookie) + if cookie_dict.get("SESSDATA", "") or cookie_dict.get("DedeUserID"): + return True + return False async def login_by_qrcode(self): - pass + """login bilibili website and keep webdriver login state""" + utils.logger.info("Begin login bilibili by qrcode ...") + + # click login button + login_button_ele = self.context_page.locator( + "xpath=//div[@class='right-entry__outside go-login-btn']//div" + ) + await login_button_ele.click() + + # find login qrcode + qrcode_img_selector = "//div[@class='login-scan-box']//img" + base64_qrcode_img = await utils.find_login_qrcode( + self.context_page, + selector=qrcode_img_selector + ) + if not base64_qrcode_img: + utils.logger.info("login failed , have not found qrcode please check ....") + sys.exit() + + # show login qrcode + partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img) + asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode) + + utils.logger.info(f"Waiting for scan code login, remaining time is 20s") + try: + await self.check_login_state() + except RetryError: + utils.logger.info("Login bilibili failed by qrcode login method ...") + sys.exit() + + wait_redirect_seconds = 5 + 
utils.logger.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...") + await asyncio.sleep(wait_redirect_seconds) async def login_by_mobile(self): pass async def login_by_cookies(self): - pass + utils.logger.info("Begin login bilibili by cookie ...") + for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items(): + await self.browser_context.add_cookies([{ + 'name': key, + 'value': value, + 'domain': ".bilibili.com", + 'path': "/" + }]) diff --git a/media_platform/kuaishou/login.py b/media_platform/kuaishou/login.py index ad9d02c..a7508ca 100644 --- a/media_platform/kuaishou/login.py +++ b/media_platform/kuaishou/login.py @@ -98,6 +98,6 @@ class KuaishouLogin(AbstractLogin): await self.browser_context.add_cookies([{ 'name': key, 'value': value, - 'domain': ".douyin.com", + 'domain': ".kuaishou.com", 'path': "/" }])