import sys
import asyncio
from typing import Optional, List, Dict

from playwright.async_api import BrowserContext, Cookie, Page, async_playwright

import utils
from .client import XHSClient
from base_crawler import Crawler


class XiaoHongShuCrawler(Crawler):
    def __init__(self):
        self.keywords = None
        self.cookies: Optional[List[Cookie]] = None
        self.browser_context: Optional[BrowserContext] = None
        self.context_page: Optional[Page] = None
        self.proxy: Optional[Dict] = None
        self.user_agent = utils.get_user_agent()
        self.xhs_client: Optional[XHSClient] = None
        self.login_url = "https://www.xiaohongshu.com"
        self.scan_qrcode_time = 20  # seconds to wait for the QR code to be scanned

    def init_config(self, **kwargs):
        self.keywords = kwargs.get("keywords")

    async def update_cookies(self):
        self.cookies = await self.browser_context.cookies()

    async def start(self):
        async with async_playwright() as playwright:
            # launch the browser and create a single browser context
            chromium = playwright.chromium
            browser = await chromium.launch(headless=True)
            self.browser_context = await browser.new_context(
                viewport={"width": 1920, "height": 1080},
                user_agent=self.user_agent,
                proxy=self.proxy
            )

            # execute stealth JS to bypass anti-automation/crawler detection
            await self.browser_context.add_init_script(path="libs/stealth.min.js")
            self.context_page = await self.browser_context.new_page()
            await self.context_page.goto(self.login_url)

            # scan the QR code to log in
            await self.login()
            await self.update_cookies()

            # init the request client
            cookie_str, cookie_dict = utils.convert_cookies(self.cookies)
            self.xhs_client = XHSClient(
                proxies=self.proxy,
                headers={
                    "User-Agent": self.user_agent,
                    "Cookie": cookie_str,
                    "Origin": "https://www.xiaohongshu.com",
                    "Referer": "https://www.xiaohongshu.com",
                    "Content-Type": "application/json;charset=UTF-8"
                },
                playwright_page=self.context_page,
                cookie_dict=cookie_dict,
            )

            # search for notes and retrieve their comment information
            note_res = await self.search_posts()
            for post_item in note_res.get("items"):
                note_id = post_item.get("id")
                await self.get_comments(note_id=note_id)
                await asyncio.sleep(1)

            # block the main crawler coroutine so the browser context stays alive
            await asyncio.Event().wait()

    async def login(self):
        """Log in to xiaohongshu and keep the browser login state."""
        print("Begin login to xiaohongshu ...")

        # find the login QR code
        base64_qrcode_img = await utils.find_login_qrcode(
            self.context_page,
            selector="div.login-container > div.left > div.qrcode > img"
        )
        current_cookie = await self.browser_context.cookies()
        _, cookie_dict = utils.convert_cookies(current_cookie)
        no_logged_in_session = cookie_dict.get("web_session")
        if not base64_qrcode_img:
            if await self.check_login_state(no_logged_in_session):
                return
            # todo: if the website does not automatically pop up the login dialog,
            #       click the login button manually
            print("Login failed, QR code not found, please check ...")
            sys.exit()

        # show the login QR code
        utils.show_qrcode(base64_qrcode_img)

        while self.scan_qrcode_time > 0:
            await asyncio.sleep(1)
            self.scan_qrcode_time -= 1
            print(f"Waiting for QR code login, remaining time: {self.scan_qrcode_time} seconds")
            # get the login state from the browser
            if await self.check_login_state(no_logged_in_session):
                # If the QR code login succeeds, wait for a moment: there is a second
                # redirection after a successful login, and JS executed during this
                # period may run in a page that has already been destroyed.
                wait_for_seconds = 5
                print(f"Login successful, waiting {wait_for_seconds} seconds for the redirect ...")
                while wait_for_seconds > 0:
                    await asyncio.sleep(1)
                    print(f"Remaining wait: {wait_for_seconds} seconds ...")
                    wait_for_seconds -= 1
                break
        else:
            # the loop ran out of time without a successful login
            sys.exit()

    async def check_login_state(self, no_logged_in_session: str) -> bool:
        """Return True if login has succeeded, i.e. the web_session cookie has changed from its not-logged-in value."""
        current_cookie = await self.browser_context.cookies()
        _, cookie_dict = utils.convert_cookies(current_cookie)
        current_web_session = cookie_dict.get("web_session")
        return current_web_session != no_logged_in_session

    async def search_posts(self):
        # This function only retrieves the first page of notes (about 10).
        # You can keep requesting more pages as long as "has_more" in the response is True.
        print("Begin search xiaohongshu keywords: ", self.keywords)
        posts_res = await self.xhs_client.get_note_by_keyword(keyword=self.keywords)
        for post_item in posts_res.get("items"):
            note_id = post_item.get("id")
            title = post_item.get("note_card", {}).get("display_title")
            print(f"Note ID: {note_id}; Title: {title}")
            # todo: record the note or save it to a db or csv
        return posts_res
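
    # Illustrative pagination sketch (not part of the original crawler): keep fetching
    # pages while "has_more" is True, as the comment in search_posts suggests. It
    # assumes XHSClient.get_note_by_keyword accepts a `page` parameter; adjust to the
    # client's real signature if it differs.
    async def search_posts_paged(self, max_pages: int = 3):
        page = 1
        all_items = []
        while page <= max_pages:
            posts_res = await self.xhs_client.get_note_by_keyword(keyword=self.keywords, page=page)
            all_items.extend(posts_res.get("items", []))
            if not posts_res.get("has_more"):
                break
            page += 1
            await asyncio.sleep(1)  # pause briefly between requests
        return all_items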

    async def get_comments(self, note_id: str):
        # This function only retrieves the first page of comments (about 10).
        # You can keep requesting more by checking "has_more" in the response,
        # or use get_note_all_comments (commented out below) to fetch them all.
        print("Begin get comments for note id: ", note_id)
        res = await self.xhs_client.get_note_comments(note_id=note_id)
        # res = await self.xhs_client.get_note_all_comments(note_id=note_id)
        for comment in res.get("comments"):
            nick_name = comment.get("user_info").get("nickname")
            comment_content = comment.get("content")
            print(f"Nickname: {nick_name}; Comment content: {comment_content}")
            # todo: save to db or csv
        return res
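

# Minimal usage sketch (illustrative, not part of the original module). It assumes this
# file lives in a package (note the relative import of XHSClient above), so it should be
# run via `python -m <package>.<module>` rather than directly, and that Crawler needs no
# extra constructor arguments. The keyword below is only a hypothetical example value.
if __name__ == "__main__":
    crawler = XiaoHongShuCrawler()
    crawler.init_config(keywords="python")
    asyncio.run(crawler.start())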