fix: 增加小红书登录两种形态下弹窗的兼容代码

This commit is contained in:
Relakkes 2023-06-22 22:43:26 +08:00
parent 88e8ee302e
commit 1085a2a769
4 changed files with 75 additions and 23 deletions

View File

@ -2,9 +2,12 @@
platform = "xhs"
keyword = "健身"
login_type = "handby" # qrcode or phone
login_phone = "13812345678" # your login phone
login_webSession="040069b5f35b1cfef9787551bd364b86f4d839"
login_type = "cookie" # qrcode or phone or cookie
login_phone = "" # your login phone
# If it's on the Xiaohongshu platform, only the web_session cookie will be kept.
# web_session=040069b2acxxxxxxxxxxxxxxxxxxxx;
cookies = ""
# redis config
redis_db_host = "redis://127.0.0.1"

View File

@ -23,9 +23,9 @@ async def main():
parser = argparse.ArgumentParser(description='Media crawler program.')
parser.add_argument('--platform', type=str, help='Media platform select (xhs|dy)...', default=config.platform)
parser.add_argument('--keywords', type=str, help='Search note/page keywords...', default=config.keyword)
parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | handby)', default=config.login_type)
parser.add_argument('--web_session', type=str, help='cookies to keep log in', default=config.login_webSession)
parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)', default=config.login_type)
parser.add_argument('--phone', type=str, help='Login phone', default=config.login_phone)
parser.add_argument('--cookies', type=str, help='cookies to keep log in', default=config.cookies)
args = parser.parse_args()
crawler = CrawlerFactory().create_crawler(platform=args.platform)
@ -33,7 +33,7 @@ async def main():
keywords=args.keywords,
login_phone=args.phone,
login_type=args.lt,
web_session=args.web_session
cookie_str=args.cookies
)
await crawler.start()

View File

@ -30,7 +30,8 @@ class XiaoHongShuCrawler(Crawler):
self.login_type = None
self.keywords = None
self.web_session = None
self.cookies: Optional[List[Cookie]] = None
self.cookies: Optional[List[Cookie]] = None # cookies from browser context
self.cookie_str: Optional[str] = None # cookie string from config or command line
self.browser_context: Optional[BrowserContext] = None
self.context_page: Optional[Page] = None
self.proxy: Optional[Dict] = None
@ -88,28 +89,51 @@ class XiaoHongShuCrawler(Crawler):
async def login(self):
"""login xiaohongshu website and keep webdriver login state"""
# There are two ways to log in:
# There are three ways to log in:
# 1. Semi-automatic: Log in by scanning the QR code.
# 2. Fully automatic: Log in using forwarded text message notifications
# 3. handby automatic: Log in using preset cookie
# which includes mobile phone number and verification code.
# 3. Semi-automatic: Log in using preset cookie
if self.login_type == "qrcode":
await self.login_by_qrcode()
elif self.login_type == "phone":
await self.login_by_mobile()
elif self.login_type == "handby":
await self.browser_context.add_cookies([{
'name': 'web_session',
'value': self.web_session,
'domain': ".xiaohongshu.com",
'path': "/"
}])
else:
elif self.login_type == "cookie":
# cookie str convert to cookie dict
for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
await self.browser_context.add_cookies([{
'name': key,
'value': value,
'domain': ".xiaohongshu.com",
'path': "/"
}])
else:
pass
async def login_by_mobile(self):
print("Start executing mobile phone number + verification code login on Xiaohongshu. ...")
await asyncio.sleep(1)
try:
# After entering the main page of Xiaohongshu,
# the login window may not pop up automatically and you need to manually click the login button.
login_button_ele = await self.context_page.wait_for_selector(
selector="xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button",
timeout=5000
)
await login_button_ele.click()
# There are also two types of login dialog boxes for pop-ups.
# One type directly shows the phone number and verification code.
# Another type requires clicking to switch to mobile login.
element = await self.context_page.wait_for_selector(
selector='xpath=//div[@class="login-container"]//div[@class="other-method"]/div[1]',
timeout=5000
)
await element.click()
except:
print("have not found mobile button icon and keep going ...")
await asyncio.sleep(1)
login_container_ele = await self.context_page.wait_for_selector("div.login-container")
# Fill login phone
input_ele = await login_container_ele.query_selector("label.phone > input")
@ -158,16 +182,25 @@ class XiaoHongShuCrawler(Crawler):
async def login_by_qrcode(self):
"""login xiaohongshu website and keep webdriver login state"""
print("Start scanning QR code to log in to Xiaohongshu. ...")
qrcode_img_selector = "xpath=//img[@class='qrcode-img']"
# find login qrcode
base64_qrcode_img = await utils.find_login_qrcode(
self.context_page,
selector="div.login-container > div.left > div.qrcode > img"
selector=qrcode_img_selector
)
if not base64_qrcode_img:
# todo ...if this website does not automatically popup login dialog box, we will manual click login button
print("login failed , have not found qrcode please check ....")
sys.exit()
print("have not found qrcode and try again get it ....")
# If the website does not pop up the login dialog automatically, click the login button manually.
login_button_ele = self.context_page.locator("xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button")
await login_button_ele.click()
base64_qrcode_img = await utils.find_login_qrcode(
self.context_page,
selector=qrcode_img_selector
)
if not base64_qrcode_img:
print("login failed , program exit ...")
sys.exit()
# Get the cookies of the not-yet-logged-in session
current_cookie = await self.browser_context.cookies()

View File

@ -61,6 +61,22 @@ def convert_cookies(cookies: Optional[List[Cookie]]) -> Tuple[str, Dict]:
return cookies_str, cookie_dict
def convert_str_cookie_to_dict(cookie_str: str) -> Dict:
    """Parse a ``Cookie``-header style string into a name -> value mapping.

    The input is expected to look like ``"a=1; b=2"``: pairs separated by
    ``;`` with the name and value separated by the first ``=``.

    Args:
        cookie_str: Raw cookie string. May be empty or ``None``.

    Returns:
        A dict mapping each cookie name to its value. Empty when
        ``cookie_str`` is falsy. When a name repeats, the last value wins.
        Fragments that are empty, have no ``=``, or have an empty name are
        skipped.
    """
    cookie_dict: Dict = {}
    if not cookie_str:
        return cookie_dict
    for pair in cookie_str.split(";"):
        # Split on the FIRST "=" only: cookie values (e.g. base64 padding)
        # may legitimately contain "=" themselves.
        name, sep, value = pair.strip().partition("=")
        name = name.strip()
        if not sep or not name:
            # Empty fragment (";;", trailing ";") or not in "name=value" form.
            continue
        cookie_dict[name] = value.strip()
    return cookie_dict
def get_current_timestamp():
    """Return the current Unix time as an integer number of milliseconds."""
    millis = time.time() * 1000
    return int(millis)