fix: 增加小红书登录两种形态下弹窗的兼容代码
This commit is contained in:
parent
88e8ee302e
commit
1085a2a769
|
@ -2,9 +2,12 @@
|
|||
|
||||
platform = "xhs"
|
||||
keyword = "健身"
|
||||
login_type = "handby" # qrcode or phone
|
||||
login_phone = "13812345678" # your login phone
|
||||
login_webSession="040069b5f35b1cfef9787551bd364b86f4d839"
|
||||
login_type = "cookie" # qrcode or phone or cookie
|
||||
login_phone = "" # your login phone
|
||||
|
||||
# If it's on the Xiaohongshu platform, only the web_session cookie will be kept.
|
||||
# web_session=040069b2acxxxxxxxxxxxxxxxxxxxx;
|
||||
cookies = ""
|
||||
|
||||
# redis config
|
||||
redis_db_host = "redis://127.0.0.1"
|
||||
|
|
6
main.py
6
main.py
|
@ -23,9 +23,9 @@ async def main():
|
|||
parser = argparse.ArgumentParser(description='Media crawler program.')
|
||||
parser.add_argument('--platform', type=str, help='Media platform select (xhs|dy)...', default=config.platform)
|
||||
parser.add_argument('--keywords', type=str, help='Search note/page keywords...', default=config.keyword)
|
||||
parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | handby)', default=config.login_type)
|
||||
parser.add_argument('--web_session', type=str, help='cookies to keep log in', default=config.login_webSession)
|
||||
parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)', default=config.login_type)
|
||||
parser.add_argument('--phone', type=str, help='Login phone', default=config.login_phone)
|
||||
parser.add_argument('--cookies', type=str, help='cookies to keep log in', default=config.cookies)
|
||||
|
||||
args = parser.parse_args()
|
||||
crawler = CrawlerFactory().create_crawler(platform=args.platform)
|
||||
|
@ -33,7 +33,7 @@ async def main():
|
|||
keywords=args.keywords,
|
||||
login_phone=args.phone,
|
||||
login_type=args.lt,
|
||||
web_session=args.web_session
|
||||
cookie_str=args.cookies
|
||||
)
|
||||
await crawler.start()
|
||||
|
||||
|
|
|
@ -30,7 +30,8 @@ class XiaoHongShuCrawler(Crawler):
|
|||
self.login_type = None
|
||||
self.keywords = None
|
||||
self.web_session = None
|
||||
self.cookies: Optional[List[Cookie]] = None
|
||||
self.cookies: Optional[List[Cookie]] = None # cookies from browser context
|
||||
self.cookie_str: Optional[str] = None # cookie string from config or command line
|
||||
self.browser_context: Optional[BrowserContext] = None
|
||||
self.context_page: Optional[Page] = None
|
||||
self.proxy: Optional[Dict] = None
|
||||
|
@ -88,28 +89,51 @@ class XiaoHongShuCrawler(Crawler):
|
|||
|
||||
async def login(self):
|
||||
"""login xiaohongshu website and keep webdriver login state"""
|
||||
# There are two ways to log in:
|
||||
# There are three ways to log in:
|
||||
# 1. Semi-automatic: Log in by scanning the QR code.
|
||||
# 2. Fully automatic: Log in using forwarded text message notifications
|
||||
# 3. handby automatic: Log in using preset cookie
|
||||
# which includes mobile phone number and verification code.
|
||||
# 3. Semi-automatic: Log in using preset cookie
|
||||
if self.login_type == "qrcode":
|
||||
await self.login_by_qrcode()
|
||||
elif self.login_type == "phone":
|
||||
await self.login_by_mobile()
|
||||
elif self.login_type == "handby":
|
||||
await self.browser_context.add_cookies([{
|
||||
'name': 'web_session',
|
||||
'value': self.web_session,
|
||||
'domain': ".xiaohongshu.com",
|
||||
'path': "/"
|
||||
}])
|
||||
else:
|
||||
elif self.login_type == "cookie":
|
||||
# cookie str convert to cookie dict
|
||||
for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
|
||||
await self.browser_context.add_cookies([{
|
||||
'name': key,
|
||||
'value': value,
|
||||
'domain': ".xiaohongshu.com",
|
||||
'path': "/"
|
||||
}])
|
||||
else:
|
||||
pass
|
||||
|
||||
|
||||
async def login_by_mobile(self):
|
||||
print("Start executing mobile phone number + verification code login on Xiaohongshu. ...")
|
||||
|
||||
await asyncio.sleep(1)
|
||||
try:
|
||||
# After entering the main page of Xiaohongshu,
|
||||
# the login window may not pop up automatically and you need to manually click the login button.
|
||||
login_button_ele = await self.context_page.wait_for_selector(
|
||||
selector="xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button",
|
||||
timeout=5000
|
||||
)
|
||||
await login_button_ele.click()
|
||||
|
||||
# There are also two types of login dialog boxes for pop-ups.
|
||||
# One type directly shows the phone number and verification code.
|
||||
# Another type requires clicking to switch to mobile login.
|
||||
element = await self.context_page.wait_for_selector(
|
||||
selector='xpath=//div[@class="login-container"]//div[@class="other-method"]/div[1]',
|
||||
timeout=5000
|
||||
)
|
||||
await element.click()
|
||||
except:
|
||||
print("have not found mobile button icon and keep going ...")
|
||||
await asyncio.sleep(1)
|
||||
|
||||
login_container_ele = await self.context_page.wait_for_selector("div.login-container")
|
||||
# Fill login phone
|
||||
input_ele = await login_container_ele.query_selector("label.phone > input")
|
||||
|
@ -158,16 +182,25 @@ class XiaoHongShuCrawler(Crawler):
|
|||
async def login_by_qrcode(self):
|
||||
"""login xiaohongshu website and keep webdriver login state"""
|
||||
print("Start scanning QR code to log in to Xiaohongshu. ...")
|
||||
qrcode_img_selector = "xpath=//img[@class='qrcode-img']"
|
||||
|
||||
# find login qrcode
|
||||
base64_qrcode_img = await utils.find_login_qrcode(
|
||||
self.context_page,
|
||||
selector="div.login-container > div.left > div.qrcode > img"
|
||||
selector=qrcode_img_selector
|
||||
)
|
||||
if not base64_qrcode_img:
|
||||
# todo ...if this website does not automatically popup login dialog box, we will manual click login button
|
||||
print("login failed , have not found qrcode please check ....")
|
||||
sys.exit()
|
||||
print("have not found qrcode and try again get it ....")
|
||||
# if this website does not automatically popup login dialog box, we will manual click login button
|
||||
login_button_ele = self.context_page.locator("xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button")
|
||||
await login_button_ele.click()
|
||||
base64_qrcode_img = await utils.find_login_qrcode(
|
||||
self.context_page,
|
||||
selector=qrcode_img_selector
|
||||
)
|
||||
if not base64_qrcode_img:
|
||||
print("login failed , program exit ...")
|
||||
sys.exit()
|
||||
|
||||
# get not logged session
|
||||
current_cookie = await self.browser_context.cookies()
|
||||
|
|
16
utils.py
16
utils.py
|
@ -61,6 +61,22 @@ def convert_cookies(cookies: Optional[List[Cookie]]) -> Tuple[str, Dict]:
|
|||
return cookies_str, cookie_dict
|
||||
|
||||
|
||||
def convert_str_cookie_to_dict(cookie_str: str) -> Dict:
|
||||
cookie_dict = dict()
|
||||
if not cookie_str:
|
||||
return cookie_dict
|
||||
for cookie in cookie_str.split(";"):
|
||||
cookie = cookie.strip()
|
||||
if not cookie:
|
||||
continue
|
||||
cookie = cookie.split("=")
|
||||
cookie_value = cookie[1]
|
||||
if isinstance(cookie_value, list):
|
||||
cookie_value = "".join(cookie_value)
|
||||
cookie_dict[cookie[0]] = cookie_value
|
||||
return cookie_dict
|
||||
|
||||
|
||||
def get_current_timestamp():
|
||||
return int(time.time() * 1000)
|
||||
|
||||
|
|
Loading…
Reference in New Issue