fix: 增加小红书登录两种形态下弹窗的兼容代码
This commit is contained in:
parent
88e8ee302e
commit
1085a2a769
|
@ -2,9 +2,12 @@
|
||||||
|
|
||||||
platform = "xhs"
|
platform = "xhs"
|
||||||
keyword = "健身"
|
keyword = "健身"
|
||||||
login_type = "handby" # qrcode or phone
|
login_type = "cookie" # qrcode or phone or cookie
|
||||||
login_phone = "13812345678" # your login phone
|
login_phone = "" # your login phone
|
||||||
login_webSession="040069b5f35b1cfef9787551bd364b86f4d839"
|
|
||||||
|
# If it's on the Xiaohongshu platform, only the web_session cookie will be kept.
|
||||||
|
# web_session=040069b2acxxxxxxxxxxxxxxxxxxxx;
|
||||||
|
cookies = ""
|
||||||
|
|
||||||
# redis config
|
# redis config
|
||||||
redis_db_host = "redis://127.0.0.1"
|
redis_db_host = "redis://127.0.0.1"
|
||||||
|
|
6
main.py
6
main.py
|
@ -23,9 +23,9 @@ async def main():
|
||||||
parser = argparse.ArgumentParser(description='Media crawler program.')
|
parser = argparse.ArgumentParser(description='Media crawler program.')
|
||||||
parser.add_argument('--platform', type=str, help='Media platform select (xhs|dy)...', default=config.platform)
|
parser.add_argument('--platform', type=str, help='Media platform select (xhs|dy)...', default=config.platform)
|
||||||
parser.add_argument('--keywords', type=str, help='Search note/page keywords...', default=config.keyword)
|
parser.add_argument('--keywords', type=str, help='Search note/page keywords...', default=config.keyword)
|
||||||
parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | handby)', default=config.login_type)
|
parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)', default=config.login_type)
|
||||||
parser.add_argument('--web_session', type=str, help='cookies to keep log in', default=config.login_webSession)
|
|
||||||
parser.add_argument('--phone', type=str, help='Login phone', default=config.login_phone)
|
parser.add_argument('--phone', type=str, help='Login phone', default=config.login_phone)
|
||||||
|
parser.add_argument('--cookies', type=str, help='cookies to keep log in', default=config.cookies)
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
crawler = CrawlerFactory().create_crawler(platform=args.platform)
|
crawler = CrawlerFactory().create_crawler(platform=args.platform)
|
||||||
|
@ -33,7 +33,7 @@ async def main():
|
||||||
keywords=args.keywords,
|
keywords=args.keywords,
|
||||||
login_phone=args.phone,
|
login_phone=args.phone,
|
||||||
login_type=args.lt,
|
login_type=args.lt,
|
||||||
web_session=args.web_session
|
cookie_str=args.cookies
|
||||||
)
|
)
|
||||||
await crawler.start()
|
await crawler.start()
|
||||||
|
|
||||||
|
|
|
@ -30,7 +30,8 @@ class XiaoHongShuCrawler(Crawler):
|
||||||
self.login_type = None
|
self.login_type = None
|
||||||
self.keywords = None
|
self.keywords = None
|
||||||
self.web_session = None
|
self.web_session = None
|
||||||
self.cookies: Optional[List[Cookie]] = None
|
self.cookies: Optional[List[Cookie]] = None # cookies from browser context
|
||||||
|
self.cookie_str: Optional[str] = None # cookie string from config or command line
|
||||||
self.browser_context: Optional[BrowserContext] = None
|
self.browser_context: Optional[BrowserContext] = None
|
||||||
self.context_page: Optional[Page] = None
|
self.context_page: Optional[Page] = None
|
||||||
self.proxy: Optional[Dict] = None
|
self.proxy: Optional[Dict] = None
|
||||||
|
@ -88,28 +89,51 @@ class XiaoHongShuCrawler(Crawler):
|
||||||
|
|
||||||
async def login(self):
|
async def login(self):
|
||||||
"""login xiaohongshu website and keep webdriver login state"""
|
"""login xiaohongshu website and keep webdriver login state"""
|
||||||
# There are two ways to log in:
|
# There are three ways to log in:
|
||||||
# 1. Semi-automatic: Log in by scanning the QR code.
|
# 1. Semi-automatic: Log in by scanning the QR code.
|
||||||
# 2. Fully automatic: Log in using forwarded text message notifications
|
# 2. Fully automatic: Log in using forwarded text message notifications
|
||||||
# 3. handby automatic: Log in using preset cookie
|
# 3. Semi-automatic: Log in using preset cookie
|
||||||
# which includes mobile phone number and verification code.
|
|
||||||
if self.login_type == "qrcode":
|
if self.login_type == "qrcode":
|
||||||
await self.login_by_qrcode()
|
await self.login_by_qrcode()
|
||||||
elif self.login_type == "phone":
|
elif self.login_type == "phone":
|
||||||
await self.login_by_mobile()
|
await self.login_by_mobile()
|
||||||
elif self.login_type == "handby":
|
elif self.login_type == "cookie":
|
||||||
await self.browser_context.add_cookies([{
|
# cookie str convert to cookie dict
|
||||||
'name': 'web_session',
|
for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
|
||||||
'value': self.web_session,
|
await self.browser_context.add_cookies([{
|
||||||
'domain': ".xiaohongshu.com",
|
'name': key,
|
||||||
'path': "/"
|
'value': value,
|
||||||
}])
|
'domain': ".xiaohongshu.com",
|
||||||
else:
|
'path': "/"
|
||||||
|
}])
|
||||||
|
else:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
async def login_by_mobile(self):
|
async def login_by_mobile(self):
|
||||||
print("Start executing mobile phone number + verification code login on Xiaohongshu. ...")
|
print("Start executing mobile phone number + verification code login on Xiaohongshu. ...")
|
||||||
|
|
||||||
|
await asyncio.sleep(1)
|
||||||
|
try:
|
||||||
|
# After entering the main page of Xiaohongshu,
|
||||||
|
# the login window may not pop up automatically and you need to manually click the login button.
|
||||||
|
login_button_ele = await self.context_page.wait_for_selector(
|
||||||
|
selector="xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button",
|
||||||
|
timeout=5000
|
||||||
|
)
|
||||||
|
await login_button_ele.click()
|
||||||
|
|
||||||
|
# There are also two types of login dialog boxes for pop-ups.
|
||||||
|
# One type directly shows the phone number and verification code.
|
||||||
|
# Another type requires clicking to switch to mobile login.
|
||||||
|
element = await self.context_page.wait_for_selector(
|
||||||
|
selector='xpath=//div[@class="login-container"]//div[@class="other-method"]/div[1]',
|
||||||
|
timeout=5000
|
||||||
|
)
|
||||||
|
await element.click()
|
||||||
|
except:
|
||||||
|
print("have not found mobile button icon and keep going ...")
|
||||||
|
await asyncio.sleep(1)
|
||||||
|
|
||||||
login_container_ele = await self.context_page.wait_for_selector("div.login-container")
|
login_container_ele = await self.context_page.wait_for_selector("div.login-container")
|
||||||
# Fill login phone
|
# Fill login phone
|
||||||
input_ele = await login_container_ele.query_selector("label.phone > input")
|
input_ele = await login_container_ele.query_selector("label.phone > input")
|
||||||
|
@ -158,16 +182,25 @@ class XiaoHongShuCrawler(Crawler):
|
||||||
async def login_by_qrcode(self):
|
async def login_by_qrcode(self):
|
||||||
"""login xiaohongshu website and keep webdriver login state"""
|
"""login xiaohongshu website and keep webdriver login state"""
|
||||||
print("Start scanning QR code to log in to Xiaohongshu. ...")
|
print("Start scanning QR code to log in to Xiaohongshu. ...")
|
||||||
|
qrcode_img_selector = "xpath=//img[@class='qrcode-img']"
|
||||||
|
|
||||||
# find login qrcode
|
# find login qrcode
|
||||||
base64_qrcode_img = await utils.find_login_qrcode(
|
base64_qrcode_img = await utils.find_login_qrcode(
|
||||||
self.context_page,
|
self.context_page,
|
||||||
selector="div.login-container > div.left > div.qrcode > img"
|
selector=qrcode_img_selector
|
||||||
)
|
)
|
||||||
if not base64_qrcode_img:
|
if not base64_qrcode_img:
|
||||||
# todo ...if this website does not automatically popup login dialog box, we will manual click login button
|
print("have not found qrcode and try again get it ....")
|
||||||
print("login failed , have not found qrcode please check ....")
|
# If the website does not pop up the login dialog automatically, click the login button manually.
|
||||||
sys.exit()
|
login_button_ele = self.context_page.locator("xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button")
|
||||||
|
await login_button_ele.click()
|
||||||
|
base64_qrcode_img = await utils.find_login_qrcode(
|
||||||
|
self.context_page,
|
||||||
|
selector=qrcode_img_selector
|
||||||
|
)
|
||||||
|
if not base64_qrcode_img:
|
||||||
|
print("login failed , program exit ...")
|
||||||
|
sys.exit()
|
||||||
|
|
||||||
# get not logged session
|
# get not logged session
|
||||||
current_cookie = await self.browser_context.cookies()
|
current_cookie = await self.browser_context.cookies()
|
||||||
|
|
16
utils.py
16
utils.py
|
@ -61,6 +61,22 @@ def convert_cookies(cookies: Optional[List[Cookie]]) -> Tuple[str, Dict]:
|
||||||
return cookies_str, cookie_dict
|
return cookies_str, cookie_dict
|
||||||
|
|
||||||
|
|
||||||
|
def convert_str_cookie_to_dict(cookie_str: Optional[str]) -> Dict[str, str]:
    """Parse a ``Cookie``-header style string into a name -> value mapping.

    The input is expected to look like ``"name1=value1; name2=value2"``.

    Args:
        cookie_str: Raw cookie string (e.g. copied from the browser or passed
            on the command line). ``None`` or an empty string yields ``{}``.

    Returns:
        A dict mapping each cookie name to its value. When a name occurs more
        than once, the last occurrence wins.
    """
    cookie_dict: Dict[str, str] = {}
    if not cookie_str:
        return cookie_dict
    for pair in cookie_str.split(";"):
        # Split on the FIRST "=" only: cookie values (base64 tokens, nested
        # key=value payloads) may themselves contain "=" and must be kept whole.
        name, sep, value = pair.partition("=")
        name = name.strip()
        # Skip empty segments (trailing ";"), segments without "=", and
        # nameless pairs instead of raising IndexError or storing a "" key.
        if not sep or not name:
            continue
        cookie_dict[name] = value.strip()
    return cookie_dict
|
||||||
|
|
||||||
|
|
||||||
def get_current_timestamp():
|
def get_current_timestamp():
|
||||||
return int(time.time() * 1000)
|
return int(time.time() * 1000)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue