Initialize project
This commit is contained in:
parent 1e78c893bf
commit 50fa6ae8b8

@@ -0,0 +1,31 @@

## Recruitment Website Data Scraping

### Reference projects

#### Lagou (拉钩网)

> Simple Python scraping examples covering many sites: proxy lists, 58 Daojia, house-price sites, Eastmoney, ITOrange, postal codes, Kangmei TCM, Lagou, Maoyan, investment/financing data, China Judgements Online, Ziroom, Baike, China house prices, NetEase Cloud Music, Qunar, Autohome

- [spider-project](https://github.com/tanjunchen/spider-project)

> A crawler for Lagou job listings, for learning and reference only! Any commercial use is at your own risk

- [lagou_crawler](https://github.com/DE009/lagou_crawler)

#### BOSS Zhipin (BOSS 直聘)

> Crawler reverse-engineering case studies, completed: NetEase Yidun | WeChat mini-program decompilation (百达星系) | 10jqka (同花顺) | RPC decryption | Jiasule (加速乐) | Geetest slider captcha | Juliang Suanshu (巨量算数) | Boss Zhipin | Qichacha | China Minmetals | QQ Music | industrial-policy big-data platform | Qizhidao | Xueqiu (acw_sc__v2) | 1688 | Qimai Data | whggzy | Qiming Tech | mohurd | Endata | OKLink

- [spider_reverse](https://github.com/ChenZixinn/spider_reverse)

> A collection of Python crawler projects, from basics to JS reverse engineering: a basics track, an automation track, an advanced track, and a captcha track. Cases cover major sites (xhs, douyin, weibo, ins, boss job, jd, ...) and teach crawling, anti-crawling, automation, and captcha handling

- [crawlProject](https://github.com/xishandong/crawlProject)

#### Zhaopin (智联招聘)

#### Automatic resume submission

> Submits your resume on Lagou automatically

- [Lagou auto-apply](https://github.com/BeammNotFound/get-jobs-lagou)
- [Boss Zhipin auto-apply](https://github.com/BeammNotFound/get-jobs-boss)
- [51job auto-apply](https://github.com/BeammNotFound/get-jobs-51job)


@@ -0,0 +1,28 @@

[Fetch all search filter conditions](https://www.zhipin.com/wapi/zpgeek/search/job/condition.json)

[Fetch all cities nationwide](https://www.zhipin.com/wapi/zpCommon/data/cityGroup.json)

[Main job-search URL; requires parameters](https://www.zhipin.com/wapi/zpgeek/mobile/search/joblist.json)

- scene: 1
- query: Java
- city: 101010100
- experience:
- payType:
- partTime:
- degree:
- industry:
- scale:
- stage:
- position:
- jobType:
- salary:
- multiBusinessDistrict:
- multiSubway:
- page: 1
- pageSize: 30

[Fetch all keywords related to a search term](https://www.zhipin.com/wapi/zpgeek/search/job/related/word.json?query=关键词)
[Fetch district info for a city](https://www.zhipin.com/wapi/zpgeek/businessDistrict.json?cityCode=101010100)
[Fetch subway line info; needed when filtering by subway line](https://www.zhipin.com/wapi/zpCommon/data/getSubwayByCity?cityCode=101010100)
[Company industry filter data](https://www.zhipin.com/wapi/zpCommon/data/industry.json)
[Job type filter data](https://www.zhipin.com/wapi/zpCommon/data/getCityShowPosition)
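
For orientation, a minimal sketch of querying the mobile joblist endpoint with the parameters documented above. It assumes a valid `__zp_stoken__` cookie has already been generated (the `BossJob` class below does this); the cookie value here is a hypothetical placeholder.

```python
import requests

# Minimal sketch: query the mobile joblist endpoint with the documented params.
cookies = {'__zp_stoken__': '<generated token>'}  # hypothetical placeholder
params = {'scene': 1, 'query': 'Java', 'city': 101010100, 'page': 1, 'pageSize': 30}
resp = requests.get(
    'https://www.zhipin.com/wapi/zpgeek/mobile/search/joblist.json',
    params=params,
    headers={'Accept': 'application/json, text/plain, */*'},
    cookies=cookies,
    timeout=10,
)
# The mobile API returns the job cards as an HTML fragment under zpData.html
print(resp.json().get('zpData', {}).get('html'))
```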

@@ -0,0 +1,367 @@

import json
import time
from csv import DictWriter
from itertools import islice
from typing import Literal, Iterator, Union
from urllib.parse import urlparse, parse_qs

import execjs
import requests
from lxml import etree
from tqdm import tqdm  # used by the commented-out progress bar in show_pro

# Proxy settings
# from Proxy_info import proxies, get_api
# from boss.点选 import BossSlide

# Type control
Accept = Literal['json', 'text', 'contents']
city_code_dict: dict = json.load(open('cityCode.json', 'r', encoding='utf-8'))

# Sleep time between requests (seconds)
sleepTime = 5


class BossJob:
    def __init__(self, js_name: str = '', proxy: dict = None):
        self.isFirst: bool = True  # whether this is the first visit
        self.js_name: str = js_name  # name of the security JS file
        self.seed: str = ''  # random seed
        self.ts: str = ''  # timestamp
        # API list
        self.apiList: list[str] = [
            'https://www.zhipin.com/wapi/zpgeek/mobile/search/joblist.json',  # mobile job search, requires params
            'https://www.zhipin.com/job_detail/',  # no params required
            f'https://www.zhipin.com/web/common/security-js/{self.js_name}.js',  # dynamically loaded security JS
            'https://www.zhipin.com/wapi/zpgeek/search/joblist.json'  # web API
        ]
        # Request headers
        self.headers: dict = {
            'Accept': 'application/json, text/plain, */*',
        }
        self.cookies: dict = {}  # cookies
        self.js = execjs.compile(open('demo.js', 'r', encoding='utf-8').read())  # compiled JS for token generation
        self.stop: bool = False  # stops the mobile search loop
        self.checkEnd: str = ''  # detects whether the mobile search has reached the end
        self.proxy = proxy  # proxy

    # Send a request, retrying up to 5 times
    def ajax_request(self, url: str, params: dict = None, cookies=None) -> requests.Response:
        for _ in range(5):
            try:
                resp = requests.get(url, params=params, headers=self.headers, cookies=cookies, timeout=10,
                                    # proxies=self.proxy
                                    )
                if resp.status_code == 200:
                    return resp
                elif resp.status_code == 403:
                    print("===== got HTTP 403, IP is banned =====")
                    self.show_pro(sleepTime)
                    self.change_ip()
                    continue
                else:
                    print('HTTP Error: %s' % resp.status_code)
                    self.show_pro(sleepTime)
                    continue
            except Exception as e:
                print('Error: ', e)
                print('URL: ', url)
                self.show_pro(sleepTime)
                continue
        raise Exception('Could not get a valid response after 5 attempts...')

    # Initialize a search: obtain the seed/ts used to generate the __zp_stoken__ cookie
    def first_get_seed(self, url: str, params: dict = None, isWeb: bool = False) -> Union[requests.Response, None]:
        if self.isFirst:
            resp = self.ajax_request(url=url, params=params)
            self.isFirst = False
        else:
            resp = self.ajax_request(url=url, params=params, cookies=self.cookies)
        # No redirect occurred and this is not the web API
        if resp.url == url and not isWeb:
            print(f'===== cookie not updated this time: {resp.url} =====')
            return resp
        elif isWeb:
            zpData = resp.json()['zpData']
            self.seed = zpData['seed']
            self.ts = zpData['ts']
            name = zpData['name']
            self.check_js(name)
            return
        # Handle a redirect to the security-check page
        parsedUrl = urlparse(resp.url)
        generatedDict = parse_qs(parsedUrl.query)
        self.seed = generatedDict['seed'][0]
        self.ts = generatedDict['ts'][0]
        name = generatedDict['name'][0]
        self.check_js(name)

    # Mobile job search
    def search_job_mobile(self, position: str, city: str, startPage: int = 1) -> Iterator:
        self.headers.update({
            'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Mobile Safari/537.36',
        })
        city_code = city_code_dict.get(city)
        if city_code:
            params: dict = {
                'city': city_code,
                'query': position,
                'page': startPage
            }
            # Initialize the search
            self.first_get_seed(self.apiList[1], params)
            self.update_cookie()
            continuations: list = [params]
            # Simulated pagination
            while continuations:
                continuation = continuations.pop()
                resp = self.ajax_request('https://www.zhipin.com/wapi/zpgeek/mobile/search/joblist.json',
                                         params=continuation, cookies=self.cookies)
                html = resp.json().get('zpData', {}).get('html')
                # New postings exist
                if html and self.stop is False:
                    print(f'===== scraping {position}-{city} page {continuation["page"]} =====')
                    continuation['page'] += 1
                    continuations.append(continuation)
                    # Yield parsed items
                    yield from self.parse_search_html(html)
                    # Throttle the crawl rate
                    self.show_pro(sleepTime)
                elif not html and self.stop is False:
                    print('===== IP is banned =====')
                    continuations.append(continuation)
                    self.show_pro(sleepTime)
                    self.change_ip()
                    self.isFirst = True
                    self.first_get_seed(self.apiList[1], params)
                    self.update_cookie()
                else:
                    print(f'===== scraping {position}-{city} stopped =====')
        else:
            raise Exception(f'Invalid city name: {city}')

    # Web job search
    def search_job_web(self, position: str, city: str, startPage: int = 1, totalPage: int = 1) -> Iterator:
        self.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
        })
        city_code = city_code_dict.get(city)

        if city_code:
            params = {
                'query': position,
                'city': city_code,
                'page': 1,
                'pageSize': '30',
                'scene': '1',
            }
            # First visit
            self.isFirst = True
            self.first_get_seed(self.apiList[3], params=params, isWeb=True)
            page = startPage
            # Pagination control
            while page <= totalPage:
                params.update({'page': page})
                self.update_cookie()
                resp = self.ajax_request(self.apiList[3], params=params, cookies=self.cookies)

                print(f'===== scraping {position}-{city} page {page} =====')
                # Access error: regenerate the cookie
                if resp.json().get('code') == 37:
                    print(f'===== {resp.json().get("message")}, retrying =====')
                    zpData = resp.json()['zpData']
                    self.seed = zpData['seed']
                    self.ts = zpData['ts']
                    self.show_pro(sleepTime)
                    continue
                # IP banned: pause for a while
                elif resp.json().get('code') == 5002:
                    print(f'===== {resp.json().get("message")} =====')
                    self.show_pro(sleepTime)
                    self.change_ip()
                    self.isFirst = True
                    self.first_get_seed(self.apiList[3], params=params, isWeb=True)
                    continue
                # Got data
                searchData = resp.json().get('zpData', {}).get('jobList')
                if searchData:
                    page += 1
                    # Yield to the pipeline
                    yield from self.parse_search_data(searchData)
                    # Take a break
                    self.show_pro(sleepTime)
                    # Seed and timestamp needed for the next request
                    self.seed = resp.cookies['__zp_sseed__']
                    self.ts = resp.cookies['__zp_sts__']
        else:
            raise Exception(f'Invalid city name: {city}')

    # Fetch a detail page by encrypted job id
    def get_job_details_by_id(self, encryptJobId: str) -> str:
        url = self.apiList[1] + encryptJobId + '.html'
        return self.get_job_details_by_url(url)

    # Fetch a detail page by URL
    def get_job_details_by_url(self, url: str) -> str:
        self.headers.update({
            'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Mobile Safari/537.36',
        })
        resp = self.first_get_seed(url)
        self.update_cookie()

        if not resp:
            resp = self.ajax_request(url, cookies=self.cookies)
        tree = etree.HTML(resp.text)
        texts = tree.xpath('//div[@class="detail-content"]//text()')
        textList: list = [i.strip() for i in texts if i.strip()]

        if not textList:
            print('===== resetting cookie to fetch the detail page =====')
            self.isFirst = True
            self.show_pro(sleepTime)
            return self.get_job_details_by_url(url)

        return '\n'.join(textList)

    # Save mobile search results to CSV
    def save_job_list_to_csv(self, position: str, city: str, startPage: int = 1, saveCount: int = 100):
        dataSet: Iterator = self.search_job_mobile(position, city, startPage)

        header = ['job_name', 'detail_url', 'pay', 'company_name', 'requirement']
        fp = open(f'mobile-{position}-{city}.csv', 'w', encoding='utf-8', newline='')
        writer = DictWriter(fp, header)
        writer.writeheader()

        for job in islice(dataSet, saveCount):
            job['requirement'] = ';'.join(job['requirement'])
            writer.writerow(job)

    # Save web search results to CSV
    def save_job_list_to_csv_web(self, position: str, city: str, startPage: int = 1, savePage: int = 2):
        dataSet = self.search_job_web(position, city, startPage, savePage)

        header = [
            'jobName', 'encryptJobId', 'salaryDesc', 'jobLabels', 'skills', 'jobExperience',
            'jobDegree', 'cityName', 'brandName', 'brandScaleName', 'welfareList', 'brandIndustry'
        ]
        fp = open(f'web-{position}-{city}.csv', 'w', encoding='utf-8', newline='')
        writer = DictWriter(fp, header)
        writer.writeheader()

        for job in dataSet:
            job['jobLabels'] = ';'.join(job['jobLabels'])
            job['skills'] = ';'.join(job['skills'])
            job['welfareList'] = ';'.join(job['welfareList'])
            writer.writerow(job)

    # Regenerate the __zp_stoken__ cookie
    def update_cookie(self):
        print(f"seed === {self.seed} , ts === {self.ts}")
        __zp = self.js.call('r', self.seed, self.ts)
        self.cookies['__zp_stoken__'] = __zp
        print(f'===== cookie updated: {self.cookies["__zp_stoken__"]}')

    # Parse the mobile search HTML
    def parse_search_html(self, html: str) -> Iterator:
        tree = etree.HTML(html)
        li_list = tree.xpath('//li')

        for num, li in enumerate(li_list, start=1):
            if num == 1:
                if self.checkEnd == li.xpath('./a/@href')[0]:
                    self.stop = True
                    return
                self.checkEnd = li.xpath('./a/@href')[0]

            yield {
                'job_name': li.xpath('./a/div[1]/span[1]/text()')[0],
                'detail_url': 'https://www.zhipin.com' + li.xpath('./a/@href')[0],
                'pay': li.xpath('a/div[1]/span[2]/text()')[0],
                'company_name': li.xpath('./a/div[2]/span[1]/text()')[0],
                'requirement': [r.strip() for r in li.xpath('./a/div[3]//text()') if r.strip()]
            }

    # Check whether the security JS is up to date
    def check_js(self, name):
        if self.js_name != name:
            self.js_name = name
            print(f"===== current JS name -----> {name} =====")
            resp = self.ajax_request(f'https://www.zhipin.com/web/common/security-js/{self.js_name}.js').text
            resp_ = resp.split('module,')
            resp = ''

            # Rewrite the `module` references, otherwise the script is easily flagged as a crawler
            for i in range(len(resp_)):
                resp += resp_[i]
                if i == 0:
                    resp += 'module_,'
                if i == 1:
                    resp += 'module,'

            with open('./jssss.js', 'w', encoding='utf-8') as f:
                f.write(resp)

    # Parse web search results
    @staticmethod
    def parse_search_data(searchData: list[dict]) -> Iterator:
        for job in searchData:
            yield {
                'jobName': job['jobName'],
                'encryptJobId': job['encryptJobId'],
                'salaryDesc': job['salaryDesc'],
                'jobLabels': job['jobLabels'],
                'skills': job['skills'],
                'jobExperience': job['jobExperience'],
                'jobDegree': job['jobDegree'],
                'cityName': job['cityName'],
                'brandName': job['brandName'],
                'brandScaleName': job['brandScaleName'],
                'welfareList': job['welfareList'],
                'brandIndustry': job['brandIndustry']
            }

    # Switch to a new proxy IP (stubbed out)
    @staticmethod
    def change_ip():
        # response = requests.get(
        #     'https://www.zhipin.com/wapi/zpAntispam/v2/geetest/validate',
        #     params=self.__do_verify(),
        #     cookies=self.cookies,
        #     headers=self.headers,
        # )
        # print(response.text)
        pass

    # Show a progress bar while sleeping (stubbed out)
    @staticmethod
    def show_pro(t: int, isOpen: bool = True):
        pass
        # time.sleep(1)
        # if isOpen:
        #     for _ in tqdm(
        #             range(t * 10),
        #             leave=False,
        #             colour='blue',
        #             desc='waiting...',
        #             ascii='*-'
        #     ):
        #         time.sleep(0.1)


if __name__ == '__main__':
    boss = BossJob('8955eed0')
    # Fetch a detail page by URL
    # detail = boss.get_job_details_by_url('https://www.zhipin.com/job_detail/fc823036861698e10nF42NW0GVo~.html')
    # Fetch a detail page by encrypted id
    # detail = boss.get_job_details_by_id('05988daddc5b6afc1n1-3du1FVZW')
    # print(detail)
    # Save data
    # boss.save_job_list_to_csv('python', '上海', saveCount=20)
    # boss.save_job_list_to_csv_web('python', '上海', 2, 2)
    # Web search
    items = boss.search_job_web('python', '上海', 1, 10)
    # Mobile search
    # items = boss.search_job_mobile('web', '上海')
    for item in items:
        print(item)

@@ -0,0 +1,373 @@

{
    "鞍山": 101070300,
    "阿拉善盟": 101081200,
    "安康": 101110700,
    "阿克苏地区": 101131000,
    "阿勒泰地区": 101131500,
    "阿拉尔": 101131700,
    "阿里地区": 101140700,
    "安阳": 101180200,
    "安庆": 101220600,
    "安顺": 101260300,
    "阿坝藏族羌族自治州": 101271900,
    "澳门": 101330100,
    "北京": 101010100,
    "白城": 101060500,
    "白山": 101060800,
    "本溪": 101070500,
    "包头": 101080200,
    "巴彦淖尔": 101080800,
    "保定": 101090200,
    "宝鸡": 101110900,
    "滨州": 101121100,
    "巴音郭楞蒙古自治州": 101130400,
    "博尔塔拉蒙古自治州": 101130500,
    "北屯市": 101132100,
    "白银": 101161000,
    "蚌埠": 101220200,
    "亳州": 101220900,
    "毕节": 101260500,
    "巴中": 101270900,
    "保山": 101290300,
    "百色": 101301000,
    "北海": 101301300,
    "白沙黎族自治县": 101311400,
    "保亭黎族苗族自治县": 101311800,
    "重庆": 101040100,
    "长春": 101060100,
    "朝阳": 101071200,
    "赤峰": 101080500,
    "承德": 101090400,
    "沧州": 101090700,
    "长治": 101100500,
    "昌吉回族自治州": 101130300,
    "昌都": 101140300,
    "常州": 101191100,
    "滁州": 101221000,
    "池州": 101221500,
    "长沙": 101250100,
    "郴州": 101250500,
    "常德": 101250600,
    "成都": 101270100,
    "潮州": 101281500,
    "楚雄彝族自治州": 101291700,
    "崇左": 101300200,
    "澄迈": 101311200,
    "昌江黎族自治县": 101311500,
    "大庆": 101050800,
    "大兴安岭地区": 101051300,
    "大连": 101070200,
    "丹东": 101070600,
    "大同": 101100200,
    "德州": 101120400,
    "东营": 101121200,
    "定西": 101160200,
    "达州": 101270600,
    "德阳": 101271700,
    "东莞": 101281600,
    "东沙群岛": 101282200,
    "德宏傣族景颇族自治州": 101291300,
    "迪庆藏族自治州": 101291500,
    "大理白族自治州": 101291600,
    "儋州": 101310400,
    "东方": 101310900,
    "定安": 101311000,
    "鄂尔多斯": 101080600,
    "鄂州": 101200300,
    "恩施土家族苗族自治州": 101201300,
    "抚顺": 101070400,
    "阜新": 101070900,
    "阜阳": 101220800,
    "福州": 101230100,
    "抚州": 101240400,
    "佛山": 101280800,
    "防城港": 101301400,
    "果洛藏族自治州": 101150600,
    "甘南藏族自治州": 101161400,
    "固原": 101170400,
    "赣州": 101240700,
    "贵阳": 101260100,
    "广安": 101270800,
    "广元": 101271800,
    "甘孜藏族自治州": 101272100,
    "广州": 101280100,
    "桂林": 101300500,
    "贵港": 101300800,
    "哈尔滨": 101050100,
    "黑河": 101050600,
    "鹤岗": 101051100,
    "葫芦岛": 101071400,
    "呼和浩特": 101080100,
    "呼伦贝尔": 101080700,
    "衡水": 101090800,
    "邯郸": 101091000,
    "汉中": 101110800,
    "菏泽": 101121000,
    "哈密": 101130900,
    "和田地区": 101131300,
    "海东": 101150200,
    "海北藏族自治州": 101150300,
    "黄南藏族自治州": 101150400,
    "海南藏族自治州": 101150500,
    "海西蒙古族藏族自治州": 101150800,
    "鹤壁": 101181200,
    "淮安": 101190900,
    "黄冈": 101200500,
    "黄石": 101200600,
    "杭州": 101210100,
    "湖州": 101210200,
    "合肥": 101220100,
    "淮南": 101220400,
    "淮北": 101221100,
    "黄山": 101221600,
    "衡阳": 101250400,
    "怀化": 101251200,
    "惠州": 101280300,
    "河源": 101281200,
    "红河哈尼族彝族自治州": 101291200,
    "贺州": 101300700,
    "河池": 101301200,
    "海口": 101310100,
    "佳木斯": 101050400,
    "鸡西": 101051000,
    "吉林": 101060200,
    "锦州": 101070700,
    "晋中": 101100400,
    "晋城": 101100600,
    "济南": 101120100,
    "济宁": 101120700,
    "金昌": 101160600,
    "酒泉": 101160800,
    "嘉峪关": 101161200,
    "焦作": 101181100,
    "济源": 101181800,
    "荆州": 101200800,
    "荆门": 101201200,
    "嘉兴": 101210300,
    "金华": 101210900,
    "九江": 101240200,
    "吉安": 101240600,
    "景德镇": 101240800,
    "江门": 101281100,
    "揭阳": 101281900,
    "克拉玛依": 101130200,
    "克孜勒苏柯尔克孜自治州": 101131100,
    "喀什地区": 101131200,
    "可克达拉市": 101132200,
    "昆玉市": 101132300,
    "开封": 101180800,
    "昆明": 101290100,
    "辽源": 101060600,
    "辽阳": 101071000,
    "廊坊": 101090600,
    "临汾": 101100700,
    "吕梁": 101101100,
    "临沂": 101120900,
    "聊城": 101121700,
    "拉萨": 101140100,
    "林芝": 101140400,
    "兰州": 101160100,
    "陇南": 101161100,
    "临夏回族自治州": 101161300,
    "洛阳": 101180900,
    "漯河": 101181500,
    "连云港": 101191000,
    "丽水": 101210800,
    "六安": 101221400,
    "龙岩": 101230700,
    "娄底": 101250800,
    "六盘水": 101260600,
    "泸州": 101271000,
    "乐山": 101271400,
    "凉山彝族自治州": 101272000,
    "临沧": 101290800,
    "丽江": 101290900,
    "柳州": 101300300,
    "来宾": 101300400,
    "临高": 101311300,
    "乐东黎族自治县": 101311600,
    "陵水黎族自治县": 101311700,
    "牡丹江": 101050300,
    "马鞍山": 101220500,
    "绵阳": 101270400,
    "眉山": 101271500,
    "梅州": 101280400,
    "茂名": 101282000,
    "那曲": 101140600,
    "南阳": 101180700,
    "南京": 101190100,
    "南通": 101190500,
    "宁波": 101210400,
    "宁德": 101230300,
    "南平": 101230900,
    "南昌": 101240100,
    "南充": 101270500,
    "内江": 101271200,
    "怒江傈僳族自治州": 101291400,
    "南宁": 101300100,
    "盘锦": 101071300,
    "平凉": 101160300,
    "平顶山": 101180500,
    "濮阳": 101181300,
    "莆田": 101230400,
    "萍乡": 101240900,
    "攀枝花": 101270200,
    "普洱": 101290500,
    "齐齐哈尔": 101050200,
    "七台河": 101050900,
    "秦皇岛": 101091100,
    "青岛": 101120200,
    "庆阳": 101160400,
    "潜江": 101201500,
    "衢州": 101211000,
    "泉州": 101230500,
    "黔东南苗族侗族自治州": 101260700,
    "黔南布依族苗族自治州": 101260800,
    "黔西南布依族苗族自治州": 101260900,
    "清远": 101281300,
    "曲靖": 101290200,
    "钦州": 101301100,
    "琼海": 101310600,
    "琼中黎族苗族自治县": 101311900,
    "日照": 101121500,
    "日喀则": 101140200,
    "上海": 101020100,
    "绥化": 101050500,
    "双鸭山": 101051200,
    "四平": 101060300,
    "松原": 101060700,
    "沈阳": 101070100,
    "石家庄": 101090100,
    "朔州": 101100900,
    "商洛": 101110600,
    "石河子": 101131600,
    "双河市": 101132400,
    "山南": 101140500,
    "石嘴山": 101170200,
    "商丘": 101181000,
    "三门峡": 101181700,
    "苏州": 101190400,
    "宿迁": 101191300,
    "十堰": 101201000,
    "随州": 101201100,
    "神农架": 101201700,
    "绍兴": 101210500,
    "宿州": 101220700,
    "三明": 101230800,
    "上饶": 101240300,
    "邵阳": 101250900,
    "遂宁": 101270700,
    "韶关": 101280200,
    "汕头": 101280500,
    "深圳": 101280600,
    "汕尾": 101282100,
    "三亚": 101310200,
    "三沙": 101310300,
    "天津": 101030100,
    "通化": 101060400,
    "铁岭": 101071100,
    "通辽": 101080400,
    "唐山": 101090500,
    "太原": 101100100,
    "铜川": 101111000,
    "泰安": 101120800,
    "吐鲁番": 101130800,
    "塔城地区": 101131400,
    "图木舒克": 101131800,
    "铁门关": 101132000,
    "天水": 101160900,
    "泰州": 101191200,
    "天门": 101201600,
    "台州": 101210600,
    "铜陵": 101221200,
    "铜仁": 101260400,
    "屯昌": 101311100,
    "台湾": 101341100,
    "乌海": 101080300,
    "乌兰察布": 101080900,
    "渭南": 101110500,
    "潍坊": 101120600,
    "威海": 101121300,
    "乌鲁木齐": 101130100,
    "五家渠": 101131900,
    "武威": 101160500,
    "吴忠": 101170300,
    "无锡": 101190200,
    "武汉": 101200100,
    "温州": 101210700,
    "芜湖": 101220300,
    "文山壮族苗族自治州": 101291100,
    "梧州": 101300600,
    "五指山": 101310500,
    "文昌": 101310700,
    "万宁": 101310800,
    "锡林郭勒盟": 101081000,
    "兴安盟": 101081100,
    "邢台": 101090900,
    "忻州": 101101000,
    "西安": 101110100,
    "咸阳": 101110200,
    "新星市": 101132500,
    "西宁": 101150100,
    "新乡": 101180300,
    "许昌": 101180400,
    "信阳": 101180600,
    "徐州": 101190800,
    "襄阳": 101200200,
    "孝感": 101200400,
    "咸宁": 101200700,
    "仙桃": 101201400,
    "宣城": 101221300,
    "厦门": 101230200,
    "新余": 101241000,
    "湘潭": 101250200,
    "湘西土家族苗族自治州": 101251400,
    "西双版纳傣族自治州": 101291000,
    "香港": 101320300,
    "伊春": 101050700,
    "延边朝鲜族自治州": 101060900,
    "营口": 101070800,
    "阳泉": 101100300,
    "运城": 101100800,
    "延安": 101110300,
    "榆林": 101110400,
    "烟台": 101120500,
    "伊犁哈萨克自治州": 101130600,
    "玉树藏族自治州": 101150700,
    "银川": 101170100,
    "扬州": 101190600,
    "盐城": 101190700,
    "宜昌": 101200900,
    "宜春": 101240500,
    "鹰潭": 101241100,
    "益阳": 101250700,
    "岳阳": 101251000,
    "永州": 101251300,
    "宜宾": 101271100,
    "雅安": 101271600,
    "云浮": 101281400,
    "阳江": 101281800,
    "玉溪": 101290400,
    "玉林": 101300900,
    "张家口": 101090300,
    "淄博": 101120300,
    "枣庄": 101121400,
    "张掖": 101160700,
    "中卫": 101170500,
    "郑州": 101180100,
    "周口": 101181400,
    "驻马店": 101181600,
    "镇江": 101190300,
    "舟山": 101211100,
    "漳州": 101230600,
    "株洲": 101250300,
    "张家界": 101251100,
    "遵义": 101260200,
    "自贡": 101270300,
    "资阳": 101271300,
    "珠海": 101280700,
    "肇庆": 101280900,
    "湛江": 101281000,
    "中山": 101281700,
    "昭通": 101290700
}

File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long

@@ -0,0 +1,4 @@

import execjs

# Compile the security JS and generate a __zp_stoken__ value from a seed and
# timestamp -- the same call BossJob.update_cookie makes.
js = execjs.compile(open('demo.js', 'r', encoding='utf-8').read())
print(js.call('r', "1EAWUR51t3ADpSjeK5ywydCLIV2U4WaF93nocYiDXQs=", "1699709623728"))

@@ -0,0 +1,24 @@

# Recruitment Data Scraping

## Lagou Usage Notes

> Note: a Node.js environment is required; see its own docs for installation and setup.

1. First enter the `Job` directory and install three packages:
```shell
npm install crypto-js
npm install jsencrypt
npm install get-random-values
```
Once installed you can run the `la_gou.py` script.

2. If the following error occurs during execution:
```shell
window is not defined
```
locate the `jsencrypt` install directory and add the following code in `node_modules/jsencrypt/bin/jsencrypt.js`:
```javascript
var window = {};
var navigator = {};
```
then run again.

@@ -0,0 +1,267 @@

import time
import json
import execjs
import requests
from lxml import etree
from urllib import parse

aes_key = ""
secret_key_value = ""

with open('lagou.js', 'r', encoding='utf-8') as f:
    lagou_js = execjs.compile(f.read())

UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"

x_anit = {
    "x-anit-forge-code": "0",
    "x-anit-forge-token": None
}

global_cookies = {
    # - If the IP gets blacklisted, access requires login, or you see "too frequent"
    #   warnings, you must fill in the post-login cookies!
    # - All values below must be copied after logging in, especially JSESSIONID;
    #   a JSESSIONID obtained without logging in is invalid!
    # - In testing, three cookies keep the session logged in: login, gate_login_token, _putrc
    # - JSESSIONID is mainly used to obtain the x-anit-forge-code and x-anit-forge-token headers

    # "login": "true",
    # "gate_login_token": "",
    # "_putrc": "",
    # "JSESSIONID": ""
}


def get_user_trace_token() -> str:
    # Fetch the user_trace_token cookie
    json_url = "https://a.lagou.com/json"
    headers = {
        "Host": "a.lagou.com",
        "Referer": "https://www.lagou.com/",
        "User-Agent": UA
    }
    params = {
        "lt": "trackshow",
        "t": "ad",
        "v": 0,
        "dl": "https://www.lagou.com/",
        "dr": "https://www.lagou.com",
        "time": str(int(time.time() * 1000))
    }
    response = requests.get(url=json_url, headers=headers, params=params)
    user_trace_token = response.cookies.get_dict()["user_trace_token"]
    return user_trace_token


def get_lg_stoken(original_data: dict) -> str:
    # Fetch the __lg_stoken__ cookie
    token_url = "https://www.lagou.com/wn/jobs"
    token_headers = {
        "Host": "www.lagou.com",
        "Referer": "https://www.lagou.com/",
        "User-Agent": UA
    }
    params = {
        "kd": original_data["kd"],
        "city": original_data["city"],
        "fromSearch": original_data["fromSearch"],
        "pn": original_data["pn"],
        "px": original_data["px"]
    }
    token_response = requests.get(url=token_url, params=params, headers=token_headers, cookies=global_cookies,
                                  allow_redirects=False)
    if token_response.status_code != 302:
        raise Exception("Unexpected response while fetching the redirect! Check whether global_cookies already contains __lg_stoken__!")
    # Take the 302 redirect target
    security_check_url = token_response.headers["Location"]
    print(f"security_check_url --->>> {security_check_url}")
    if "login" in security_check_url:
        raise Exception("The IP has been thrown into the blacklist! Login is required! Fill in the post-login cookies, or add a proxy!")
    parse_result = parse.urlparse(security_check_url)
    # The URL query string is the object to be encrypted
    security_check_params = parse_result.query
    # The `name` parameter is the filename of the obfuscated JS
    security_check_js_name = parse.parse_qs(security_check_params)["name"][0]

    # Fetch the obfuscated JS
    js_url = "https://www.lagou.com/common-sec/dist/" + security_check_js_name + ".js"
    js_headers = {
        "Host": "www.lagou.com",
        "Referer": security_check_url,
        "User-Agent": UA
    }
    js_response = requests.get(url=js_url, headers=js_headers, cookies=global_cookies).text
    # Complete the JS: add a window object and a helper that returns the __lg_stoken__ value
    lg_js = """
    window = {
        "location": {
            "hostname": "www.lagou.com",
            "search": '?%s'
        }
    }
    function getLgStoken(){
        return window.gt.prototype.a()
    }
    """ % security_check_params + js_response

    lg_stoken = execjs.compile(lg_js).call("getLgStoken")
    print(f"lg_stoken --->>> {lg_stoken}")
    return lg_stoken


def update_cookies(original_data: dict) -> None:
    global global_cookies
    # Fetch user_trace_token
    user_trace_token = get_user_trace_token()
    # Fetch X_HTTP_TOKEN
    x_http_token = lagou_js.call("getXHttpToken", "user_trace_token=" + user_trace_token)
    # First update of the global cookies; fetching __lg_stoken__ below relies on them
    global_cookies.update({
        "user_trace_token": user_trace_token,
        "X_HTTP_TOKEN": x_http_token
    })

    # Fetch __lg_stoken__
    lg_stoken = get_lg_stoken(original_data)
    # Second update of the global cookies
    global_cookies.update({
        "__lg_stoken__": lg_stoken,
    })


def update_aes_key() -> None:
    # Obtain the AES key via JS and activate it through the API; activation returns
    # a secretKeyValue that later request headers need
    global aes_key, secret_key_value
    url = "https://gate.lagou.com/system/agreement"
    headers = {
        "Content-Type": "application/json",
        "Host": "gate.lagou.com",
        "Origin": "https://www.lagou.com",
        "Referer": "https://www.lagou.com/",
        "User-Agent": UA
    }
    encrypt_data = lagou_js.call("getAesKeyAndRsaEncryptData")
    aes_key = encrypt_data["aesKey"]
    rsa_encrypt_data = encrypt_data["rsaEncryptData"]
    data = {"secretKeyDecode": rsa_encrypt_data}
    response = requests.post(url=url, headers=headers, json=data).json()
    secret_key_value = response["content"]["secretKeyValue"]


def update_x_anit(original_data: dict) -> None:
    # Refresh x-anit-forge-code and x-anit-forge-token
    url = "https://www.lagou.com/wn/jobs"
    headers = {
        "Host": "www.lagou.com",
        "Referer": "https://www.lagou.com/",
        "User-Agent": UA
    }
    params = {
        "kd": original_data["kd"],
        "city": original_data["city"]
    }
    print(f"update_x_anit params --->>> {params}")
    response = requests.get(url=url, params=params, headers=headers, cookies=global_cookies)
    print(f"update_x_anit response --->>> {response.text}")
    tree = etree.HTML(response.text)
    next_data_json = json.loads(tree.xpath("//script[@id='__NEXT_DATA__']/text()")[0])
    submit_code = next_data_json["props"]["tokenData"]["submitCode"]
    submit_token = next_data_json["props"]["tokenData"]["submitToken"]
    # Note: JSESSIONID must come from a verified login!
    if not submit_code or not submit_token:
        raise Exception("submitCode & submitToken are empty; check whether JSESSIONID is correct!")
    global x_anit
    x_anit["x-anit-forge-code"] = submit_code
    x_anit["x-anit-forge-token"] = submit_token


def get_header_params(original_data: dict) -> dict:
    # Header parameters needed by the subsequent data request
    # Job-search URL; for company search use https://www.lagou.com/jobs/companyAjax.json instead
    u = "https://www.lagou.com/jobs/v2/positionAjax.json"
    return {
        "traceparent": lagou_js.call("getTraceparent"),
        "X-K-HEADER": secret_key_value,
        "X-S-HEADER": lagou_js.call("getXSHeader", aes_key, original_data, u),
        "X-SS-REQ-HEADER": json.dumps({"secret": secret_key_value})
    }


def get_encrypted_data(original_data: dict) -> str:
    # AES-encrypt the raw request data
    encrypted_data = lagou_js.call("getRequestData", aes_key, original_data)
    return encrypted_data


def get_data(original_data: dict, encrypted_data: str, header_params: dict) -> dict:
    # Send the encrypted payload with the full headers, receive the ciphertext,
    # then AES-decrypt it into plaintext job data
    url = "https://www.lagou.com/jobs/v2/positionAjax.json"
    referer = parse.urljoin("https://www.lagou.com/wn/jobs?", parse.urlencode(original_data))
    headers = {
        # "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Host": "www.lagou.com",
        "Origin": "https://www.lagou.com",
        "Referer": referer,
        "traceparent": header_params["traceparent"],
        "User-Agent": UA,
        "X-K-HEADER": header_params["X-K-HEADER"],
        "X-S-HEADER": header_params["X-S-HEADER"],
        "X-SS-REQ-HEADER": header_params["X-SS-REQ-HEADER"],
    }
    # Add x-anit-forge-code and x-anit-forge-token
    headers.update(x_anit)

    data = {"data": encrypted_data}
    response = requests.post(url=url, headers=headers, cookies=global_cookies, data=data).json()
    if "status" in response:
        if not response["status"] and "操作太频繁" in response["msg"]:
            raise Exception("Failed to fetch data! msg: %s! Try filling in the post-login cookies, or add a proxy!" % response["msg"])
        else:
            raise Exception("Unexpected data response! Check whether the request data is complete!")
    else:
        response_data = response["data"]
        decrypted_data = lagou_js.call("getResponseData", response_data, aes_key)
        return decrypted_data


def main():
    # Initialization flag
    need_init = True

    # The raw search data; e.g. to scrape the 10 newest pages of Java jobs nationwide:
    # for pn in range(1, 10):
    original_data = {
        "city": "北京",  # city scope
        "pn": 1,  # page number
        "kd": "java",  # keyword
        "px": "new",  # sort order; new: newest, default: default
        "fromSearch": "true"
    }

    while need_init:
        # These parameters only need to be set once; later requests reuse them.
        # Fetch the required cookies: user_trace_token, X_HTTP_TOKEN, __lg_stoken__
        update_cookies(original_data)
        # Obtain and activate the AES key; it is unusable until activated
        update_aes_key()
        # When logged in, the job request carries two extra headers:
        # x-anit-forge-code and x-anit-forge-token. In practice omitting them,
        # or sending random values, also works; to be safe, fetch them properly.
        if "login" in global_cookies:
            update_x_anit(original_data)
        need_init = False

    # Build the request-header parameters: X-K-HEADER, X-S-HEADER, X-SS-REQ-HEADER, traceparent
    header_params = get_header_params(original_data)
    # AES-encrypt the raw search data
    encrypted_data = get_encrypted_data(original_data)
    print(f"global_cookies --->>> {global_cookies}")
    print(f"original_data --->>> {original_data}")
    print(f"header_params --->>> {header_params}")
    print(f"encrypted_data --->>> {encrypted_data} key --->>> {aes_key}")
    # Send the request, receive the encrypted data, and decrypt it
    data = get_data(original_data, encrypted_data, header_params)
    print(data["content"]["hrInfoMap"])
    print(data["content"]["positionResult"])


if __name__ == '__main__':
    main()

File diff suppressed because one or more lines are too long

@@ -0,0 +1,6 @@

from urllib import parse

if __name__ == '__main__':
    # URL-encode a Chinese city name and show its raw UTF-8 bytes
    a = parse.quote('北京')
    print(a)
    print("北京".encode(encoding='UTF-8', errors='strict'))

@@ -0,0 +1,248 @@

import sys

import requests
import time
import json
import random

# Global settings
LOGIN_NAME = "dl-renmeng"
LOGIN_PASSWD = "1111"
IS_WORK_DAY = 0


def get_clock_in_data(clock_in_time):
    """Build the clock-in payload based on the current hour."""
    print("Current hour:", clock_in_time.tm_hour)
    # Time presets
    time_type_one = ["00:00", "09:00", "18:00", "23:59"]
    time_type_two = ["00:00:00", "09:00:00", "18:00:00", "23:59:59"]
    if clock_in_time.tm_hour > 9:
        # Afternoon clock-out
        clock_in_data = {
            "time": time_type_one[2],
            "belongtime": time_type_one[2],
            "canSignTime": time_type_one[3],
            "signTime": time.strftime("%H:%M:%S", clock_in_time),
            "date": time.strftime("%Y-%m-%d", clock_in_time),
            "belongdate": time.strftime("%Y-%m-%d", clock_in_time),
            "datetime": f'{time.strftime("%Y-%m-%d", clock_in_time)} {time_type_two[2]}',
            "signSectionTime": f'{time.strftime("%Y-%m-%d", clock_in_time)} {time_type_two[3]}',
            "signSection": f'{time.strftime("%Y-%m-%d", clock_in_time)} {time_type_two[0]}#{time.strftime("%Y-%m-%d", clock_in_time)} {time_type_two[3]}',
            "min": "359",
            "workmins": "480",
            "type": "off",
            "across": "0",
            "islastsign": "1",
            "isYellow": "1",
            "isPunchOpen": "1",
            "isacross": "0",
            "pre": "0",
            "active": "0",
            "needSign": "0",
            "reSign": "1",
            "min_next": "-1",
            "signfrom": "e9pc",
            "serialid": "1",
            "signAcross": "0",
            "signAcross_next": "0",
            "signbelong": "今天",
            "signbelongspan": "今天",
        }
    else:
        # Morning clock-in
        clock_in_data = {
            "time": time_type_one[1],
            "belongtime": time_type_one[1],
            "canSignTime": time_type_one[0],
            "date": time.strftime("%Y-%m-%d", clock_in_time),
            "belongdate": time.strftime("%Y-%m-%d", clock_in_time),
            "datetime": f'{time.strftime("%Y-%m-%d", clock_in_time)} {time_type_two[1]}',
            "signSectionTime": f'{time.strftime("%Y-%m-%d", clock_in_time)} {time_type_two[0]}',
            "signSection": f'{time.strftime("%Y-%m-%d", clock_in_time)} {time_type_two[0]}#{time.strftime("%Y-%m-%d", clock_in_time)} {time_type_two[3]}',
            "min": "540",
            "workmins": "480",
            "isfirstsign": "1",
            "type": "on",
            "across": "0",
            "islastsign": "1",
            "isYellow": "0",
            "isPunchOpen": "1",
            "isacross": "0",
            "pre": "0",
            "active": "1",
            "needSign": "1",
            "min_next": "-1",
            "serialid": "1",
            "signAcross": "0",
            "signAcross_next": "0",
        }
    return clock_in_data


def trusty_sleep(sleep_time):
    """Sleep reliably for the given number of seconds."""
    print(f"Sleeping for {sleep_time} seconds")
    start = time.time()
    while time.time() - start < sleep_time:
        time.sleep(sleep_time - (time.time() - start))


struct_time = time.localtime(time.time())
# Structured time
now_time = time.strftime("%Y%m%d", struct_time)
# now_time = 20220131
print("Current date:", now_time)
url = f"https://api.apihubs.cn/holiday/get?field=workday&date={now_time}&workday=1&cn=1&size=31"
print("Workday lookup url:", url)
print("Sending request ----->>>>>>")
request_result = requests.get(url)
print("Request returned ----->>>>>>", request_result)
# Request succeeded
if request_result.status_code == 200:
    # Parse the JSON
    is_work = json.loads(request_result.text)
    # Data fetched successfully
    if is_work["code"] == 0:
        data_list = is_work["data"]["list"][0] if is_work["data"]["list"] else []
        IS_WORK_DAY = data_list["workday"] if data_list else 0
        print("Is today a workday (1: yes, 0: no):", IS_WORK_DAY)
        if IS_WORK_DAY == 1:
            header = {
                "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.55 Safari/537.36",
                "Accept": "*/*",
                "Accept-Encoding": "gzip, deflate",
                "Accept-Language": "zh-CN,zh;q=0.9",
            }

            print("Starting OA login ----->>>")
            login_form_data = {
                "loginid": LOGIN_NAME,
                "langid": "7",
            }
            login_form_url = "http://oa.njhgroup.cn/api/hrm/login/getLoginForm"
            login_form_result = requests.post(
                login_form_url, headers=header, data=login_form_data
            )
            print(requests.utils.dict_from_cookiejar(login_form_result.cookies))
            randcode = json.loads(login_form_result.text)["qrcode"]["loginkey"]
            print("Got randcode ---->>>", randcode)
            login_data = {
                "loginid": LOGIN_NAME,
                "userpassword": LOGIN_PASSWD,
                "logintype": "1",
                "isie": "false",
            }
            login_cookie = login_form_result.cookies
            # Perform the OA login
            oa_login_url = "http://oa.njhgroup.cn/api/hrm/login/checkLogin"
            login_result = requests.post(
                oa_login_url,
                headers=header,
                data=login_data,
                cookies=requests.utils.dict_from_cookiejar(login_form_result.cookies),
            )
            print(requests.utils.dict_from_cookiejar(login_result.cookies))
            print(login_result.text)
            print("OA login finished ----->>>", login_result.text)

            # Sleep for 10 seconds
            time.sleep(10)
            print("OA refreshing randcode ----->>>")
            ts = int(round(time.time() * 1000))
            refresh_code_url = f"http://oa.njhgroup.cn/rsa/weaver.rsa.GetRsaInfo?ts={ts}"
            refresh_code_result = requests.get(refresh_code_url, headers=header)
            print(refresh_code_result.cookies)
            print("OA randcode refresh finished ----->>>")

            # Assemble the final cookie
            clock_in_cookie = requests.utils.dict_from_cookiejar(
                login_form_result.cookies)
            clock_in_cookie.update(
                requests.utils.dict_from_cookiejar(refresh_code_result.cookies)
            )
            clock_in_cookie.update(
                requests.utils.dict_from_cookiejar(login_result.cookies))

            print("Checking whether today is a leave day ----->>>")
            check_is_work_url = (
                "http://oa.njhgroup.cn/api/kq/myattendance/getHrmKQMonthReportInfo"
            )
            check_is_work_result = requests.post(
                check_is_work_url,
                headers=header,
                data={
                    "typevalue": time.strftime("%Y-%m", struct_time),
                    "loaddata": "1",
                    "type": "2",
                },
                cookies=clock_in_cookie,
            ).text
            # Parse the JSON
            is_work = json.loads(check_is_work_result)
            print("Leave check finished ----->>>")
            print(f"{struct_time.tm_mday}")
            isWorkDay = is_work["result"][f"{struct_time.tm_mday}"]["isWorkDay"]
            workflow = len(is_work["result"][f"{struct_time.tm_mday}"]["workflow"])
            print(f"Is today a workday: {isWorkDay}; leave requests today: {workflow}")
            needSign = False
            if isWorkDay and workflow <= 0:
                needSign = True
            else:
                print("On leave today, skipping clock-in!")
                sys.exit()

            check_is_need_sign_url = "http://oa.njhgroup.cn/api/hrm/kq/attendanceButton/getButtons"
            check_is_need_sign_result = requests.post(
                check_is_need_sign_url,
                headers=header,
                cookies=clock_in_cookie,
            ).text
            check_is_need_sign_timeline = json.loads(check_is_need_sign_result)["timeline"]
            # 0 means no clock-in needed
            need_sign = 0
            # An empty sign_time also means no clock-in needed
            sign_time = ""
            if struct_time.tm_hour < 9:
                # Morning: use the first timeline entry (morning punch)
                need_sign = check_is_need_sign_timeline[0]["needSign"]
                if "signTime" in check_is_need_sign_timeline[0]:
                    sign_time = check_is_need_sign_timeline[0]["signTime"]
                print(f"Morning punch status: ---{need_sign} ----- {sign_time} --- {len(sign_time)}")
            else:
                # Afternoon: use the second timeline entry (afternoon punch)
                need_sign = check_is_need_sign_timeline[1]["needSign"]
                if "signTime" in check_is_need_sign_timeline[1]:
                    sign_time = check_is_need_sign_timeline[1]["signTime"]
                print(f"Afternoon punch status: ---{need_sign} ----- {sign_time} --- {len(sign_time)}")
            # Clock-in allowed
            if need_sign == "1" and len(sign_time) == 0:
                needSign = True
            else:
                print("Already clocked in, nothing to do!")
                sys.exit()
            # If we have not clocked in yet, continue
            if needSign:
                # If it is the afternoon, sleep for a while before clocking in
                if struct_time.tm_hour > 9:
                    # Sleep 5-15 minutes
                    trusty_sleep(random.randint(300, 900))
                print("OA clock-in starting ----->>>")
                # clock_in_cookie["__randcode__"] =
                # Refresh the time
                struct_time = time.localtime(time.time())
                sign_time = time.strftime("%H:%M:%S", struct_time)

                clock_in_url = "http://oa.njhgroup.cn/api/hrm/kq/attendanceButton/punchButton"
                print(
                    "OA clock-in finished ----->>>",
                    requests.post(
                        clock_in_url,
                        headers=header,
                        data=get_clock_in_data(struct_time),
                        cookies=clock_in_cookie,
                    ).text,
                )

@@ -0,0 +1,21 @@

# urllib3 tutorial: https://urllib3.readthedocs.io/en/latest/user-guide.html
# selenium tutorial: https://www.selenium.dev/zh-cn/documentation/webdriver/getting_started/
# Download the latest chromedriver: https://chromedriver.storage.googleapis.com/index.html
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Let webdriver-manager handle the driver binary
# (it downloads chromedriver to /Users/renmeng/.wdm/drivers/chromedriver)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# options = ChromeOptions()
# driver = webdriver.Chrome(options=options)
# Open the page
driver.get("https://www.lagou.com/jobs/list_运维?city=%E6%88%90%E9%83%BD&cl=false&fromSearch=true&labelWords=&suginput=")
wait = WebDriverWait(driver, 10)
# Print the page source
print(driver.page_source)
driver.quit()
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,183 @@
|
||||||
|
import base64
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
import hashlib
|
||||||
|
from urllib.parse import urljoin
|
||||||
|
|
||||||
|
import execjs
|
||||||
|
import requests
|
||||||
|
import urllib3
|
||||||
|
|
||||||
|
|
||||||
|
def des_js(js_str):
|
||||||
|
keys = re.findall(f'DES\.encrypt\((\w+)\s?,\s?(\w+)\s?,\s?(\w+)\)', js_str)
|
||||||
|
text_name, key_name, iv_name = keys[0]
|
||||||
|
key = re.findall(f'const\s+?{key_name}\s+?=.*?"(.*?)"', js_str)[0]
|
||||||
|
iv = re.findall(f'const\s+?{iv_name}\s+?=.*?"(.*?)"', js_str)[0]
|
||||||
|
appid_name = re.findall("appId:.*?(\w+),", js_str)[0]
|
||||||
|
appId = re.findall(f"var\s?{appid_name}\s?=.*?'(.*?)'", js_str)[0]
|
||||||
|
param_name = re.findall("data:\s?\{\s?(\w+):.*?}", js_str)[0]
|
||||||
|
|
||||||
|
des_keys = re.findall(f'DES\.decrypt\(data,\s?(\w+),\s?(\w+)\);', js_str)
|
||||||
|
des_dec_key_name, des_dec_iv_name = des_keys[0]
|
||||||
|
|
||||||
|
des_dec_key = re.findall(f'const\s+?{des_dec_key_name}\s+?=.*?"(.*?)"', js_str)[0]
|
||||||
|
des_dec_iv = re.findall(f'const\s+?{des_dec_iv_name}\s+?=.*?"(.*?)"', js_str)[0]
|
||||||
|
|
||||||
|
aes_keys = re.findall(f'AES\.decrypt\(data,\s?(\w+),\s?(\w+)\);', js_str)
|
||||||
|
aes_dec_key_name, aes_dec_iv_name = aes_keys[0]
|
||||||
|
aes_dec_key = re.findall(f'const\s+?{aes_dec_key_name}\s+?=.*?"(.*?)"', js_str)[0]
|
||||||
|
aes_dec_iv = re.findall(f'const\s+?{aes_dec_iv_name}\s+?=.*?"(.*?)"', js_str)[0]
|
||||||
|
|
||||||
|
method = "GETDAYDATA"
|
||||||
|
obj = {"city": "济南", "month": '201702'}
|
||||||
|
timestamp = int(time.time() * 1000)
|
||||||
|
clienttype = 'WEB'
|
||||||
|
form_data = {
|
||||||
|
"appId": appId,
|
||||||
|
"method": method,
|
||||||
|
"timestamp": timestamp,
|
||||||
|
"clienttype": clienttype,
|
||||||
|
"object": obj,
|
||||||
|
"secret": hashlib.md5(
|
||||||
|
f'{appId}{method}{timestamp}{clienttype}{str(obj)}'.replace("'", '"').replace(' ', '').encode(
|
||||||
|
'utf-8')).hexdigest()
|
||||||
|
}
|
||||||
|
|
||||||
|
base64_d = base64.b64encode(str(form_data).replace("'", '"').replace(' ', '').encode('utf-8')).decode('utf-8')
|
||||||
|
|
||||||
|
result = js.call("des_encrypt", base64_d, key, iv)
|
||||||
|
print(data := {param_name: result})
|
||||||
|
|
||||||
|
url = "https://www.aqistudy.cn/historydata/api/historyapi.php"
|
||||||
|
|
||||||
|
resp = requests.post(url=url, headers=headers, data=data, verify=False)
|
||||||
|
|
||||||
|
print(resp.text)
|
||||||
|
dec_data = js.call('dec_func', resp.text, des_dec_key, des_dec_iv, aes_dec_key, aes_dec_iv)
|
||||||
|
print(json.loads(dec_data))
|
||||||
|
|
||||||
|
|
||||||
|
def aes_js(js_str):
|
||||||
|
keys = re.findall(f'AES\.encrypt\((\w+)\s?,\s?(\w+)\s?,\s?(\w+)\)', js_str)
|
||||||
|
text_name, key_name, iv_name = keys[1]
|
||||||
|
key = re.findall(f'const\s+?{key_name}\s+?=.*?"(.*?)"', js_str)[0]
|
||||||
|
iv = re.findall(f'const\s+?{iv_name}\s+?=.*?"(.*?)"', js_str)[0]
|
||||||
|
appid_name = re.findall("appId:.*?(\w+),", js_str)[0]
|
||||||
|
appId = re.findall(f"var\s?{appid_name}\s?=.*?'(.*?)'", js_str)[0]
|
||||||
|
param_name = re.findall("data:\s?\{\s?(\w+):.*?}", js_str)[0]
|
||||||
|
|
||||||
|
des_keys = re.findall(f'DES\.decrypt\(data,\s?(\w+),\s?(\w+)\);', js_str)
|
||||||
|
des_dec_key_name, des_dec_iv_name = des_keys[0]
|
||||||
|
|
||||||
|
des_dec_key = re.findall(f'const\s+?{des_dec_key_name}\s+?=.*?"(.*?)"', js_str)[0]
|
||||||
|
des_dec_iv = re.findall(f'const\s+?{des_dec_iv_name}\s+?=.*?"(.*?)"', js_str)[0]
|
||||||
|
|
||||||
|
aes_keys = re.findall(f'AES\.decrypt\(data,\s?(\w+),\s?(\w+)\);', js_str)
|
||||||
|
aes_dec_key_name, aes_dec_iv_name = aes_keys[0]
|
||||||
|
aes_dec_key = re.findall(f'const\s+?{aes_dec_key_name}\s+?=.*?"(.*?)"', js_str)[0]
|
||||||
|
aes_dec_iv = re.findall(f'const\s+?{aes_dec_iv_name}\s+?=.*?"(.*?)"', js_str)[0]
|
||||||
|
|
||||||
|
method = "GETDAYDATA"
|
||||||
|
obj = {"city": "济南", "month": '201702'}
|
||||||
|
timestamp = int(time.time() * 1000)
|
||||||
|
clienttype = 'WEB'
|
||||||
|
form_data = {
|
||||||
|
"appId": appId,
|
||||||
|
"method": method,
|
||||||
|
"timestamp": timestamp,
|
||||||
|
"clienttype": clienttype,
|
||||||
|
"object": obj,
|
||||||
|
"secret": hashlib.md5(
|
||||||
|
f'{appId}{method}{timestamp}{clienttype}{str(obj)}'.replace("'", '"').replace(' ', '').encode(
|
||||||
|
'utf-8')).hexdigest()
|
||||||
|
}
|
||||||
|
|
||||||
|
base64_d = base64.b64encode(str(form_data).replace("'", '"').replace(' ', '').encode('utf-8')).decode('utf-8')
|
||||||
|
|
||||||
|
result = js.call("aes_encrypt", base64_d, key, iv)
|
||||||
|
print(data := {param_name: result})
|
||||||
|
|
||||||
|
url = "https://www.aqistudy.cn/historydata/api/historyapi.php"
|
||||||
|
|
||||||
|
resp = requests.post(url=url, headers=headers, data=data, verify=False)
|
||||||
|
|
||||||
|
dec_data = js.call('dec_func', resp.text, des_dec_key, des_dec_iv, aes_dec_key, aes_dec_iv)
|
||||||
|
print(json.loads(dec_data))
|
||||||
|
|
||||||
|
|
||||||
|
def bs64_js(js_str):
    appid_name = re.findall(r"appId:.*?(\w+),", js_str)[0]
    appId = re.findall(rf"var\s?{appid_name}\s?=.*?'(.*?)'", js_str)[0]
    param_name = re.findall(r"data:\s?\{\s?(\w+):.*?}", js_str)[0]

    method = "GETDAYDATA"
    obj = {"city": "济南", "month": '202206'}
    timestamp = int(time.time() * 1000)
    clienttype = 'WEB'
    form_data = {
        "appId": appId,
        "method": method,
        "timestamp": timestamp,
        "clienttype": clienttype,
        "object": obj,
        "secret": hashlib.md5(
            f'{appId}{method}{timestamp}{clienttype}{str(obj)}'.replace("'", '"').replace(' ', '').encode(
                'utf-8')).hexdigest()
    }

    # This variant sends the payload base64-encoded only, without AES-encrypting it
    base64_d = base64.b64encode(str(form_data).replace("'", '"').replace(' ', '').encode('utf-8')).decode('utf-8')
    print(data := {param_name: base64_d})

    url = "https://www.aqistudy.cn/historydata/api/historyapi.php"
    resp = requests.post(url=url, headers=headers, data=data, verify=False)

    # Extract DES and AES decryption keys/ivs from the JS, then decode the response
    des_keys = re.findall(r'DES\.decrypt\(data,\s?(\w+),\s?(\w+)\);', js_str)
    des_dec_key_name, des_dec_iv_name = des_keys[0]
    des_dec_key = re.findall(rf'const\s+?{des_dec_key_name}\s?=.*?"(.*?)"', js_str)[0]
    des_dec_iv = re.findall(rf'const\s+?{des_dec_iv_name}\s?=.*?"(.*?)"', js_str)[0]

    aes_keys = re.findall(r'AES\.decrypt\(data,\s?(\w+),\s?(\w+)\);', js_str)
    aes_dec_key_name, aes_dec_iv_name = aes_keys[0]
    aes_dec_key = re.findall(rf'const\s+?{aes_dec_key_name}\s?=.*?"(.*?)"', js_str)[0]
    aes_dec_iv = re.findall(rf'const\s+?{aes_dec_iv_name}\s?=.*?"(.*?)"', js_str)[0]

    dec_data = js.call('dec_func', resp.text, des_dec_key, des_dec_iv, aes_dec_key, aes_dec_iv)
    print(json.loads(dec_data))


if __name__ == '__main__':
    url = "https://www.aqistudy.cn/historydata/daydata.php?city=%E4%BF%9D%E5%AE%9A&month=201910"
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.192 Safari/537.36",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Origin": "https://www.aqistudy.cn",
        "Referer": "https://www.aqistudy.cn/historydata/daydata.php?city=%E4%BF%9D%E5%AE%9A&month=202009",
    }
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    req = requests.get(url, headers=headers, verify=False)
    # Locate the obfuscated JS bundle referenced by the page
    js_url = re.findall(r'src="(resource/js/.*?\.min\.js\?v=\d+)"', req.text)[0]
    js_req = requests.get(url=urljoin(url, js_url), headers=headers, verify=False)
    print(js_req.url)

    # Local helper JS contains a 'jscode_pattern' placeholder that gets
    # replaced with the downloaded bundle before compiling
    js_code = open('/Users/renmeng/work_space/python_work/qnloft-get-web-everything/爬虫/aqistudy网站/airHistory_2108.js', 'r', encoding='utf-8').read()
    js_bs64_bs64_code = js_req.text[5:-2]
    js_code = js_code.replace('jscode_pattern', js_bs64_bs64_code)
    js = execjs.compile(js_code)
    res = js.call("get_full_js", js_bs64_bs64_code)
    # print(res)

    # The marker count tells how many base64 layers wrap the real JS,
    # which in turn decides which request/response scheme is in use
    type_len = len(re.findall("dweklxde", res))
    print(type_len)
    base64_str = re.findall("'(.*?)'", res)[0]
    if type_len == 2:
        # double base64 -> DES-encrypted request flow
        target_js = base64.b64decode(base64.b64decode(base64_str)).decode('utf-8')
        des_js(js_str=target_js)
    elif type_len == 1:
        # single base64 -> AES-encrypted request flow
        target_js = base64.b64decode(base64_str).decode('utf-8')
        aes_js(js_str=target_js)
    elif type_len == 0:
        # no wrapping -> plain base64 request flow
        bs64_js(js_str=res)
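The `secret` field in the signed payloads above can be reproduced offline, which is handy for checking the normalization logic without spinning up the JS runtime. A minimal sketch, assuming a placeholder `appId` (the real value is extracted from the deobfuscated bundle):

import hashlib
import time

appId = 'your_app_id'  # placeholder; scraped from the JS bundle in practice
method = 'GETDAYDATA'
obj = {"city": "济南", "month": "201702"}
timestamp = int(time.time() * 1000)
clienttype = 'WEB'

# Single quotes become double quotes and spaces are stripped so the Python
# dict repr matches the JSON string the server-side hash expects.
raw = f'{appId}{method}{timestamp}{clienttype}{str(obj)}'.replace("'", '"').replace(' ', '')
secret = hashlib.md5(raw.encode('utf-8')).hexdigest()
print(secret)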
@ -0,0 +1,11 @@

# Proxy server configuration
proxies = {
    'http': 'http://proxy_server:port',
    'https': 'https://proxy_server:port'
}


# API hook for fetching a fresh proxy
def get_api():
    # The proxy-pool API call goes here;
    # return the proxy server address and port.
    return 'proxy_api_server:port'
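A minimal usage sketch for this config, assuming `get_api` returns a plain `host:port` string (the shape is an assumption; the stub above returns a placeholder):

import requests

from Proxy_info import get_api

proxy = get_api()  # assumed shape: '1.2.3.4:8080'
proxies = {'http': f'http://{proxy}', 'https': f'http://{proxy}'}
# Route a test request through the proxy to confirm the exit IP changed
resp = requests.get('https://httpbin.org/ip', proxies=proxies, timeout=10)
print(resp.json())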
@ -0,0 +1,82 @@

'''
Alipay bill parsing

cchardet detects encodings more accurately than chardet
'''
import codecs
import time
from pathlib import Path

import cchardet as chardet
import pandas as pd


def detection_file_encoding(file_name):  # auto-detect the file encoding
    with open(file_name, 'rb') as file:
        rawdata = file.read()
    # the detection result carries both the encoding and a confidence score
    result = chardet.detect(rawdata)
    encoding = result['encoding']
    confidence = result['confidence']
    print(f"File [{file_name}] encoding: {encoding}, confidence: {confidence:.2f}")
    return encoding


def encoding_conversion(source_file, target_file, source_encoding, target_encoding):  # convert file encoding
    if Path(target_file).exists():
        return detection_file_encoding(target_file)

    # open source and target with codecs and copy line by line, transcoding on the way
    with codecs.open(source_file, 'r', encoding=source_encoding) as source:
        with codecs.open(target_file, 'w', encoding=target_encoding) as target:
            for line in source:
                target.write(line)

    encoding = detection_file_encoding(target_file)
    print(f"File converted from {source_encoding} to {encoding}")
    return encoding


def reset_account_name(name):
    if "余额宝" in name or '滴滴出行' in name:
        return "支付宝"
    elif "信用卡" in name:
        return "信用卡"
    elif "借记卡" in name:
        return "现金"
    # fall back to the raw channel name instead of returning None
    return name


class ALiPay:
    def __init__(self, csv_file):
        # detect the source file's encoding
        self.encoding = detection_file_encoding(csv_file)
        rename = csv_file.split("-")[1:3]
        if len(rename) > 0:
            rename = "_".join(rename)
        else:
            rename = int(time.time())
        self.target_file = f'/Users/renmeng/Downloads/支付宝交易账单-{rename}.csv'  # target file name
        # write a UTF-8 copy, named with the date pulled from the source file name
        self.encoding = encoding_conversion(source_file=csv_file, target_file=self.target_file,
                                            source_encoding=self.encoding,
                                            target_encoding="utf-8")

    def get_ali_pay_bill(self):
        # skiprows skips the preamble lines before the real header row
        df = pd.read_csv(self.target_file, encoding=self.encoding, skiprows=2)
        df = df.drop(index=df[df['交易状态'] != '成功'].index)
        # parse the creation-time column into datetimes
        df['创建时间'] = pd.to_datetime(df['创建时间'])
        df['账户'] = df['支付渠道'].apply(reset_account_name)
        # format the date column as '%Y-%m-%d'
        df['创建时间'] = df['创建时间'].dt.strftime('%Y-%m-%d')
        # net amount = order amount - cumulative refunds - discounts
        df['金额'] = df['订单金额(元)'].apply(lambda x: float(x) if x else 0) \
            - df['累计退款总额(元)'].apply(lambda x: float(x) if x else 0) \
            - df['优惠(元)'].apply(lambda x: 0 if not x.strip() else float(x))
        return df
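A quick usage sketch, assuming a hypothetical path to a downloaded Alipay export; the class transcodes it to UTF-8 and returns the cleaned DataFrame:

# hypothetical file name; real exports follow the account-date-id-... pattern
bill = ALiPay('/Users/renmeng/Downloads/2088000000000000-20230918-demo-买入交易.csv')
df = bill.get_ali_pay_bill()
print(df[['创建时间', '账户', '金额']].head())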
@ -0,0 +1,48 @@

import pandas as pd

from 爬虫.理财记账.ali_pay import ALiPay
from 爬虫.理财记账.zs_bank import ZsBank

# show all columns
pd.set_option('display.max_columns', None)
# show all rows
pd.set_option('display.max_rows', None)
# do not wrap wide output
pd.set_option('expand_frame_repr', False)
# no column-width limit
pd.set_option('display.max_colwidth', None)


class BillHandle:
    def __init__(self, ali_pay_file, zs_bank_file, sheet_name):
        self.ali_pay_file, self.zs_bank_file, self.sheet_name = ali_pay_file, zs_bank_file, sheet_name
        # target ledger schema; both bill sources are normalized into these columns
        self.df = pd.DataFrame(
            columns=['交易类型', '日期', '分类', '子分类', '账户1', '账户2', '金额', '成员', '商家', '项目', '备注'])

    def __init_ali_bill(self):
        ali_pay_data = ALiPay(self.ali_pay_file).get_ali_pay_bill()
        ali_pay_data['交易类型'] = '支出'
        ali_pay_data['日期'] = ali_pay_data['创建时间']
        ali_pay_data['账户1'] = ali_pay_data['账户']
        ali_pay_data['备注'] = ali_pay_data['商品名称'] + "_" + ali_pay_data['对方名称']
        self.df = pd.concat([self.df, ali_pay_data])

    def __init_zs_bank_bill(self):
        zs_bank_data = ZsBank(self.zs_bank_file, self.sheet_name).get_zs_bank_bill()
        zs_bank_data['交易类型'] = '支出'
        zs_bank_data['账户1'] = '信用卡'
        zs_bank_data['备注'] = zs_bank_data['来源'] + "_" + zs_bank_data['详情']
        self.df = pd.concat([self.df, zs_bank_data])

    def bill_opt(self):
        self.__init_ali_bill()
        self.__init_zs_bank_bill()
        df = self.df
        # newest first; see the schema-trim sketch after this file
        df = df.sort_values(by='日期', ascending=False).reset_index(drop=True)
        print(df)


if __name__ == '__main__':
    ali_pay_file = '/Users/renmeng/Downloads/2088102231652088-20230918-108990157-买入交易.csv'
    zs_bank_file = '/Users/renmeng/Downloads/招商银行对账单.xlsx'
    zs_bank_sheet = '8-9月对账单'
    BillHandle(ali_pay_file, zs_bank_file, zs_bank_sheet).bill_opt()
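One design note: `pd.concat` keeps the union of columns, so source-specific columns from the two bills ride along in `self.df`. If only the eleven ledger columns are wanted, a trim step could be added at the end of `bill_opt` (a sketch, not part of the original):

ledger_cols = ['交易类型', '日期', '分类', '子分类', '账户1', '账户2',
               '金额', '成员', '商家', '项目', '备注']
df = df[ledger_cols]  # drop the pass-through source columns before printing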
@ -0,0 +1,44 @@

'''
China Merchants Bank (招商银行) statement parsing
'''
from datetime import datetime

import pandas as pd


def reset_date(date):
    # the statement stores dates as MMDD, so prepend the current year
    current_year = datetime.now().year
    # zero-pad to four digits so strptime sees a full YYYYMMDD string
    date_str = str(current_year) + str(date).zfill(4)
    input_date = datetime.strptime(date_str, '%Y%m%d')
    # format back to the target 'YYYY-MM-DD' form
    return input_date.strftime('%Y-%m-%d')


def pay_source(details):
    # the payment channel is the prefix of the detail string, before the first '-'
    res = ""
    source = details.split('-')[0]
    if source == '京东支付':
        res = '京东'
    elif source == '财付通':
        res = '微信'
    elif source == '支付宝':
        res = '支付宝'
    return res


class ZsBank:
    def __init__(self, bill_file, sheet_name):
        self.df = pd.read_excel(bill_file, sheet_name=sheet_name)

    def get_zs_bank_bill(self):
        # strip thousands separators before casting the amount column to float
        self.df['金额'] = self.df['金额'].astype(str).str.replace(',', '', regex=True).astype(float)
        total_sum = self.df['金额'].sum()
        print(total_sum)
        self.df['日期'] = self.df['日期'].apply(reset_date)
        self.df['来源'] = self.df['详情'].apply(pay_source)
        return self.df
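A quick check of `reset_date`, assuming the statement's 日期 column stores dates as MMDD integers; the output year depends on when it runs:

print(reset_date(805))   # e.g. '2024-08-05' when run in 2024
print(reset_date(1231))  # e.g. '2024-12-31'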