Initialize the project

rm 2023-11-26 20:31:18 +08:00
parent 1e78c893bf
commit 50fa6ae8b8
21 changed files with 4188 additions and 0 deletions

31
Job/README.md Normal file

@@ -0,0 +1,31 @@
## Job-Site Data Scraping
### Reference projects:
#### Lagou:
> Simple Python scraping examples covering proxy sites, 58 Daojia, house-price sites, East Money, ITOrange, postal codes, Kangmei TCM, Lagou, Maoyan, investment and financing data, China Judgements Online, Ziroom, Baike, China house prices, NetEase Cloud Music, Qunar, and Autohome
- [spider-project](https://github.com/tanjunchen/spider-project)
> A crawler for Lagou job listings, for learning and reference only! Any commercial use is at your own risk
- [lagou_crawler](https://github.com/DE009/lagou_crawler)
#### BOSS Zhipin
> Crawler reverse-engineering case studies, completed so far: NetEase Yidun | WeChat mini-program decompilation and reversing (Baida Xingxi) | Tonghuashun | RPC decryption | Jiasule (加速乐) | GeeTest slider captcha | Juliang Suanshu | BOSS Zhipin | Qichacha | China Minmetals | QQ Music | industrial-policy big-data platform | Qizhidao | Xueqiu (acw_sc__v2) | 1688 | Qimai Data | whggzy | Qiming Tech | mohurd | Endata | OKLink
- [spider_reverse](https://github.com/ChenZixinn/spider_reverse)
> A collection of Python crawler projects from basics to JS reverse engineering, with sections on fundamentals, automation, advanced topics, and captchas. The cases cover major sites (xhs, douyin, weibo, ins, boss, job, jd...), teaching crawling and anti-crawling, automation, and captcha handling
- [crawlProject](https://github.com/xishandong/crawlProject)
#### Zhilian Zhaopin
#### Automated resume submission
> Submits your resume on Lagou automatically
- [Lagou auto-submit](https://github.com/BeammNotFound/get-jobs-lagou)
- [BOSS Zhipin auto-submit](https://github.com/BeammNotFound/get-jobs-boss)
- [51job auto-submit](https://github.com/BeammNotFound/get-jobs-51job)


@@ -0,0 +1,28 @@
[URL for all search filter conditions](https://www.zhipin.com/wapi/zpgeek/search/job/condition.json)
[URL for all cities nationwide](https://www.zhipin.com/wapi/zpCommon/data/cityGroup.json)
[Main job-search URL; takes the following parameters](https://www.zhipin.com/wapi/zpgeek/mobile/search/joblist.json)
- scene: 1
- query: Java
- city: 101010100
- experience:
- payType:
- partTime:
- degree:
- industry:
- scale:
- stage:
- position:
- jobType:
- salary:
- multiBusinessDistrict:
- multiSubway:
- page: 1
- pageSize: 30
[All keywords related to a search term](https://www.zhipin.com/wapi/zpgeek/search/job/related/word.json?query=关键词)
[District info for a given city](https://www.zhipin.com/wapi/zpgeek/businessDistrict.json?cityCode=101010100)
[Subway line info, needed when filtering by subway line](https://www.zhipin.com/wapi/zpCommon/data/getSubwayByCity?cityCode=101010100)
[Company-industry filter data](https://www.zhipin.com/wapi/zpCommon/data/industry.json)
[Job-type filter data](https://www.zhipin.com/wapi/zpCommon/data/getCityShowPosition)
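
A minimal sketch (illustrative, not part of this commit) of calling the mobile search endpoint above with `requests`. In practice BOSS Zhipin also expects a valid `__zp_stoken__` cookie (generated by the crawler in this commit); without it the request is usually redirected to a security-check page:

```python
import requests

# Hypothetical smoke test for the joblist.json endpoint listed above.
params = {
    'scene': 1,
    'query': 'Java',
    'city': 101010100,  # Beijing, from cityCode.json
    'page': 1,
    'pageSize': 30,
}
resp = requests.get(
    'https://www.zhipin.com/wapi/zpgeek/mobile/search/joblist.json',
    params=params,
    headers={'User-Agent': 'Mozilla/5.0'},
    timeout=10,
)
# Without the anti-bot cookie this often redirects to a verify page, so check first.
if 'application/json' in resp.headers.get('Content-Type', ''):
    print((resp.json().get('zpData') or {}).get('html'))
else:
    print('Redirected to security check:', resp.url)
```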


@@ -0,0 +1,367 @@
import json
import time
from csv import DictWriter
from itertools import islice
from typing import Literal, Iterator, Union
from urllib.parse import urlparse, parse_qs
import execjs
import requests
from lxml import etree
from tqdm import tqdm
# IP proxy settings
# from Proxy_info import proxies, get_api
# from boss.点选 import BossSlide
# Type alias used for response handling
Accept = Literal['json', 'text', 'contents']
city_code_dict: dict = json.load(open('cityCode.json', 'r', encoding='utf-8'))
# Sleep interval in seconds
sleepTime = 5
class BossJob:
def __init__(self, js_name: str = '', proxy: dict = None):
        self.isFirst: bool = True  # whether this is the first visit
        self.js_name: str = js_name  # name of the security JS file
        self.seed: str = ''  # random seed
        self.ts: str = ''  # timestamp
        # API list
        self.apiList: list[str] = [
            'https://www.zhipin.com/wapi/zpgeek/mobile/search/joblist.json',  # mobile job-search API, params required
            'https://www.zhipin.com/job_detail/',  # job detail page, no params needed
            f'https://www.zhipin.com/web/common/security-js/{self.js_name}.js',  # dynamically loaded security JS
            'https://www.zhipin.com/wapi/zpgeek/search/joblist.json'  # web API
        ]
        # Request headers
        self.headers: dict = {
            'Accept': 'application/json, text/plain, */*',
        }
        self.cookies: dict = {}  # cookies
        self.js = execjs.compile(open('demo.js', 'r', encoding='utf-8').read())  # compiled security JS
        self.stop: bool = False  # stop flag for the mobile search
        self.checkEnd: str = ''  # marker to detect when the mobile search is exhausted
        self.proxy = proxy  # proxy settings
    # Send a GET request with up to 5 retries
def ajax_request(self, url: str, params: dict = None, cookies=None) -> requests.Response:
for _ in range(5):
try:
resp = requests.get(url, params=params, headers=self.headers, cookies=cookies, timeout=10,
# proxies=self.proxy
)
if resp.status_code == 200:
return resp
elif resp.status_code == 403:
                    print("===== Got HTTP 403: the IP has been banned =====")
self.show_pro(sleepTime)
self.change_ip()
continue
else:
print('HTTP Error: %s' % resp.status_code)
self.show_pro(sleepTime)
continue
except Exception as e:
                print('Error: ', e)
                print('URL: ', url)
self.show_pro(sleepTime)
continue
else:
            raise Exception('Failed to get a valid response after 5 attempts...')
    # Initialize the search: fetch the seed and timestamp
def first_get_seed(self, url: str, params: dict = None, isWeb: bool = False) -> Union[requests.Response, None]:
if self.isFirst:
resp = self.ajax_request(url=url, params=params)
self.isFirst = False
else:
resp = self.ajax_request(url=url, params=params, cookies=self.cookies)
        # Case: no redirect occurred and this is not the web API
        if resp.url == url and not isWeb:
            print(f'===== Cookie not updated this time: {resp.url} =====')
return resp
elif isWeb:
zpData = resp.json()['zpData']
self.seed = zpData['seed']
self.ts = zpData['ts']
name = zpData['name']
self.check_js(name)
return
        # Handle the redirect to the security-check page
parsedUrl = urlparse(resp.url)
generatedDict = parse_qs(parsedUrl.query)
self.seed = generatedDict['seed'][0]
self.ts = generatedDict['ts'][0]
name = generatedDict['name'][0]
self.check_js(name)
    # Search jobs via the mobile API
def search_job_mobile(self, position: str, city: str, startPage: int = 1) -> Iterator:
self.headers.update({
'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Mobile Safari/537.36',
})
city_code = city_code_dict.get(city)
if city_code:
params: dict = {
'city': city_code,
'query': position,
'page': startPage
}
            # Initialize the search
self.first_get_seed(self.apiList[1], params)
self.update_cookie()
continuations: list = [params]
            # Simulate pagination
while continuations:
continuation = continuations.pop()
resp = self.ajax_request('https://www.zhipin.com/wapi/zpgeek/mobile/search/joblist.json',
params=continuation, cookies=self.cookies)
html = resp.json().get('zpData', {}).get('html')
                # New postings exist and stop has not been signalled
                if html and self.stop is False:
                    print(f'===== Scraping {position}-{city}, page {continuation["page"]} =====')
continuation['page'] += 1
continuations.append(continuation)
                    # Yield the parsed rows
yield from self.parse_search_html(html)
                    # Throttle the crawl rate
                    self.show_pro(sleepTime)
                elif not html and self.stop is False:
                    print('===== IP banned =====')
continuations.append(continuation)
self.show_pro(sleepTime)
self.change_ip()
self.isFirst = True
self.first_get_seed(self.apiList[1], params)
self.update_cookie()
                else:
                    print(f'===== Scraping {position}-{city} stopped =====')
        else:
            raise Exception(f'Invalid city name: {city}')
    # Search via the web API
def search_job_web(self, position: str, city: str, startPage: int = 1, totalPage: int = 1) -> Iterator:
self.headers.update({
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
})
city_code = city_code_dict.get(city)
if city_code:
params = {
'query': position,
'city': city_code,
'page': 1,
'pageSize': '30',
'scene': '1',
}
            # First visit
self.isFirst = True
self.first_get_seed(self.apiList[3], params=params, isWeb=True)
page = startPage
            # Pagination control
while page <= totalPage:
params.update({'page': page})
self.update_cookie()
resp = self.ajax_request(self.apiList[3], params=params, cookies=self.cookies)
                print(f'===== Scraping {position}-{city}, page {page} =====')
                # On an access error, regenerate the cookie
if resp.json().get('code') == 37:
                    print(f'===== {resp.json().get("message")}, retrying =====')
zpData = resp.json()['zpData']
self.seed = zpData['seed']
self.ts = zpData['ts']
self.show_pro(sleepTime)
continue
                # IP banned: pause for a while
elif resp.json().get('code') == 5002:
print(f'====={resp.json().get("message")}=====')
self.show_pro(sleepTime)
self.change_ip()
self.isFirst = True
self.first_get_seed(self.apiList[3], params=params, isWeb=True)
continue
                # Got data
searchData = resp.json().get('zpData', {}).get('jobList')
if searchData:
page += 1
                    # Yield to the pipeline
yield from self.parse_search_data(searchData)
                    # Take a short break
self.show_pro(sleepTime)
                # Seed and timestamp for the next request
self.seed = resp.cookies['__zp_sseed__']
self.ts = resp.cookies['__zp_sts__']
        else:
            raise Exception(f'Invalid city name: {city}')
    # Fetch a detail page by encrypted job id
def get_job_details_by_id(self, encryptJobId: str) -> str:
url = self.apiList[1] + encryptJobId + '.html'
return self.get_job_details_bt_url(url)
    # Fetch a detail page by URL
def get_job_details_bt_url(self, url: str) -> str:
self.headers.update({
'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Mobile Safari/537.36',
})
resp = self.first_get_seed(url)
self.update_cookie()
if not resp:
resp = self.ajax_request(url, cookies=self.cookies)
tree = etree.HTML(resp.text)
texts = tree.xpath('//div[@class="detail-content"]//text()')
textList: list = [i.strip() for i in texts if i.strip()]
if not textList:
            print('===== Resetting cookie to fetch the detail page =====')
self.isFirst = True
self.show_pro(sleepTime)
return self.get_job_details_bt_url(url)
return '\n'.join(textList)
    # Save mobile search results to CSV
def save_job_list_to_csv(self, position: str, city: str, startPage: int = 1, saveCount: int = 100):
dataSet: Iterator = self.search_job_mobile(position, city, startPage)
header = ['job_name', 'detail_url', 'pay', 'company_name', 'requirement']
fp = open(f'mobile-{position}-{city}.csv', 'w', encoding='utf-8', newline='')
writer = DictWriter(fp, header)
writer.writeheader()
for job in islice(dataSet, saveCount):
job['requirement'] = ';'.join(job['requirement'])
writer.writerow(job)
    # Save web search results to CSV
def save_job_list_to_csv_web(self, position: str, city: str, startPage: int = 1, savePage: int = 2):
dataSet = self.search_job_web(position, city, startPage, savePage)
header = [
'jobName', 'encryptJobId', 'salaryDesc', 'jobLabels', 'skills', 'jobExperience',
'jobDegree', 'cityName', 'brandName', 'brandScaleName', 'welfareList', 'brandIndustry'
]
fp = open(f'web-{position}-{city}.csv', 'w', encoding='utf-8', newline='')
writer = DictWriter(fp, header)
writer.writeheader()
for job in dataSet:
job['jobLabels'] = ';'.join(job['jobLabels'])
job['skills'] = ';'.join(job['skills'])
job['welfareList'] = ';'.join(job['welfareList'])
writer.writerow(job)
    # Refresh the __zp_stoken__ cookie
def update_cookie(self):
print(f"seed === {self.seed} , ts === {self.ts}")
__zp = self.js.call('r', self.seed, self.ts)
self.cookies['__zp_stoken__'] = __zp
        print(f'===== Updated cookie: {self.cookies["__zp_stoken__"]}')
    # Parse the mobile search HTML
def parse_search_html(self, html: str) -> Iterator:
tree = etree.HTML(html)
li_list = tree.xpath('//li')
for num, li in enumerate(li_list, start=1):
if num == 1:
if self.checkEnd == li.xpath('./a/@href')[0]:
self.stop = True
return
self.checkEnd = li.xpath('./a/@href')[0]
yield {
'job_name': li.xpath('./a/div[1]/span[1]/text()')[0],
'detail_url': 'https://www.zhipin.com' + li.xpath('./a/@href')[0],
'pay': li.xpath('a/div[1]/span[2]/text()')[0],
'company_name': li.xpath('./a/div[2]/span[1]/text()')[0],
'requirement': [r.strip() for r in li.xpath('./a/div[3]//text()') if r.strip()]
}
    # Check whether the local security JS is up to date
def check_js(self, name):
if self.js_name != name:
self.js_name = name
            print(f"===== Current JS name -----> {name} =====")
resp = self.ajax_request(f'https://www.zhipin.com/web/common/security-js/{self.js_name}.js').text
resp_ = resp.split('module,')
resp = ''
            # Patch the 'module' token, otherwise the script is easily flagged as a crawler
for i in range(len(resp_)):
resp += resp_[i]
if i == 0:
resp += 'module_,'
if i == 1:
resp += 'module,'
with open('./jssss.js', 'w', encoding='utf-8') as f:
f.write(resp)
    # Parse web search results
    @staticmethod
    def parse_search_data(searchData: list[dict]) -> Iterator:
for job in searchData:
yield {
'jobName': job['jobName'],
'encryptJobId': job['encryptJobId'],
'salaryDesc': job['salaryDesc'],
'jobLabels': job['jobLabels'],
'skills': job['skills'],
'jobExperience': job['jobExperience'],
'jobDegree': job['jobDegree'],
'cityName': job['cityName'],
'brandName': job['brandName'],
'brandScaleName': job['brandScaleName'],
'welfareList': job['welfareList'],
'brandIndustry': job['brandIndustry']
}
@staticmethod
def change_ip():
# response = requests.get(
# 'https://www.zhipin.com/wapi/zpAntispam/v2/geetest/validate',
# params=self.__do_verify(),
# cookies=self.cookies,
# headers=self.headers,
# )
# print(response.text)
pass
    # Show a progress bar while resting
    @staticmethod
    def show_pro(t: int, isOpen: bool = True):
pass
# time.sleep(1)
# if isOpen:
# for _ in tqdm(
# range(t * 10),
# leave=False,
# colour='blue',
        #             desc='Waiting...',
# ascii='*-'
# ):
# time.sleep(0.1)
if __name__ == '__main__':
boss = BossJob('8955eed0')
    # Fetch a detail page by URL
    # detail = boss.get_job_details_bt_url('https://www.zhipin.com/job_detail/fc823036861698e10nF42NW0GVo~.html')
    # Fetch a detail page by encrypted job id
    # detail = boss.get_job_details_by_id('05988daddc5b6afc1n1-3du1FVZW')
    # print(detail)
    # Save results to CSV
    # boss.save_job_list_to_csv('python', '上海', saveCount=20)
    # boss.save_job_list_to_csv_web('python', '上海', 2, 2)
    # Web search
    items = boss.search_job_web('python', '上海', 1, 10)
    # Mobile search
# items = boss.search_job_mobile('web', '上海')
for item in items:
print(item)


@@ -0,0 +1,373 @@
{
"鞍山": 101070300,
"阿拉善盟": 101081200,
"安康": 101110700,
"阿克苏地区": 101131000,
"阿勒泰地区": 101131500,
"阿拉尔": 101131700,
"阿里地区": 101140700,
"安阳": 101180200,
"安庆": 101220600,
"安顺": 101260300,
"阿坝藏族羌族自治州": 101271900,
"澳门": 101330100,
"北京": 101010100,
"白城": 101060500,
"白山": 101060800,
"本溪": 101070500,
"包头": 101080200,
"巴彦淖尔": 101080800,
"保定": 101090200,
"宝鸡": 101110900,
"滨州": 101121100,
"巴音郭楞蒙古自治州": 101130400,
"博尔塔拉蒙古自治州": 101130500,
"北屯市": 101132100,
"白银": 101161000,
"蚌埠": 101220200,
"亳州": 101220900,
"毕节": 101260500,
"巴中": 101270900,
"保山": 101290300,
"百色": 101301000,
"北海": 101301300,
"白沙黎族自治县": 101311400,
"保亭黎族苗族自治县": 101311800,
"重庆": 101040100,
"长春": 101060100,
"朝阳": 101071200,
"赤峰": 101080500,
"承德": 101090400,
"沧州": 101090700,
"长治": 101100500,
"昌吉回族自治州": 101130300,
"昌都": 101140300,
"常州": 101191100,
"滁州": 101221000,
"池州": 101221500,
"长沙": 101250100,
"郴州": 101250500,
"常德": 101250600,
"成都": 101270100,
"潮州": 101281500,
"楚雄彝族自治州": 101291700,
"崇左": 101300200,
"澄迈": 101311200,
"昌江黎族自治县": 101311500,
"大庆": 101050800,
"大兴安岭地区": 101051300,
"大连": 101070200,
"丹东": 101070600,
"大同": 101100200,
"德州": 101120400,
"东营": 101121200,
"定西": 101160200,
"达州": 101270600,
"德阳": 101271700,
"东莞": 101281600,
"东沙群岛": 101282200,
"德宏傣族景颇族自治州": 101291300,
"迪庆藏族自治州": 101291500,
"大理白族自治州": 101291600,
"儋州": 101310400,
"东方": 101310900,
"定安": 101311000,
"鄂尔多斯": 101080600,
"鄂州": 101200300,
"恩施土家族苗族自治州": 101201300,
"抚顺": 101070400,
"阜新": 101070900,
"阜阳": 101220800,
"福州": 101230100,
"抚州": 101240400,
"佛山": 101280800,
"防城港": 101301400,
"果洛藏族自治州": 101150600,
"甘南藏族自治州": 101161400,
"固原": 101170400,
"赣州": 101240700,
"贵阳": 101260100,
"广安": 101270800,
"广元": 101271800,
"甘孜藏族自治州": 101272100,
"广州": 101280100,
"桂林": 101300500,
"贵港": 101300800,
"哈尔滨": 101050100,
"黑河": 101050600,
"鹤岗": 101051100,
"葫芦岛": 101071400,
"呼和浩特": 101080100,
"呼伦贝尔": 101080700,
"衡水": 101090800,
"邯郸": 101091000,
"汉中": 101110800,
"菏泽": 101121000,
"哈密": 101130900,
"和田地区": 101131300,
"海东": 101150200,
"海北藏族自治州": 101150300,
"黄南藏族自治州": 101150400,
"海南藏族自治州": 101150500,
"海西蒙古族藏族自治州": 101150800,
"鹤壁": 101181200,
"淮安": 101190900,
"黄冈": 101200500,
"黄石": 101200600,
"杭州": 101210100,
"湖州": 101210200,
"合肥": 101220100,
"淮南": 101220400,
"淮北": 101221100,
"黄山": 101221600,
"衡阳": 101250400,
"怀化": 101251200,
"惠州": 101280300,
"河源": 101281200,
"红河哈尼族彝族自治州": 101291200,
"贺州": 101300700,
"河池": 101301200,
"海口": 101310100,
"佳木斯": 101050400,
"鸡西": 101051000,
"吉林": 101060200,
"锦州": 101070700,
"晋中": 101100400,
"晋城": 101100600,
"济南": 101120100,
"济宁": 101120700,
"金昌": 101160600,
"酒泉": 101160800,
"嘉峪关": 101161200,
"焦作": 101181100,
"济源": 101181800,
"荆州": 101200800,
"荆门": 101201200,
"嘉兴": 101210300,
"金华": 101210900,
"九江": 101240200,
"吉安": 101240600,
"景德镇": 101240800,
"江门": 101281100,
"揭阳": 101281900,
"克拉玛依": 101130200,
"克孜勒苏柯尔克孜自治州": 101131100,
"喀什地区": 101131200,
"可克达拉市": 101132200,
"昆玉市": 101132300,
"开封": 101180800,
"昆明": 101290100,
"辽源": 101060600,
"辽阳": 101071000,
"廊坊": 101090600,
"临汾": 101100700,
"吕梁": 101101100,
"临沂": 101120900,
"聊城": 101121700,
"拉萨": 101140100,
"林芝": 101140400,
"兰州": 101160100,
"陇南": 101161100,
"临夏回族自治州": 101161300,
"洛阳": 101180900,
"漯河": 101181500,
"连云港": 101191000,
"丽水": 101210800,
"六安": 101221400,
"龙岩": 101230700,
"娄底": 101250800,
"六盘水": 101260600,
"泸州": 101271000,
"乐山": 101271400,
"凉山彝族自治州": 101272000,
"临沧": 101290800,
"丽江": 101290900,
"柳州": 101300300,
"来宾": 101300400,
"临高": 101311300,
"乐东黎族自治县": 101311600,
"陵水黎族自治县": 101311700,
"牡丹江": 101050300,
"马鞍山": 101220500,
"绵阳": 101270400,
"眉山": 101271500,
"梅州": 101280400,
"茂名": 101282000,
"那曲": 101140600,
"南阳": 101180700,
"南京": 101190100,
"南通": 101190500,
"宁波": 101210400,
"宁德": 101230300,
"南平": 101230900,
"南昌": 101240100,
"南充": 101270500,
"内江": 101271200,
"怒江傈僳族自治州": 101291400,
"南宁": 101300100,
"盘锦": 101071300,
"平凉": 101160300,
"平顶山": 101180500,
"濮阳": 101181300,
"莆田": 101230400,
"萍乡": 101240900,
"攀枝花": 101270200,
"普洱": 101290500,
"齐齐哈尔": 101050200,
"七台河": 101050900,
"秦皇岛": 101091100,
"青岛": 101120200,
"庆阳": 101160400,
"潜江": 101201500,
"衢州": 101211000,
"泉州": 101230500,
"黔东南苗族侗族自治州": 101260700,
"黔南布依族苗族自治州": 101260800,
"黔西南布依族苗族自治州": 101260900,
"清远": 101281300,
"曲靖": 101290200,
"钦州": 101301100,
"琼海": 101310600,
"琼中黎族苗族自治县": 101311900,
"日照": 101121500,
"日喀则": 101140200,
"上海": 101020100,
"绥化": 101050500,
"双鸭山": 101051200,
"四平": 101060300,
"松原": 101060700,
"沈阳": 101070100,
"石家庄": 101090100,
"朔州": 101100900,
"商洛": 101110600,
"石河子": 101131600,
"双河市": 101132400,
"山南": 101140500,
"石嘴山": 101170200,
"商丘": 101181000,
"三门峡": 101181700,
"苏州": 101190400,
"宿迁": 101191300,
"十堰": 101201000,
"随州": 101201100,
"神农架": 101201700,
"绍兴": 101210500,
"宿州": 101220700,
"三明": 101230800,
"上饶": 101240300,
"邵阳": 101250900,
"遂宁": 101270700,
"韶关": 101280200,
"汕头": 101280500,
"深圳": 101280600,
"汕尾": 101282100,
"三亚": 101310200,
"三沙": 101310300,
"天津": 101030100,
"通化": 101060400,
"铁岭": 101071100,
"通辽": 101080400,
"唐山": 101090500,
"太原": 101100100,
"铜川": 101111000,
"泰安": 101120800,
"吐鲁番": 101130800,
"塔城地区": 101131400,
"图木舒克": 101131800,
"铁门关": 101132000,
"天水": 101160900,
"泰州": 101191200,
"天门": 101201600,
"台州": 101210600,
"铜陵": 101221200,
"铜仁": 101260400,
"屯昌": 101311100,
"台湾": 101341100,
"乌海": 101080300,
"乌兰察布": 101080900,
"渭南": 101110500,
"潍坊": 101120600,
"威海": 101121300,
"乌鲁木齐": 101130100,
"五家渠": 101131900,
"武威": 101160500,
"吴忠": 101170300,
"无锡": 101190200,
"武汉": 101200100,
"温州": 101210700,
"芜湖": 101220300,
"文山壮族苗族自治州": 101291100,
"梧州": 101300600,
"五指山": 101310500,
"文昌": 101310700,
"万宁": 101310800,
"锡林郭勒盟": 101081000,
"兴安盟": 101081100,
"邢台": 101090900,
"忻州": 101101000,
"西安": 101110100,
"咸阳": 101110200,
"新星市": 101132500,
"西宁": 101150100,
"新乡": 101180300,
"许昌": 101180400,
"信阳": 101180600,
"徐州": 101190800,
"襄阳": 101200200,
"孝感": 101200400,
"咸宁": 101200700,
"仙桃": 101201400,
"宣城": 101221300,
"厦门": 101230200,
"新余": 101241000,
"湘潭": 101250200,
"湘西土家族苗族自治州": 101251400,
"西双版纳傣族自治州": 101291000,
"香港": 101320300,
"伊春": 101050700,
"延边朝鲜族自治州": 101060900,
"营口": 101070800,
"阳泉": 101100300,
"运城": 101100800,
"延安": 101110300,
"榆林": 101110400,
"烟台": 101120500,
"伊犁哈萨克自治州": 101130600,
"玉树藏族自治州": 101150700,
"银川": 101170100,
"扬州": 101190600,
"盐城": 101190700,
"宜昌": 101200900,
"宜春": 101240500,
"鹰潭": 101241100,
"益阳": 101250700,
"岳阳": 101251000,
"永州": 101251300,
"宜宾": 101271100,
"雅安": 101271600,
"云浮": 101281400,
"阳江": 101281800,
"玉溪": 101290400,
"玉林": 101300900,
"张家口": 101090300,
"淄博": 101120300,
"枣庄": 101121400,
"张掖": 101160700,
"中卫": 101170500,
"郑州": 101180100,
"周口": 101181400,
"驻马店": 101181600,
"镇江": 101190300,
"舟山": 101211100,
"漳州": 101230600,
"株洲": 101250300,
"张家界": 101251100,
"遵义": 101260200,
"自贡": 101270300,
"资阳": 101271300,
"珠海": 101280700,
"肇庆": 101280900,
"湛江": 101281000,
"中山": 101281700,
"昭通": 101290700
}

510
Job/com.boss_zhipin/demo.js Normal file

File diff suppressed because one or more lines are too long



@@ -0,0 +1,4 @@
import execjs
js = execjs.compile(open('demo.js', 'r', encoding='utf-8').read())
print(js.call('r', "1EAWUR51t3ADpSjeK5ywydCLIV2U4WaF93nocYiDXQs=", "1699709623728"))  # print the __zp_stoken__ value derived from a seed and timestamp

24
Job/com.lagou/README.md Normal file

@@ -0,0 +1,24 @@
# Fetching recruitment data
## Lagou usage notes
> Note: a Node.js environment is required; please look up how to install and configure it yourself~
1. First, change into the `Job` folder and install three packages:
```shell
npm install crypto-js
npm install jsencrypt
npm install get-random-values
```
Once they are installed, you can run the `la_gou.py` script.
2. If the following error appears during execution:
```shell
window is not defined
```
locate the `jsencrypt` install directory and add the following to `node_modules/jsencrypt/bin/jsencrypt.js`:
```javascript
var window = {};
var navigator = {};
```
then run the script again.
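
As a quick sanity check of the Node setup (a hypothetical snippet, not part of this commit), you can confirm that `execjs` is backed by Node and that the packages installed above resolve from the current directory:

```python
import execjs

# The runtime name should mention Node; lagou.js relies on Node's require().
print(execjs.get().name)

# crypto-js resolves from ./node_modules if the npm install above succeeded.
ctx = execjs.compile(
    "const CryptoJS = require('crypto-js');"
    "function md5(s) { return CryptoJS.MD5(s).toString(); }"
)
print(ctx.call('md5', 'hello'))
```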


@@ -0,0 +1,267 @@
import time
import json
import execjs
import requests
from lxml import etree
from urllib import parse
aes_key = ""
secret_key_value = ""
with open('lagou.js', 'r', encoding='utf-8') as f:
lagou_js = execjs.compile(f.read())
UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"
x_anit = {
"x-anit-forge-code": "0",
"x-anit-forge-token": None
}
global_cookies = {
    # - If the IP gets blacklisted, access starts requiring login, or you see "too frequent" warnings, fill in the post-login cookies
    # - All of the values below must be copied after logging in, especially JSESSIONID: a JSESSIONID obtained without logging in is invalid!
    # - Testing shows mainly these three cookies keep the session logged in: login, gate_login_token, _putrc
    # - There is also a JSESSIONID value, used mainly to obtain the request headers x-anit-forge-code and x-anit-forge-token
# "login": "true",
# "gate_login_token": "",
# "_putrc": "",
# "JSESSIONID": ""
}
def get_user_trace_token() -> str:
    # Fetch the user_trace_token cookie
json_url = "https://a.lagou.com/json"
headers = {
"Host": "a.lagou.com",
"Referer": "https://www.lagou.com/",
"User-Agent": UA
}
params = {
"lt": "trackshow",
"t": "ad",
"v": 0,
"dl": "https://www.lagou.com/",
"dr": "https://www.lagou.com",
"time": str(int(time.time() * 1000))
}
response = requests.get(url=json_url, headers=headers, params=params)
user_trace_token = response.cookies.get_dict()["user_trace_token"]
return user_trace_token
def get_lg_stoken(original_data: dict) -> str:
    # Fetch the __lg_stoken__ cookie
token_url = "https://www.lagou.com/wn/jobs"
token_headers = {
"Host": "www.lagou.com",
"Referer": "https://www.lagou.com/",
"User-Agent": UA
}
params = {
"kd": original_data["kd"],
"city": original_data["city"],
"fromSearch": original_data["fromSearch"],
"pn": original_data["pn"],
"px": original_data["px"]
}
token_response = requests.get(url=token_url, params=params, headers=token_headers, cookies=global_cookies,
allow_redirects=False)
if token_response.status_code != 302:
        raise Exception("Unexpected response while fetching the redirect URL! Check whether global_cookies already contains __lg_stoken__")
    # Get the 302 redirect target
security_check_url = token_response.headers["Location"]
print(f"security_check_url --->>> {security_check_url}")
if "login" in security_check_url:
        raise Exception("The IP has been thrown into the penalty box! Login required! Fill in the post-login cookies, or add your own proxy")
parse_result = parse.urlparse(security_check_url)
    # The URL's query string is the object to be encrypted
security_check_params = parse_result.query
    # The name parameter is the filename of the obfuscated JS
security_check_js_name = parse.parse_qs(security_check_params)["name"][0]
    # Request the obfuscated JS
js_url = "https://www.lagou.com/common-sec/dist/" + security_check_js_name + ".js"
js_headers = {
"Host": "www.lagou.com",
"Referer": security_check_url,
"User-Agent": UA
}
js_response = requests.get(url=js_url, headers=js_headers, cookies=global_cookies).text
    # Patch the JS: add a window object and a helper that returns the __lg_stoken__ value
lg_js = """
window = {
"location": {
"hostname": "www.lagou.com",
"search": '?%s'
}
}
function getLgStoken(){
return window.gt.prototype.a()
}
""" % security_check_params + js_response
lg_stoken = execjs.compile(lg_js).call("getLgStoken")
print(f"lg_stoken --->>> {lg_stoken}")
return lg_stoken
def update_cookies(original_data: dict) -> None:
global global_cookies
    # Fetch user_trace_token
user_trace_token = get_user_trace_token()
    # Compute X_HTTP_TOKEN
x_http_token = lagou_js.call("getXHttpToken", "user_trace_token=" + user_trace_token)
    # First update of the global cookies; needed later to fetch __lg_stoken__
global_cookies.update({
"user_trace_token": user_trace_token,
"X_HTTP_TOKEN": x_http_token
})
    # Fetch __lg_stoken__
lg_stoken = get_lg_stoken(original_data)
    # Second update of the global cookies
global_cookies.update({
"__lg_stoken__": lg_stoken,
})
def update_aes_key() -> None:
    # Get the AES key via JS and activate it through the API; activation returns a secretKeyValue used in later request headers
global aes_key, secret_key_value
url = "https://gate.lagou.com/system/agreement"
headers = {
"Content-Type": "application/json",
"Host": "gate.lagou.com",
"Origin": "https://www.lagou.com",
"Referer": "https://www.lagou.com/",
"User-Agent": UA
}
encrypt_data = lagou_js.call("getAesKeyAndRsaEncryptData")
aes_key = encrypt_data["aesKey"]
rsa_encrypt_data = encrypt_data["rsaEncryptData"]
data = {"secretKeyDecode": rsa_encrypt_data}
response = requests.post(url=url, headers=headers, json=data).json()
secret_key_value = response["content"]["secretKeyValue"]
def update_x_anit(original_data: dict) -> None:
    # Refresh x-anit-forge-code and x-anit-forge-token
url = "https://www.lagou.com/wn/jobs"
headers = {
"Host": "www.lagou.com",
"Referer": "https://www.lagou.com/",
"User-Agent": UA
}
params = {
"kd": original_data["kd"],
"city": original_data["city"]
}
print(f"update_x_anit params --->>> {params}")
response = requests.get(url=url, params=params, headers=headers, cookies=global_cookies)
    print(f"update_x_anit response --->>> {response.text}")
tree = etree.HTML(response.text)
next_data_json = json.loads(tree.xpath("//script[@id='__NEXT_DATA__']/text()")[0])
submit_code = next_data_json["props"]["tokenData"]["submitCode"]
submit_token = next_data_json["props"]["tokenData"]["submitToken"]
    # Note: JSESSIONID must come from an authenticated session!
    if not submit_code or not submit_token:
        raise Exception("submitCode & submitToken are empty; check whether JSESSIONID is correct!")
global x_anit
x_anit["x-anit-forge-code"] = submit_code
x_anit["x-anit-forge-token"] = submit_token
def get_header_params(original_data: dict) -> dict:
    # Header parameters required by subsequent data requests
    # Job-search URL; for company search use https://www.lagou.com/jobs/companyAjax.json instead, adjust as needed
u = "https://www.lagou.com/jobs/v2/positionAjax.json"
return {
"traceparent": lagou_js.call("getTraceparent"),
"X-K-HEADER": secret_key_value,
"X-S-HEADER": lagou_js.call("getXSHeader", aes_key, original_data, u),
"X-SS-REQ-HEADER": json.dumps({"secret": secret_key_value})
}
def get_encrypted_data(original_data: dict) -> str:
    # AES-encrypt the original payload
encrypted_data = lagou_js.call("getRequestData", aes_key, original_data)
return encrypted_data
def get_data(original_data: dict, encrypted_data: str, header_params: dict) -> dict:
    # Send the encrypted payload with full headers to get the ciphertext, then AES-decrypt it into plaintext job data
url = "https://www.lagou.com/jobs/v2/positionAjax.json"
referer = parse.urljoin("https://www.lagou.com/wn/jobs?", parse.urlencode(original_data))
headers = {
# "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
"Host": "www.lagou.com",
"Origin": "https://www.lagou.com",
"Referer": referer,
"traceparent": header_params["traceparent"],
"User-Agent": UA,
"X-K-HEADER": header_params["X-K-HEADER"],
"X-S-HEADER": header_params["X-S-HEADER"],
"X-SS-REQ-HEADER": header_params["X-SS-REQ-HEADER"],
}
    # Attach x-anit-forge-code and x-anit-forge-token
headers.update(x_anit)
data = {"data": encrypted_data}
response = requests.post(url=url, headers=headers, cookies=global_cookies, data=data).json()
if "status" in response:
        if not response["status"] and "操作太频繁" in response["msg"]:
            raise Exception("Failed to fetch data, msg: %s! Try filling in the post-login cookies or adding a proxy" % response["msg"])
        else:
            raise Exception("Unexpected data response! Check whether the payload is complete!")
else:
response_data = response["data"]
decrypted_data = lagou_js.call("getResponseData", response_data, aes_key)
return decrypted_data
def main():
    # One-time initialization flag
need_init = True
    # The original search payload; e.g. to scrape the 10 newest pages of Java jobs nationwide:
    # for pn in range(1, 10):
    original_data = {
        "city": "北京",  # city filter
        "pn": 1,  # page number
        "kd": "java",  # search keyword
        "px": "new",  # sort order: new = newest, default = default ranking
        "fromSearch": "true"
}
while need_init:
        # One-time setup; these parameters can be reused by all later requests
        # Fetch the required cookies: mainly user_trace_token, X_HTTP_TOKEN, __lg_stoken__
update_cookies(original_data)
        # Get and activate the AES key; it cannot be used without activation
update_aes_key()
        # When logged in, the job request gains two extra header parameters: x-anit-forge-code and x-anit-forge-token
        # In testing, omitting them or sending random values also works; fetch them properly to be safe
if "login" in global_cookies:
update_x_anit(original_data)
need_init = False
    # Build the header parameters: X-K-HEADER, X-S-HEADER, X-SS-REQ-HEADER, traceparent
header_params = get_header_params(original_data)
    # AES-encrypt the original search payload
encrypted_data = get_encrypted_data(original_data)
print(f"global_cookies --->>> {global_cookies}")
print(f"original_data --->>> {original_data}")
print(f"header_params --->>> {header_params}")
print(f"encrypted_data --->>> {encrypted_data} key --->>> {aes_key}")
    # Send the request, get the encrypted response, and decrypt it to plaintext
data = get_data(original_data, encrypted_data, header_params)
print(data["content"]["hrInfoMap"])
print(data["content"]["positionResult"])
if __name__ == '__main__':
main()

366
Job/com.lagou/lagou.js Normal file

File diff suppressed because one or more lines are too long

6
Job/com.lagou/test.py Normal file

@@ -0,0 +1,6 @@
from urllib import parse
if __name__ == '__main__':
a = parse.quote('北京')
print(a)
print("北京".encode(encoding='UTF-8', errors='strict'))

0
OA/__init__.py Normal file

248
OA/oa_clock_in.py Normal file

@@ -0,0 +1,248 @@
import sys
import requests
import time
import json
import random
# Global variables
LOGIN_NAME = "dl-renmeng"
LOGIN_PASSWD = "1111"
IS_WORK_DAY = 0
def get_clock_in_data(clock_in_time):
    """Return the clock-in request payload based on the hour of the given time"""
    print("Current hour:", clock_in_time.tm_hour)
    # Time format presets
time_type_one = ["00:00", "09:00", "18:00", "23:59"]
time_type_two = ["00:00:00", "09:00:00", "18:00:00", "23:59:59"]
clock_in_data = {}
if clock_in_time.tm_hour > 9:
        # Afternoon punch (clock out)
        clock_in_data = {
            "time": time_type_one[2],
            "belongtime": time_type_one[2],
            "canSignTime": time_type_one[3],
            "signTime": time.strftime("%H:%M:%S", clock_in_time),
            "date": time.strftime("%Y-%m-%d", clock_in_time),
            "belongdate": time.strftime("%Y-%m-%d", clock_in_time),
            "datetime": f'{time.strftime("%Y-%m-%d", clock_in_time)} {time_type_two[2]}',
            "signSectionTime": f'{time.strftime("%Y-%m-%d", clock_in_time)} {time_type_two[3]}',
            "signSection": f'{time.strftime("%Y-%m-%d", clock_in_time)} {time_type_two[0]}#{time.strftime("%Y-%m-%d", clock_in_time)} {time_type_two[3]}',
"min": "359",
"workmins": "480",
"type": "off",
"across": "0",
"islastsign": "1",
"isYellow": "1",
"isPunchOpen": "1",
"isacross": "0",
"pre": "0",
"active": "0",
"needSign": "0",
"reSign": "1",
"min_next": "-1",
"signfrom": "e9pc",
"serialid": "1",
"signAcross": "0",
"signAcross_next": "0",
"signbelong": "今天",
"signbelongspan": "今天",
}
else:
        # Morning punch (clock in)
        clock_in_data = {
            "time": time_type_one[1],
            "belongtime": time_type_one[1],
            "canSignTime": time_type_one[0],
            "date": time.strftime("%Y-%m-%d", clock_in_time),
            "belongdate": time.strftime("%Y-%m-%d", clock_in_time),
            "datetime": f'{time.strftime("%Y-%m-%d", clock_in_time)} {time_type_two[1]}',
            "signSectionTime": f'{time.strftime("%Y-%m-%d", clock_in_time)} {time_type_two[0]}',
            "signSection": f'{time.strftime("%Y-%m-%d", clock_in_time)} {time_type_two[0]}#{time.strftime("%Y-%m-%d", clock_in_time)} {time_type_two[3]}',
"min": "540",
"workmins": "480",
"isfirstsign": "1",
"type": "on",
"across": "0",
"islastsign": "1",
"isYellow": "0",
"isPunchOpen": "1",
"isacross": "0",
"pre": "0",
"active": "1",
"needSign": "1",
"min_next": "-1",
"serialid": "1",
"signAcross": "0",
"signAcross_next": "0",
}
return clock_in_data
def trusty_sleep(sleep_time):
    """Sleep until the full duration has elapsed, even if time.sleep wakes early"""
    print(f"Sleeping for: {sleep_time}")
start = time.time()
while time.time() - start < sleep_time:
time.sleep(sleep_time - (time.time() - start))
struct_time = time.localtime(time.time())
# Structured local time
now_time = time.strftime("%Y%m%d", struct_time)
# now_time = 20220131
print("Current date:", now_time)
url = f"https://api.apihubs.cn/holiday/get?field=workday&date={now_time}&workday=1&cn=1&size=31"
print("Workday info URL:", url)
print("Sending request ----->>>>>>")
request_result = requests.get(url)
print("Response received ----->>>>>>", request_result)
# The request returned successfully
if request_result.status_code == 200:
    # Parse the JSON
is_work = json.loads(request_result.text)
    # Data fetched successfully
if is_work["code"] == 0:
data_list = is_work["data"]["list"][0] if is_work["data"]["list"] else []
IS_WORK_DAY = data_list["workday"] if data_list else 0
        print("Is today a workday (1 = yes, 0 = no):", IS_WORK_DAY)
if IS_WORK_DAY == 1:
header = {
"Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.55 Safari/537.36",
"Accept": "*/*",
"Accept-Encoding": "gzip, deflate",
"Accept-Language": "zh-CN,zh;q=0.9",
}
    print("Starting OA login ----->>>")
login_form_data = {
"loginid": LOGIN_NAME,
"langid": "7",
}
login_form_url = "http://oa.njhgroup.cn/api/hrm/login/getLoginForm"
login_form_result = requests.post(
login_form_url, headers=header, data=login_form_data
)
print(requests.utils.dict_from_cookiejar(login_form_result.cookies))
randcode = json.loads(login_form_result.text)["qrcode"]["loginkey"]
    print("Fetched randcode ---->>>", randcode)
login_data = {
"loginid": LOGIN_NAME,
"userpassword": LOGIN_PASSWD,
"logintype": "1",
"isie": "false",
}
login_cookie = login_form_result.cookies
    # Perform the OA login
oa_login_url = "http://oa.njhgroup.cn/api/hrm/login/checkLogin"
login_result = requests.post(
oa_login_url,
headers=header,
data=login_data,
cookies=requests.utils.dict_from_cookiejar(login_form_result.cookies),
)
print(requests.utils.dict_from_cookiejar(login_result.cookies))
print(login_result.text)
    print("OA login finished ----->>>", login_result.text)
    # Sleep 10 seconds
time.sleep(10)
    print("Refreshing the OA randcode ----->>>")
ts = int(round(time.time() * 1000))
refresh_code_url = f"http://oa.njhgroup.cn/rsa/weaver.rsa.GetRsaInfo?ts={ts}"
refresh_code_result = requests.get(refresh_code_url, headers=header)
print(refresh_code_result.cookies)
    print("OA randcode refresh finished ----->>>")
    # Assemble the final cookie
clock_in_cookie = requests.utils.dict_from_cookiejar(
login_form_result.cookies)
clock_in_cookie.update(
requests.utils.dict_from_cookiejar(refresh_code_result.cookies)
)
clock_in_cookie.update(
requests.utils.dict_from_cookiejar(login_result.cookies))
    print("Checking whether today has approved leave ----->>>")
check_is_work_url = (
"http://oa.njhgroup.cn/api/kq/myattendance/getHrmKQMonthReportInfo"
)
check_is_work_result = requests.post(
check_is_work_url,
headers=header,
data={
"typevalue": time.strftime("%Y-%m", struct_time),
"loaddata": "1",
"type": "2",
},
cookies=clock_in_cookie,
).text
    # Parse the JSON
    is_work = json.loads(check_is_work_result)
    print("Leave check finished ----->>>")
print(f"{struct_time.tm_mday}")
isWorkDay = is_work["result"][f"{struct_time.tm_mday}"]["isWorkDay"]
workflow = len(is_work["result"][f"{struct_time.tm_mday}"]["workflow"])
    print(f"Workday today: {isWorkDay}, leave requests today: {workflow}")
needSign = False
if isWorkDay and workflow <= 0:
needSign = True
else:
        print("On leave today, skipping the punch~")
sys.exit()
check_is_need_sign_url = "http://oa.njhgroup.cn/api/hrm/kq/attendanceButton/getButtons"
check_is_need_sign_result = requests.post(
check_is_need_sign_url,
headers=header,
cookies=clock_in_cookie,
).text
check_is_need_sign_timeline = json.loads(check_is_need_sign_result)["timeline"]
    # 0 means no punch is needed
    need_sign = 0
    # An empty sign_time also means no punch is needed
    sign_time = ""
if struct_time.tm_hour < 9:
        # Morning: use the first timeline entry (morning punch)
need_sign = check_is_need_sign_timeline[0]["needSign"]
if "signTime" in check_is_need_sign_timeline[0]:
sign_time = check_is_need_sign_timeline[0]["signTime"]
        print(f"Morning punch status: ---{need_sign} ----- {sign_time} --- {len(sign_time)}")
else:
        # Afternoon: use the second timeline entry (afternoon punch)
need_sign = check_is_need_sign_timeline[1]["needSign"]
if "signTime" in check_is_need_sign_timeline[1]:
sign_time = check_is_need_sign_timeline[1]["signTime"]
        print(f"Afternoon punch status: ---{need_sign} ----- {sign_time} --- {len(sign_time)}")
    # Punching is allowed
if need_sign == "1" and len(sign_time) == 0:
needSign = True
else:
        print("Already punched, nothing more to do~")
sys.exit()
    # If not punched yet, continue
if needSign:
        # Start punching; in the afternoon, sleep a little first
if struct_time.tm_hour > 9:
            # Sleep 5 to 15 minutes
trusty_sleep(random.randint(300, 900))
        print("OA punching in ----->>>")
        # clock_in_cookie["__randcode__"] =
        # Refresh the time
struct_time = time.localtime(time.time())
sign_time = time.strftime("%H:%M:%S", struct_time)
get_clock_in_data(struct_time)
clock_in_url = "http://oa.njhgroup.cn/api/hrm/kq/attendanceButton/punchButton"
        print(
            "OA punch finished ----->>>",
requests.post(
clock_in_url,
headers=header,
data=get_clock_in_data(struct_time),
cookies=clock_in_cookie,
).text,
)

0
Selenium/__init__.py Normal file

21
Selenium/test.py Normal file

@@ -0,0 +1,21 @@
# urllib3 guide: https://urllib3.readthedocs.io/en/latest/user-guide.html
# selenium guide: https://www.selenium.dev/zh-cn/documentation/webdriver/getting_started/
# Download the latest chromedriver: https://chromedriver.storage.googleapis.com/index.html
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
# Use webdriver-manager to manage the driver binary
# The manager downloads the driver into /Users/renmeng/.wdm/drivers/chromedriver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
# options = ChromeOptions()
# driver = webdriver.Chrome(options=options)
# Open the page
driver.get("https://www.lagou.com/jobs/list_运维?city=%E6%88%90%E9%83%BD&cl=false&fromSearch=true&labelWords=&suginput=")  # open the target URL
wait = WebDriverWait(driver, 10)
# Print the page source
print(driver.page_source)
driver.quit()

File diff suppressed because it is too large

183
aqistudy网站/main.py Normal file

@@ -0,0 +1,183 @@
import base64
import json
import re
import time
import hashlib
from urllib.parse import urljoin
import execjs
import requests
import urllib3
def des_js(js_str):
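    # Doubly base64-wrapped variant: extract the DES/AES keys and IVs from the decoded JS, DES-encrypt the base64-encoded form data, post it, and decrypt the response.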
keys = re.findall(f'DES\.encrypt\((\w+)\s?,\s?(\w+)\s?,\s?(\w+)\)', js_str)
text_name, key_name, iv_name = keys[0]
key = re.findall(f'const\s+?{key_name}\s+?=.*?"(.*?)"', js_str)[0]
iv = re.findall(f'const\s+?{iv_name}\s+?=.*?"(.*?)"', js_str)[0]
appid_name = re.findall("appId:.*?(\w+),", js_str)[0]
appId = re.findall(f"var\s?{appid_name}\s?=.*?'(.*?)'", js_str)[0]
param_name = re.findall("data:\s?\{\s?(\w+):.*?}", js_str)[0]
des_keys = re.findall(f'DES\.decrypt\(data,\s?(\w+),\s?(\w+)\);', js_str)
des_dec_key_name, des_dec_iv_name = des_keys[0]
des_dec_key = re.findall(f'const\s+?{des_dec_key_name}\s+?=.*?"(.*?)"', js_str)[0]
des_dec_iv = re.findall(f'const\s+?{des_dec_iv_name}\s+?=.*?"(.*?)"', js_str)[0]
aes_keys = re.findall(f'AES\.decrypt\(data,\s?(\w+),\s?(\w+)\);', js_str)
aes_dec_key_name, aes_dec_iv_name = aes_keys[0]
aes_dec_key = re.findall(f'const\s+?{aes_dec_key_name}\s+?=.*?"(.*?)"', js_str)[0]
aes_dec_iv = re.findall(f'const\s+?{aes_dec_iv_name}\s+?=.*?"(.*?)"', js_str)[0]
method = "GETDAYDATA"
obj = {"city": "济南", "month": '201702'}
timestamp = int(time.time() * 1000)
clienttype = 'WEB'
form_data = {
"appId": appId,
"method": method,
"timestamp": timestamp,
"clienttype": clienttype,
"object": obj,
"secret": hashlib.md5(
f'{appId}{method}{timestamp}{clienttype}{str(obj)}'.replace("'", '"').replace(' ', '').encode(
'utf-8')).hexdigest()
}
base64_d = base64.b64encode(str(form_data).replace("'", '"').replace(' ', '').encode('utf-8')).decode('utf-8')
result = js.call("des_encrypt", base64_d, key, iv)
print(data := {param_name: result})
url = "https://www.aqistudy.cn/historydata/api/historyapi.php"
resp = requests.post(url=url, headers=headers, data=data, verify=False)
print(resp.text)
dec_data = js.call('dec_func', resp.text, des_dec_key, des_dec_iv, aes_dec_key, aes_dec_iv)
print(json.loads(dec_data))
def aes_js(js_str):
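    # AES variant (single base64 wrap): same flow as des_js, but the request payload is AES-encrypted; keys[1] picks the second AES.encrypt call in the JS.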
keys = re.findall(f'AES\.encrypt\((\w+)\s?,\s?(\w+)\s?,\s?(\w+)\)', js_str)
text_name, key_name, iv_name = keys[1]
key = re.findall(f'const\s+?{key_name}\s+?=.*?"(.*?)"', js_str)[0]
iv = re.findall(f'const\s+?{iv_name}\s+?=.*?"(.*?)"', js_str)[0]
appid_name = re.findall("appId:.*?(\w+),", js_str)[0]
appId = re.findall(f"var\s?{appid_name}\s?=.*?'(.*?)'", js_str)[0]
param_name = re.findall("data:\s?\{\s?(\w+):.*?}", js_str)[0]
des_keys = re.findall(f'DES\.decrypt\(data,\s?(\w+),\s?(\w+)\);', js_str)
des_dec_key_name, des_dec_iv_name = des_keys[0]
des_dec_key = re.findall(f'const\s+?{des_dec_key_name}\s+?=.*?"(.*?)"', js_str)[0]
des_dec_iv = re.findall(f'const\s+?{des_dec_iv_name}\s+?=.*?"(.*?)"', js_str)[0]
aes_keys = re.findall(f'AES\.decrypt\(data,\s?(\w+),\s?(\w+)\);', js_str)
aes_dec_key_name, aes_dec_iv_name = aes_keys[0]
aes_dec_key = re.findall(f'const\s+?{aes_dec_key_name}\s+?=.*?"(.*?)"', js_str)[0]
aes_dec_iv = re.findall(f'const\s+?{aes_dec_iv_name}\s+?=.*?"(.*?)"', js_str)[0]
method = "GETDAYDATA"
obj = {"city": "济南", "month": '201702'}
timestamp = int(time.time() * 1000)
clienttype = 'WEB'
form_data = {
"appId": appId,
"method": method,
"timestamp": timestamp,
"clienttype": clienttype,
"object": obj,
"secret": hashlib.md5(
f'{appId}{method}{timestamp}{clienttype}{str(obj)}'.replace("'", '"').replace(' ', '').encode(
'utf-8')).hexdigest()
}
base64_d = base64.b64encode(str(form_data).replace("'", '"').replace(' ', '').encode('utf-8')).decode('utf-8')
result = js.call("aes_encrypt", base64_d, key, iv)
print(data := {param_name: result})
url = "https://www.aqistudy.cn/historydata/api/historyapi.php"
resp = requests.post(url=url, headers=headers, data=data, verify=False)
dec_data = js.call('dec_func', resp.text, des_dec_key, des_dec_iv, aes_dec_key, aes_dec_iv)
print(json.loads(dec_data))
def bs64_js(js_str):
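    # Plain variant: the form data is sent base64-encoded with no extra encryption; only the response needs DES/AES decryption.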
appid_name = re.findall("appId:.*?(\w+),", js_str)[0]
appId = re.findall(f"var\s?{appid_name}\s?=.*?'(.*?)'", js_str)[0]
param_name = re.findall("data:\s?\{\s?(\w+):.*?}", js_str)[0]
method = "GETDAYDATA"
obj = {"city": "济南", "month": '202206'}
timestamp = int(time.time() * 1000)
clienttype = 'WEB'
form_data = {
"appId": appId,
"method": method,
"timestamp": timestamp,
"clienttype": clienttype,
"object": obj,
"secret": hashlib.md5(
f'{appId}{method}{timestamp}{clienttype}{str(obj)}'.replace("'", '"').replace(' ', '').encode(
'utf-8')).hexdigest()
}
base64_d = base64.b64encode(str(form_data).replace("'", '"').replace(' ', '').encode('utf-8')).decode('utf-8')
print(data := {param_name: base64_d})
url = "https://www.aqistudy.cn/historydata/api/historyapi.php"
resp = requests.post(url=url, headers=headers, data=data, verify=False)
des_keys = re.findall(f'DES\.decrypt\(data,\s?(\w+),\s?(\w+)\);', js_str)
des_dec_key_name, des_dec_iv_name = des_keys[0]
des_dec_key = re.findall(f'const\s+?{des_dec_key_name}\s?=.*?"(.*?)"', js_str)[0]
des_dec_iv = re.findall(f'const\s+?{des_dec_iv_name}\s?=.*?"(.*?)"', js_str)[0]
aes_keys = re.findall(f'AES\.decrypt\(data,\s?(\w+),\s?(\w+)\);', js_str)
aes_dec_key_name, aes_dec_iv_name = aes_keys[0]
aes_dec_key = re.findall(f'const\s+?{aes_dec_key_name}\s?=.*?"(.*?)"', js_str)[0]
aes_dec_iv = re.findall(f'const\s+?{aes_dec_iv_name}\s?=.*?"(.*?)"', js_str)[0]
dec_data = js.call('dec_func', resp.text, des_dec_key, des_dec_iv, aes_dec_key, aes_dec_iv)
print(json.loads(dec_data))
if __name__ == '__main__':
url = "https://www.aqistudy.cn/historydata/daydata.php?city=%E4%BF%9D%E5%AE%9A&month=201910"
headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.192 Safari/537.36",
"Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
"Origin": "https://www.aqistudy.cn",
"Referer": "https://www.aqistudy.cn/historydata/daydata.php?city=%E4%BF%9D%E5%AE%9A&month=202009",
}
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
req = requests.get(url, headers=headers, verify=False)
js_url = re.findall(r'src="(resource/js/.*?.min.js\?v=\d+)"', req.text)[0]
js_req = requests.get(url=urljoin(url, js_url), headers=headers, verify=False)
print(js_req.url)
js_code = open('/Users/renmeng/work_space/python_work/qnloft-get-web-everything/爬虫/aqistudy网站/airHistory_2108.js', 'r').read()
js_bs64_bs64_code = js_req.text[5:-2]
js_code = js_code.replace('jscode_pattern', js_bs64_bs64_code)
js = execjs.compile(js_code)
res = js.call("get_full_js", js_bs64_bs64_code)
# print(res)
type_len = len(re.findall("dweklxde", res))
print(type_len)
base64_str = re.findall("'(.*?)'", res)[0]
if type_len == 2:
target_js = base64.b64decode(base64.b64decode(base64_str)).decode('utf-8')
des_js(js_str=target_js)
elif type_len == 1:
target_js = base64.b64decode(base64_str).decode('utf-8')
aes_js(js_str=target_js)
elif type_len == 0:
bs64_js(js_str=res)

11
代理/proxy_info.py Normal file

@@ -0,0 +1,11 @@
# Proxy server configuration
proxies = {
    'http': 'http://proxy_server:port',
    'https': 'https://proxy_server:port'
}
# API helper for obtaining proxy servers
def get_api():
    # Put the API call that obtains proxy servers here
    # Return the proxy server address and port
    return 'proxy_api_server:port'
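# A minimal usage sketch (illustrative; assumes 'proxy_server:port' above is replaced with a real proxy):
#     import requests
#     resp = requests.get('https://httpbin.org/ip', proxies=proxies, timeout=10)
#     print(resp.text)  # should report the proxy's IP rather than your own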

82
理财记账/ali_pay.py Normal file

@@ -0,0 +1,82 @@
'''
Alipay statement parsing
cchardet detects encodings more accurately than chardet
'''
import codecs
import time
from pathlib import Path
import cchardet as chardet
import numpy as np
import pandas as pd
def detection_file_encoding(file_name):  # auto-detect a file's encoding
    with open(file_name, 'rb') as file:
        rawdata = file.read()
    result = chardet.detect(rawdata)
    # The result includes the detected encoding and a confidence score
    encoding = result['encoding']
    confidence = result['confidence']
    print(f"File [{file_name}] encoding: {encoding}, confidence: {confidence:.2f}")
    return encoding
def encoding_conversion(source_file, target_file, source_encoding, target_encoding):  # convert a file's encoding
    file_path = Path(target_file)
    if file_path.exists():
        return detection_file_encoding(target_file)
    # source_encoding: encoding of the source file; target_encoding: encoding of the output file
    # Use the codecs module to read the source and write the target in the new encoding
    with codecs.open(source_file, 'r', encoding=source_encoding) as source:
        with codecs.open(target_file, 'w', encoding=target_encoding) as target:
            for line in source:
                target.write(line)
    encoding = detection_file_encoding(target_file)
    print(f"The file was converted from {source_encoding} encoding to {encoding} encoding")
    return encoding
def reset_account_name(name):
if "余额宝" in name or '滴滴出行' in name:
return "支付宝"
elif "信用卡" in name:
return "信用卡"
elif "借记卡" in name:
return "现金"
class ALiPay:
def __init__(self, csv_file):
        # Detect the file encoding
self.encoding = detection_file_encoding(csv_file)
rename = csv_file.split("-")[1:3]
if len(rename) > 0:
rename = "_".join(csv_file.split("-")[1:3])
else:
rename = int(time.time())
        self.target_file = f'/Users/renmeng/Downloads/支付宝交易账单-{rename}.csv'  # target file name
        # Create the new file, with the date in its name
self.encoding = encoding_conversion(source_file=csv_file, target_file=self.target_file,
source_encoding=self.encoding,
target_encoding="utf-8")
def get_ali_pay_bill(self):
        # pandas' skiprows parameter selects the row to start reading from
df = pd.read_csv(self.target_file, encoding=self.encoding, skiprows=2)
df = df.drop(index=df[df['交易状态'] != '成功'].index)
        # Convert the date column to datetime objects
df['创建时间'] = pd.to_datetime(df['创建时间'])
df['账户'] = df['支付渠道'].apply(reset_account_name)
        # Format the date column as '%Y-%m-%d'
df['创建时间'] = df['创建时间'].dt.strftime('%Y-%m-%d')
df['金额'] = df['订单金额(元)'].apply(lambda x: float(x) if x else 0) \
- df['累计退款总额(元)'].apply(lambda x: float(x) if x else 0) \
- df['优惠(元)'].apply(lambda x: 0 if not x.strip() else float(x))
return df

48
理财记账/main.py Normal file

@@ -0,0 +1,48 @@
import pandas as pd
from 爬虫.理财记账.ali_pay import ALiPay
from 爬虫.理财记账.zs_bank import ZsBank
# Show all columns
pd.set_option('display.max_columns', None)
# Show all rows
pd.set_option('display.max_rows', None)
# Do not wrap wide frames
pd.set_option('expand_frame_repr', False)
# Unlimited column width
pd.set_option('display.max_colwidth', None)
class BillHandle:
def __init__(self, ali_pay_file, zs_bank_file, sheet_name):
self.ali_pay_file, self.zs_bank_file, self.sheet_name = ali_pay_file, zs_bank_file, sheet_name
self.df = pd.DataFrame(
columns=['交易类型', '日期', '分类', '子分类', '账户1', '账户2', '金额', '成员', '商家', '项目', '备注'])
def __init_ali_bill(self):
ali_pay_data = ALiPay(self.ali_pay_file).get_ali_pay_bill()
ali_pay_data['交易类型'] = '支出'
ali_pay_data['日期'] = ali_pay_data['创建时间']
ali_pay_data['账户1'] = ali_pay_data['账户']
ali_pay_data['备注'] = ali_pay_data['商品名称'] + "_" + ali_pay_data['对方名称']
self.df = pd.concat([self.df, ali_pay_data])
def __init_zs_bank_bill(self):
zs_bank_data = ZsBank(self.zs_bank_file, self.sheet_name).get_zs_bank_bill()
zs_bank_data['交易类型'] = '支出'
zs_bank_data['账户1'] = '信用卡'
zs_bank_data['备注'] = zs_bank_data['来源'] + "_" + zs_bank_data['详情']
self.df = pd.concat([self.df, zs_bank_data])
def bill_opt(self):
self.__init_ali_bill()
self.__init_zs_bank_bill()
df = self.df
df = df.sort_values(by='日期', ascending=False).reset_index()
print(df)
if __name__ == '__main__':
ali_pay_file = '/Users/renmeng/Downloads/2088102231652088-20230918-108990157-买入交易.csv'
zs_bank_file = '/Users/renmeng/Downloads/招商银行对账单.xlsx'
zs_bank_sheet = '8-9月对账单'
BillHandle(ali_pay_file, zs_bank_file, zs_bank_sheet).bill_opt()

44
理财记账/zs_bank.py Normal file

@@ -0,0 +1,44 @@
'''
China Merchants Bank statement parsing
'''
from datetime import datetime
import pandas as pd
def reset_date(date):
    # Get the current date and time
    current_datetime = datetime.now()
    # Extract the year from the current date
    current_year = current_datetime.year
    # Convert the integer to a string with leading zeros so it has at least four digits
    date_str = str(current_year) + str(date).zfill(4)
    # Parse the combined string into a date object
    input_date = datetime.strptime(date_str, '%Y%m%d')
    # Format the date object into the desired string form
    return input_date.strftime('%Y-%m-%d')
def pay_source(details):
res = ""
source = details.split('-')[0]
if source == '京东支付':
res = '京东'
elif source == '财付通':
res = '微信'
elif source == '支付宝':
res = '支付宝'
return res
class ZsBank:
def __init__(self, bill_file, sheet_name):
self.df = pd.read_excel(bill_file, sheet_name=sheet_name)
def get_zs_bank_bill(self):
self.df['金额'] = self.df['金额'].astype(str).str.replace(',', '', regex=True).astype(float)
total_sum = self.df['金额'].sum()
print(total_sum)
self.df['日期'] = self.df['日期'].apply(reset_date)
self.df['来源'] = self.df['详情'].apply(pay_source)
return self.df