Add scraping of Douban doulist pages
parent 0a7519cf68
commit 74c22e48f4

@@ -0,0 +1,109 @@
import csv
import random
import time

import requests
from bs4 import BeautifulSoup


def get_book_html(start=0, doulist_no=''):
    # Fetch one page of the doulist and return it as a parsed BeautifulSoup object.
    # start is the item offset: 0 for the first page, 25 for the second, and so on.
    url = f'https://www.douban.com/doulist/{doulist_no}/?start={start}&sort=time&playable=0&sub_type='
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    return BeautifulSoup(response.content, 'html.parser')


def get_book_total_page(soup):
    total_pages = 0
    # Find the pagination span that carries the total page count
    total_pages_span = soup.find('span', class_='thispage')
    if total_pages_span:
        total_pages = total_pages_span['data-total-page']  # read the data-total-page attribute
    print(f'Total pages: {total_pages}')
    return total_pages


def get_book_list(soup):
    # List used to collect the results
    books_info = []
    # Find every doulist-item block on the page
    items = soup.find_all('div', class_='doulist-item')

    for item in items:
        # Extract the title and its link
        title_tag = item.find('div', class_='title').a
        title = title_tag.text.strip()
        link = title_tag['href']

        # Extract the rating
        rating_span = item.find('span', class_='rating_nums')
        rating = rating_span.text.strip() if rating_span and rating_span.text.strip() else '0'

        # Extract the number of ratings
        comments_span = item.find('div', class_='rating').find_all('span')
        if len(comments_span) > 2 and comments_span[2].text:
            comments_text = comments_span[2].text.strip()
            comments_count = comments_text.split('(')[1].split('人')[0]  # extract "172" from "(172人评价)"
        else:
            comments_count = 'no ratings'
        # Collect the data for this item
        books_info.append({
            'title': title,
            'rating': rating,
            'comments_count': comments_count,
            'link': link
        })
    return books_info


def save_data_to_csv(file_name, data):
    # Save the collected data to a CSV file
    with open(f'/Users/renmeng/Downloads/{file_name}.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['title', 'rating', 'comments_count', 'link']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        # Write the header row
        writer.writeheader()

        # Write the data rows
        for book in data:
            writer.writerow(book)

    print(f"Data saved to {file_name}.csv")


def main(file_name, dou_list_no):
    page_size = 25
    books_info = []
    # First fetch
    soup = get_book_html(doulist_no=dou_list_no)
    print("First fetch succeeded!")
    # Get the total number of pages
    total_pages = int(get_book_total_page(soup))
    print(f"Total pages found: {total_pages}")
    # Parse the books on the first page
    data = get_book_list(soup)
    books_info.extend(data)
    print(f"Current data length: {len(books_info)}")
    print(books_info)
    if total_pages > 0:
        for page in range(1, total_pages):
            # Random delay between requests to avoid hammering the site
            sleep_time = random.randint(10, 30)
            print(f"Sleeping for {sleep_time}s before the next request")
            time.sleep(sleep_time)
            soup = get_book_html(start=page_size * page, doulist_no=dou_list_no)
            print(f"Fetch {page + 1} succeeded!")
            data = get_book_list(soup)
            books_info.extend(data)
            print(f"Current data length: {len(books_info)}")
    save_data_to_csv(file_name=file_name, data=books_info)


if __name__ == '__main__':
    # 2023 reading list
    # dou_list_no = 153478378
    # 2024 reading list
    dou_list_no = 157428303
    main("2024_books_info", dou_list_no)