"""Scrape book titles/ratings/comment counts from a Douban doulist into a CSV."""

import csv
import random
import time

import requests
from bs4 import BeautifulSoup

# Desktop-browser User-Agent so Douban serves the regular HTML page.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
    )
}

# Column order for the exported CSV.
FIELDNAMES = ['title', 'rating', 'comments_count', 'link']


def get_book_html(start=1, doulist_no=''):
    """Fetch one page of the doulist and return it parsed as BeautifulSoup.

    start: item offset used by Douban's pagination.
    doulist_no: numeric id of the target doulist.
    Raises requests.HTTPError on a non-2xx response.
    """
    url = (
        f'https://www.douban.com/doulist/{doulist_no}/'
        f'?start={start}&sort=time&playable=0&sub_type='
    )
    # Timeout so a stalled connection cannot hang the whole scrape;
    # raise_for_status so we fail fast instead of parsing an error page.
    response = requests.get(url, headers=HEADERS, timeout=30)
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')


def get_book_total_page(soup):
    """Return the doulist's total page count as an int (0 when not found)."""
    total_pages = 0
    # The current-page marker <span class="thispage"> carries the total
    # page count in its data-total-page attribute.
    total_pages_span = soup.find('span', class_='thispage')
    if total_pages_span:
        total_pages = int(total_pages_span['data-total-page'])
    print(f'总页数: {total_pages}')
    return total_pages


def get_book_list(soup):
    """Extract book info from one doulist page.

    Returns a list of dicts with keys: title, rating, comments_count, link.
    """
    books_info = []
    items = soup.find_all('div', class_='doulist-item')
    for item in items:
        # Title and detail-page link.
        title_tag = item.find('div', class_='title').a
        title = title_tag.text.strip()
        link = title_tag['href']

        # Rating; books without a score get '0'.
        rating_span = item.find('span', class_='rating_nums')
        rating = (
            rating_span.text.strip()
            if rating_span and rating_span.text.strip()
            else '0'
        )

        # Comment count, e.g. "(172人评价)" -> "172".
        comments_span = item.find('div', class_='rating').find_all('span')
        comments_count = '无评价'
        if len(comments_span) > 2 and comments_span[2].text:
            comments_text = comments_span[2].text.strip()
            # Guard against unexpected formats so one odd item cannot
            # abort the whole scrape with an IndexError.
            if '(' in comments_text and '人' in comments_text:
                comments_count = comments_text.split('(')[1].split('人')[0]

        books_info.append({
            'title': title,
            'rating': rating,
            'comments_count': comments_count,
            'link': link,
        })
    return books_info


def save_data_to_csv(file_name, data, output_dir='/Users/renmeng/Downloads'):
    """Write the scraped rows to <output_dir>/<file_name>.csv.

    output_dir defaults to the original hard-coded location, but can now
    be overridden so the script is usable on other machines.
    """
    path = f'{output_dir}/{file_name}.csv'
    with open(path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=FIELDNAMES)
        writer.writeheader()
        writer.writerows(data)
    # Report the file actually written (the old message always claimed
    # "books_info.csv" regardless of file_name).
    print(f"数据已成功保存到 {file_name}.csv")


def main(file_name, dou_list_no):
    """Scrape every page of the doulist and save the combined result as CSV."""
    page_size = 25  # Douban shows 25 items per doulist page.
    books_info = []

    # First fetch: page 1 content plus the total page count.
    soup = get_book_html(doulist_no=dou_list_no)
    print("第一次拉取成功!")
    total_pages = int(get_book_total_page(soup))
    print(f"获取到的页数是:{total_pages}")
    books_info.extend(get_book_list(soup))
    print(f"当前数据长度是:{len(books_info)}")
    print(books_info)

    # Remaining pages, pausing randomly to avoid being rate-limited.
    if total_pages > 0:
        for page in range(1, total_pages):
            sleep_time = random.randint(10, 30)
            print(f"程序需要休息一下。休息时间是:{sleep_time}s")
            time.sleep(sleep_time)
            soup = get_book_html(start=page_size * page, doulist_no=dou_list_no)
            print(f"第 {page + 1} 次拉取成功!")
            books_info.extend(get_book_list(soup))
            print(f"当前数据长度是:{len(books_info)}")

    save_data_to_csv(file_name=file_name, data=books_info)


if __name__ == '__main__':
    # 2023 booklist
    # dou_list_no = 153478378
    # 2024 booklist
    dou_list_no = 157428303
    main("2024_books_info", dou_list_no)