"""Scrape the items of a Douban doulist and export them to a CSV file.

Recovered from a git patch whose line breaks had been collapsed:

    commit:  74c22e48f49ba61cff13966790e5b0aacba47694
    author:  rm
    date:    Wed, 16 Oct 2024 18:10:18 +0800
    subject: add scraping of Douban web pages
    file:    com.douban/doulist.py (new file)
"""

import csv
import random
import re
import time
from pathlib import Path

# Number of entries Douban serves per doulist page.
PAGE_SIZE = 25
# Seconds to wait for Douban before giving up on a request.
REQUEST_TIMEOUT = 30
# Bounds (in seconds) of the random pause between two page requests.
MIN_DELAY = 10
MAX_DELAY = 30
# Column order of the exported CSV; also the keys of every book record.
CSV_FIELDNAMES = ['title', 'rating', 'comments_count', 'link']
# Placeholder stored when an entry shows no numeric review count.
NO_COMMENTS = '无评价'
# Matches the number in texts such as "(172人评价)".
COMMENTS_COUNT_RE = re.compile(r'(\d+)\s*人评价')
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
)


def get_book_html(start=1, doulist_no=''):
    """Download one doulist page and return it as a parsed document.

    Args:
        start: Zero-based offset of the first entry on the page (0, 25,
            50, ...). The historical default of 1 is kept for backward
            compatibility; pass 0 to begin at the very first entry.
        doulist_no: Identifier of the doulist, as found in its URL.

    Returns:
        The ``BeautifulSoup`` document of the requested page.

    Raises:
        requests.RequestException: If the request times out or Douban
            answers with an HTTP error status.
    """
    # Imported here rather than at module level so the parsing and CSV
    # helpers stay importable without the scraping dependencies installed.
    import requests
    from bs4 import BeautifulSoup

    url = f'https://www.douban.com/doulist/{doulist_no}/?start={start}&sort=time&playable=0&sub_type='
    response = requests.get(
        url,
        headers={'User-Agent': USER_AGENT},
        timeout=REQUEST_TIMEOUT,
    )
    # Fail loudly instead of parsing an error page as an empty list.
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')


def get_book_total_page(soup):
    """Return the total number of pages announced by the paginator.

    Args:
        soup: Parsed doulist page.

    Returns:
        The page count as an ``int``; 0 when the page has no paginator or
        the announced value is missing or not a number.
    """
    # The span marking the current page carries the overall page count.
    current_page = soup.find('span', class_='thispage')
    if current_page is None:
        return 0
    try:
        total_pages = int(current_page.get('data-total-page'))
    except (TypeError, ValueError):
        return 0
    print(f'总页数: {total_pages}')
    return total_pages


def _extract_comments_count(rating_div):
    """Return the review count found in a rating block, or the placeholder."""
    if rating_div is None:
        return NO_COMMENTS
    for span in rating_div.find_all('span'):
        match = COMMENTS_COUNT_RE.search(span.text or '')
        if match:
            return match.group(1)
    # Texts without a number, e.g. "(评价人数不足)", carry no count.
    return NO_COMMENTS


def get_book_list(soup):
    """Extract title, rating, review count and link of every list entry.

    Entries without a title link are skipped, because nothing useful can
    be recorded for them.

    Args:
        soup: Parsed doulist page.

    Returns:
        A list of dicts with the keys ``title``, ``rating``,
        ``comments_count`` and ``link``. ``rating`` falls back to ``'0'``
        and ``comments_count`` to ``'无评价'`` when the page shows none.
    """
    books_info = []
    for item in soup.find_all('div', class_='doulist-item'):
        title_div = item.find('div', class_='title')
        title_tag = title_div.a if title_div is not None else None
        if title_tag is None:
            continue

        rating_span = item.find('span', class_='rating_nums')
        rating = rating_span.text.strip() if rating_span is not None else ''

        books_info.append({
            'title': title_tag.text.strip(),
            'rating': rating or '0',
            'comments_count': _extract_comments_count(
                item.find('div', class_='rating')
            ),
            'link': title_tag.get('href', ''),
        })
    return books_info


def save_data_to_csv(file_name, data, output_dir=None):
    """Write the collected book records to ``<output_dir>/<file_name>.csv``.

    Args:
        file_name: Name of the CSV file, without the ``.csv`` extension.
        data: Iterable of dicts as produced by ``get_book_list``.
        output_dir: Target directory; defaults to the current user's
            ``Downloads`` folder. It is created when missing.

    Returns:
        The ``Path`` of the file that was written.
    """
    target_dir = Path(output_dir) if output_dir is not None else Path.home() / 'Downloads'
    target_dir.mkdir(parents=True, exist_ok=True)
    csv_path = target_dir / f'{file_name}.csv'

    with csv_path.open('w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=CSV_FIELDNAMES)
        writer.writeheader()
        writer.writerows(data)

    print(f"数据已成功保存到 {csv_path}")
    return csv_path


def main(file_name, dou_list_no):
    """Scrape every page of a doulist and save the result as a CSV file.

    Args:
        file_name: Name of the CSV file, without the ``.csv`` extension.
        dou_list_no: Identifier of the doulist to scrape.
    """
    books_info = []

    # Offsets are zero-based: the first page has to start at entry 0,
    # otherwise the first entry is lost and entry 25 is fetched twice.
    soup = get_book_html(start=0, doulist_no=dou_list_no)
    print("第一次拉取成功!")

    total_pages = get_book_total_page(soup)
    print(f"获取到的页数是:{total_pages}")

    books_info.extend(get_book_list(soup))
    print(f"当前数据长度是:{len(books_info)}")
    print(books_info)

    # Page 0 is already done; a list without paginator reports 0 pages
    # and therefore skips this loop entirely.
    for page in range(1, total_pages):
        # Pause a random while between requests to avoid hammering the site.
        sleep_time = random.randint(MIN_DELAY, MAX_DELAY)
        print(f"程序需要休息一下。休息时间是:{sleep_time}s")
        time.sleep(sleep_time)

        soup = get_book_html(start=PAGE_SIZE * page, doulist_no=dou_list_no)
        print(f"第 {page + 1} 次拉取成功!")
        books_info.extend(get_book_list(soup))
        print(f"当前数据长度是:{len(books_info)}")

    save_data_to_csv(file_name=file_name, data=books_info)


if __name__ == '__main__':
    # Reading list of 2024; the list of 2023 is 153478378.
    dou_list_no = 157428303
    main("2024_books_info", dou_list_no)