import csv
import re
import time

import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent so Douban does not reject the requests outright.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}

base_url = "https://movie.douban.com/top250"
movie_list = []

def get_page(url):
    """Fetch a page and return its HTML, or None on failure."""
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return None
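
# Optional hardening (a sketch, not part of the original script): reuse a single
# requests.Session with urllib3 retries instead of a bare requests.get, e.g.
#
#   from requests.adapters import HTTPAdapter
#   from urllib3.util.retry import Retry
#   session = requests.Session()
#   session.headers.update(headers)
#   session.mount("https://", HTTPAdapter(max_retries=Retry(total=3, backoff_factor=1)))
#   response = session.get(url, timeout=10)
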
def extract_director_names(text):
    """Extract the director's Chinese name from the list-page credit line."""
    if not text:
        return "Unknown director"
    # Douban renders the credit line in Chinese, so the "导演:" (director) label
    # and the CJK character class (plus the middle dot used in transliterated
    # names) must stay as-is.
    chinese_name_pattern = r"导演:\s*([\u4e00-\u9fa5·]+)"
    chinese_match = re.search(chinese_name_pattern, text)
    return chinese_match.group(1) if chinese_match else "Unknown director"
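
# A minimal usage sketch (the input below is hypothetical, mirroring the credit
# line Douban renders on the list page; the capture stops at the first
# non-CJK character, dropping the romanized name):
#
#   extract_director_names("导演: 弗兰克·德拉邦特 Frank Darabont   主演: 蒂姆·罗宾斯 Tim Robbins")
#   # -> "弗兰克·德拉邦特"
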
def get_movie_details(url):
    """Scrape extra fields from a movie's detail page."""
    html = get_page(url)
    if not html:
        return {}

    soup = BeautifulSoup(html, 'html.parser')
    details = {}

    # The label spans on the detail page are Chinese ("编剧" = screenwriter,
    # "主演" = starring, and so on), so the patterns below must stay Chinese.
    screenwriter_span = soup.find('span', class_='pl', string=re.compile('编剧'))
    if screenwriter_span:
        screenwriters_span = screenwriter_span.find_next_sibling('span', class_='attrs')
        if screenwriters_span:
            details['screenwriters'] = [a.text.strip() for a in screenwriters_span.find_all('a')]

    actor_span = soup.find('span', class_='pl', string=re.compile('主演'))
    if actor_span:
        actors_container = actor_span.find_next_sibling('span', class_='attrs')
        if actors_container:
            details['actors'] = [a.text.strip() for a in actors_container.find_all('a', rel='v:starring')]

    # Genre spans carry property="v:genre" (not a class), and a movie can have
    # several genres, so collect them all.
    genre_spans = soup.find_all('span', property='v:genre')
    if genre_spans:
        details['movie_type'] = '/'.join(span.text.strip() for span in genre_spans)

    # Country/region and language are bare text nodes after the label span.
    region_span = soup.find('span', class_='pl', string=re.compile('制片国家/地区:'))
    if region_span:
        details['region'] = region_span.find_next_sibling(string=True).strip()

    language_span = soup.find('span', class_='pl', string=re.compile('语言:'))
    if language_span:
        details['language'] = language_span.find_next_sibling(string=True).strip()

    release_span = soup.find('span', class_='pl', string=re.compile('上映日期:'))
    if release_span:
        details['release_date'] = release_span.find_next_sibling('span', property='v:initialReleaseDate').text.strip()

    duration_span = soup.find('span', class_='pl', string=re.compile('片长:'))
    if duration_span:
        details['duration'] = duration_span.find_next_sibling('span', property='v:runtime').text.strip()

    aka_span = soup.find('span', class_='pl', string=re.compile('又名:'))
    if aka_span:
        details['aka'] = aka_span.find_next_sibling(string=True).strip()

    imdb_span = soup.find('span', class_='pl', string=re.compile('IMDb:'))
    if imdb_span:
        details['imdb'] = imdb_span.find_next_sibling(string=True).strip()

    summary_span = soup.find('span', property='v:summary')
    if summary_span:
        details['summary'] = summary_span.text.strip()

    return details
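
# Shape of the returned dict (a sketch; the values are illustrative, and any
# field missing from the page is simply absent from the dict):
#
#   {'screenwriters': [...], 'actors': [...], 'movie_type': '剧情/犯罪',
#    'region': '美国', 'language': '英语', 'release_date': '1994-09-10',
#    'duration': '142分钟', 'aka': '...', 'imdb': 'tt0111161', 'summary': '...'}
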
def parse_page(html):
    """Parse one page of the Top 250 list and collect each movie's record."""
    if not html:
        return

    soup = BeautifulSoup(html, 'html.parser')
    movies = soup.find_all('div', class_='item')

    for movie in movies:
        try:
            rank = movie.find('em').text.strip()
            title = movie.find('span', class_='title').text.strip()
            link = movie.find('a')['href']
            rating = movie.find('span', class_='rating_num').text.strip()

            bd_elem = movie.find('div', class_='bd')
            director_actors = bd_elem.find('p').text.strip() if bd_elem and bd_elem.find('p') else ''
            director = extract_director_names(director_actors)

            details = get_movie_details(link)
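
            # Small courtesy delay between detail-page requests so the scraper
            # does not hammer Douban (the one-second value is an assumption;
            # tune or remove as needed).
            time.sleep(1)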

            movie_info = {
                'Rank': rank,
                'Title': title,
                'Aka': details.get('aka', ''),
                'Link': link,
                'Rating': rating,
                'Director': director,
                'Screenwriters': ','.join(details.get('screenwriters', [])),
                'Cast': ','.join(details.get('actors', [])),
                'Genre': details.get('movie_type', ''),
                'Country/Region': details.get('region', ''),
                'Language': details.get('language', ''),
                'Release Date': details.get('release_date', ''),
                'Runtime': details.get('duration', ''),
                'IMDb': details.get('imdb', ''),
                'Synopsis': details.get('summary', ''),
            }

            movie_list.append(movie_info)
            print(f"Scraped: {rank}. {title}")

        except Exception as e:
            print(f"Error while processing a movie: {e}")
            continue

def save_to_csv():
    """Write the collected records to a CSV file."""
    if not movie_list:
        print("No data to save")
        return

    # utf-8-sig writes a BOM so Excel detects the encoding correctly.
    with open('douban_top250.csv', 'w', newline='', encoding='utf-8-sig') as f:
        fieldnames = movie_list[0].keys()
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(movie_list)
    print(f"Saved to douban_top250.csv, {len(movie_list)} records in total")
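
# Reading the file back (a sketch, assuming the column names above):
#
#   with open('douban_top250.csv', encoding='utf-8-sig') as f:
#       for row in csv.DictReader(f):
#           print(row['Rank'], row['Title'], row['Rating'])
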
def main():
    """Entry point: crawl all ten list pages (25 movies each), then save."""
    print("Starting to scrape Douban Movie Top 250...")
    for start in range(0, 250, 25):
        url = f"{base_url}?start={start}"
        print(f"Scraping page {start // 25 + 1}: {url}")
        html = get_page(url)
        parse_page(html)

    save_to_csv()


if __name__ == "__main__":
    main()