# Uses the requests, re (regular expressions) and csv modules.
# Scrapes the title, rating, director and cast of each film in the Douban Top 250.
import csv
import re

import requests
# Base address of the Douban Top 250 list; the page is selected by appending
# a "?start=N&filter=" query string to it.
url = "https://movie.douban.com/top250"

# Request headers that present the scraper as a desktop Chrome browser.
# The value must begin with "Mozilla/5.0": the previous literal carried a
# stray leading ": " (a copy/paste leftover), which made the header malformed.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/96.0.4664.45 Safari/537.36"
    ),
}

# Pattern compiled once and reused for every page. For each <li> entry it
# captures three named groups:
#   name   - text of the first <span class="title"> after the <li>
#   actors - body of the <p class=""> paragraph
#   score  - text of the rating_num span
# re.S lets "." match newlines, because one entry spans many lines of HTML.
obj = re.compile(
    r'<li>.*?<span class="title">(?P<name>.*?)</span>.*?'
    r'<p class="">(?P<actors>.*?)</p>.*?'
    r'<span class="rating_num" property="v:average">(?P<score>.*?)</span>.*?',
    re.S,
)
def clean_credits(raw):
    """Flatten the raw credits markup of one list entry onto a single line.

    Every ordinary space and newline is removed first; afterwards each
    ``&nbsp;`` entity is turned into one plain space, so the entities are
    the only separators that survive.

    Args:
        raw: Text captured by the ``actors`` group of the page pattern.

    Returns:
        The cleaned, single-line text.
    """
    # Order matters: spaces are stripped before the entities are expanded,
    # otherwise the spaces produced from "&nbsp;" would be stripped as well.
    return raw.replace(" ", "").replace("\n", "").replace("&nbsp;", " ")


def main():
    """Fetch every page of the list and write its entries to peaTop250.csv.

    Pages are requested at offsets 0, 25, ..., 225. Each matched entry is
    written as three text lines: name, cleaned credits, and score.

    Raises:
        requests.HTTPError: If a page request returns an error status.
    """
    # The with-statement closes the file on exit; no explicit close() needed.
    with open("peaTop250.csv", mode="w", encoding="utf-8") as f:
        for start in range(0, 250, 25):
            page_url = url + "?start={}&filter=".format(start)
            print(page_url)
            # A timeout keeps a stalled connection from hanging the script.
            response = requests.get(url=page_url, headers=headers, timeout=10)
            # Fail loudly on an error status instead of silently writing
            # nothing for that page.
            response.raise_for_status()
            for match in obj.finditer(response.text):
                # NOTE(review): this label reads "author" although the value
                # is the film title; kept unchanged so output stays the same.
                f.write("作者:" + match.group("name") + "\n")
                f.write(clean_credits(match.group("actors")) + "\n")
                f.write("评分:" + match.group("score") + "\n")
    print("over")


if __name__ == "__main__":
    main()