爬取豆瓣电影top250

记录使用 Python 爬取豆瓣电影 Top250 的过程。

直接上代码:
import requests
from bs4 import BeautifulSoup
import xlwt

# Workbook that collects the scraped rows; it is saved at the end of the script.
book = xlwt.Workbook(encoding='utf-8', style_compression=0)
sheet = book.add_sheet('豆瓣电影Top250')

# Header row (row 0): one heading per column that main() fills for each movie.
for column, heading in enumerate(('名称', '图片', '排名', '评分', '作者', '简介')):
    sheet.write(0, column, heading)

# Index of the next free data row; main() advances it after every movie.
n = 1


def main(page):
    """Scrape one page of the Douban Top 250 list into the worksheet.

    Each movie found on the page is printed and written as one row of the
    module-level ``sheet``; the module-level row counter ``n`` is advanced
    once per movie. Nothing is written when the page could not be downloaded
    or does not contain the expected ``grid_view`` element.

    Args:
        page: Page index; the request asks for the list starting at offset
            ``page * 25``.
    """
    global n

    url = 'https://movie.douban.com/top250?start=' + str(page * 25) + '&filter='
    html = request_douban(url)
    if not html:
        # request_douban() returns None when no attempt produced a 200 response;
        # feeding that to BeautifulSoup would raise.
        print("No content received for %s, page skipped." % url)
        return

    soup = BeautifulSoup(html, 'lxml')
    grid = soup.find(class_='grid_view')
    if grid is None:
        print("No movie list found in %s, page skipped." % url)
        return

    for item in grid.find_all('li'):
        item_name = item.find(class_='title').string
        item_img = item.find('a').find('img').get('src')
        item_index = item.find(class_='').string
        item_score = item.find(class_='rating_num').string
        item_author = item.find('p').text

        # The quote element is optional. Start from an empty string for every
        # movie so a missing quote neither raises NameError nor silently
        # reuses the quote of the previous movie.
        item_intr = ''
        quote = item.find(class_='inq')
        if quote is not None and quote.string is not None:
            item_intr = quote.string

        print('爬取电影:' + item_index + ' | ' + item_name + ' | ' + item_score + ' | ' + item_intr)

        # Column order matches the header row written at module level.
        row = (item_name, item_img, item_index, item_score, item_author, item_intr)
        for column, value in enumerate(row):
            sheet.write(n, column, value)
        n = n + 1


def request_douban(url):
    """Download ``url`` and return the response body as text.

    The request is attempted up to 20 times. An attempt counts as failed when
    ``requests`` raises a ``RequestException`` or when the response status
    code is not 200.

    Args:
        url: Address to fetch.

    Returns:
        The text of the first response with status 200, or ``None`` when
        every attempt failed (a message is printed in that case).
    """
    max_try_num = 20
    # Built once: the header is the same for every attempt.
    headers = {"user-agent": "Mozilla/5.0"}

    for _ in range(max_try_num):
        try:
            response = requests.get(url, headers=headers, timeout=60)
        except requests.RequestException:
            # Network-level failure: move on to the next attempt.
            continue
        if response.status_code == 200:
            return response.text

    # Reached only when no attempt succeeded, whether the failures were
    # exceptions or non-200 responses, so the caller always gets a report.
    print("Has tried %d times to access url %s, all failed!" % (max_try_num, url))
    return None


# Script entry point: scrape the ten list pages (indices 0-9) in order.
if __name__ == '__main__':
    for page_index in range(10):
        main(page_index)

# Write the workbook to disk.
book.save('豆瓣最受欢迎的250部电影.xls')
文章作者: Joker
文章链接: https://qytayh.github.io/2020/05/%E7%88%AC%E5%8F%96%E8%B1%86%E7%93%A3%E7%94%B5%E5%BD%B1top250/
版权声明: 本博客所有文章除特别声明外,均采用 CC BY-NC-SA 4.0 许可协议。转载请注明来自 Joker's Blog