import requests
from bs4 import BeautifulSoup
import xlwt

# Create the workbook and write the header row
book = xlwt.Workbook(encoding='utf-8', style_compression=0)
sheet = book.add_sheet('Douban Movie Top 250')
sheet.write(0, 0, 'Name')
sheet.write(0, 1, 'Image')
sheet.write(0, 2, 'Rank')
sheet.write(0, 3, 'Score')
sheet.write(0, 4, 'Author')
sheet.write(0, 5, 'Summary')

# Next spreadsheet row to write; row 0 holds the headers
n = 1


def main(page):
    global n  # row counter shared across all pages
    # Each page of the Top 250 list shows 25 movies
    url = 'https://movie.douban.com/top250?start=' + str(page * 25) + '&filter='
    html = request_douban(url)
    if html is None:
        return
    soup = BeautifulSoup(html, 'lxml')
    movie_list = soup.find(class_='grid_view').find_all('li')
    for item in movie_list:
        item_name = item.find(class_='title').string
        item_img = item.find('a').find('img').get('src')
        item_index = item.find('em').string  # rank number
        item_score = item.find(class_='rating_num').string
        item_author = item.find('p').text
        # Not every movie has a one-line quote ('inq'); default to empty
        item_intr = ''
        if item.find(class_='inq') is not None:
            item_intr = item.find(class_='inq').string

        print('Scraping movie: ' + item_index + ' | ' + item_name + ' | ' + item_score + ' | ' + item_intr)
        sheet.write(n, 0, item_name)
        sheet.write(n, 1, item_img)
        sheet.write(n, 2, item_index)
        sheet.write(n, 3, item_score)
        sheet.write(n, 4, item_author)
        sheet.write(n, 5, item_intr)
        n = n + 1


def request_douban(url):
    # Douban rejects requests that lack a browser-like User-Agent header
    headers = {'user-agent': 'Mozilla/5.0'}
    max_try_num = 20
    for _ in range(max_try_num):
        try:
            response = requests.get(url, headers=headers, timeout=60)
            if response.status_code == 200:
                return response.text
        except requests.RequestException:
            continue  # transient network error; try again
    print('Has tried %d times to access url %s, all failed!' % (max_try_num, url))
    return None


if __name__ == '__main__':
    # The Top 250 list spans 10 pages of 25 movies each
    for i in range(0, 10):
        main(i)
    book.save('Douban_Top250_Movies.xls')
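
To run this script you need Python 3 plus the third-party packages it relies on: pip install requests beautifulsoup4 lxml xlwt. Note that xlwt only writes the legacy .xls (Excel 97-2003) format; a library such as openpyxl (with a different API) would be needed for .xlsx output. As a quick sanity check, you can read the workbook back with xlrd (a minimal sketch; xlrd is an extra dependency not used by the script itself):

import xlrd

wb = xlrd.open_workbook('Douban_Top250_Movies.xls')
print(wb.sheet_by_index(0).nrows)  # expect 251: one header row + 250 movies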