# 爬取漫画书并分章节储存到本地 (Crawl a comic book and save each chapter to its own local folder)

#! python3
# downloadcomic.py - Downloads every single comic and stores each chapter in each file.
"""
Created on Sat Feb 24 17:46:07 2018


@author: He Yuanwei
"""


import requests, os, bs4
# Script body: fetch the comic's index page, then for every chapter walk its
# pages one by one via the "next" link and save each page's image into a
# per-chapter folder under D:\鹅毛笔.

# Seconds to wait on a silent server before a request is abandoned; without
# a timeout requests.get() can block forever.
REQUEST_TIMEOUT = 30
SITE_ROOT = 'http://www.tuku.cc'
# Raw string so the backslashes stay literal path separators: '\鹅' and '\第'
# are not valid escape sequences in an ordinary string literal.
CHAPTER_DIR = r'D:\鹅毛笔\第%s章'

url = 'http://www.tuku.cc/comic/22206/'  # starting url
res = requests.get(url, timeout=REQUEST_TIMEOUT)
res.raise_for_status()
soup = bs4.BeautifulSoup(res.text, 'lxml')

# Find the URL of each chapter. The index page lists the chapters newest
# first (e.g. 第8回, 第7回, ..., 第1回), which is why the chapter number below
# is computed as chapterNum - i.
comicChapter = soup.select('#chapterlistload a')
chapterNum = len(comicChapter)
if not comicChapter:
    print('Could not find comic chapter.')
else:
    for i, chapterLink in enumerate(comicChapter):
        chapterNo = str(chapterNum - i)
        chapterDir = CHAPTER_DIR % chapterNo
        os.makedirs(chapterDir, exist_ok=True)  # store comics in D:\鹅毛笔
        chapterUrl = SITE_ROOT + chapterLink.get('href')
        print('Opening %s, chapter %s' % (chapterUrl, chapterNo))

        # Follow the "next" link page by page until it points at '#'.
        while not chapterUrl.endswith('#'):
            try:
                # Download the page.
                print('Downloading page %s...' % chapterUrl)
                res = requests.get(chapterUrl, timeout=REQUEST_TIMEOUT)
                res.raise_for_status()
                soup = bs4.BeautifulSoup(res.text, 'lxml')

                # Find the URL of the comic image: the element whose id is
                # cp_image carries the image link in its src attribute.
                comicElem = soup.select('#cp_image')
                if not comicElem:
                    print('Could not find comic image.')
                else:
                    comicUrl = comicElem[0].get('src')
                    # Download the image.
                    print('Downloading image %s...' % (comicUrl))
                    imageRes = requests.get(comicUrl, timeout=REQUEST_TIMEOUT)
                    imageRes.raise_for_status()
                    # Save the image into the chapter folder, named after the
                    # last path component of the image URL. 'wb' because the
                    # payload is binary; the with-block closes the file even
                    # if a write fails.
                    imagePath = os.path.join(chapterDir,
                                             os.path.basename(comicUrl))
                    with open(imagePath, 'wb') as imageFile:
                        for chunk in imageRes.iter_content(100000):
                            imageFile.write(chunk)
            except (requests.RequestException, OSError) as err:
                # A failed request or file write ends the current chapter
                # (the script has always treated a failure here as the end
                # of the chapter); report the cause instead of hiding it,
                # then move on to the next chapter.
                print('Stopped at %s: %s' % (chapterUrl, err))
                print('chapter %s have been downloaded.' % chapterNo)
                break

            # Get the next button's url: the first <a> that immediately
            # follows an element with class "list".
            nextLinks = soup.select('.list + a')
            nextHref = nextLinks[0].get('href') if nextLinks else None
            if nextHref is None:
                # No usable "next" link on this page: the chapter is finished.
                print('chapter %s have been downloaded.' % chapterNo)
                break
            chapterUrl = SITE_ROOT + nextHref

print('Done.')

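# 推荐阅读 ("Recommended reading" - appears to be a footer left over from the web page this script was copied from)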