Python crawler: downloading images from Jandan (煎蛋)

The libraries used here are BeautifulSoup and requests.

```python
# encoding:utf-8
import os, urllib, re, urllib2, requests, gzip
from StringIO import StringIO
from bs4 import BeautifulSoup

# Request headers that make us look like a browser
req_header = {
    'User-Agent': 'Mozilla/4.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) '
                  'Chrome/23.0.1271.64 Safari/537.11',
    'Accept': 'text/html; q=0.9,*/*; q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8; q=0.7,*; q=0.3',
    'Accept-Encoding': 'gzip',
    'Connection': 'close',
    'Referer': None  # Note: if fetching still fails, set this to the target site's host
}

# Fetch the HTML text of the given URL
def get_html(url):
    # page = requests.get(url)
    req = urllib2.Request(url=url, headers=req_header)
    res = urllib2.urlopen(req, timeout=5)
    html = res.read()
    print 'Content-Encoding :', res.info().get('Content-Encoding')
    # If the page is gzip-compressed, decompress it first
    if res.info().get('Content-Encoding') == 'gzip':
        buf = StringIO(html)
        f = gzip.GzipFile(fileobj=buf)
        html = f.read()
    return html

# Match every src="....jpg" in the HTML and return the list of links
def get_imglink(html_text):
    img = re.compile(r'src="(.+?\.jpg)"')
    imglist = re.findall(img, html_text)
    return imglist

# Extract image links with BeautifulSoup
def get_imglink2(html_text):
    bs = BeautifulSoup(html_text, 'html.parser', from_encoding='utf-8')
    links = bs.find_all('img')
    imglist = []
    for link in links:
        imglist.append(link.get('src'))
    return imglist

# Download each image in the link list and write it to a file;
# requests is used here because it is more stable than urllib
def get_img2(imgs, path):
    dirname = './%s' % path
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    for imgurl in imgs:
        filename = imgurl.split('/')[-1]
        local = os.path.join(dirname, filename)
        print local
        try:
            with open(local, 'wb') as jpg:
                jpg.write(requests.get('http:' + imgurl, stream=True, headers=req_header).content)
        except requests.RequestException, e:
            print e

if __name__ == '__main__':
    for num in range(3):
        html = get_html('http://jandan.net/ooxx/page-%s#comments' % str(2000 - num))
        imgs = get_imglink2(html)
        get_img2(imgs, u'你要的图')
        print u'图片共:', len(imgs)
```

  • Jandan's pages are served gzip-compressed, so the response has to be decompressed before the elements can be parsed out of it (see the requests-based fetch sketch after this list).
  • The try/except is needed because some imgurl values already start with an http prefix while others don't, so requests can throw an exception inside the download loop (a URL-normalization sketch also follows this list).
  • The with statement is a nice way to open files, since it closes the stream automatically.
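A side note on the gzip point: the requests library decodes a gzip Content-Encoding transparently, so when fetching with requests the manual StringIO/GzipFile step in get_html is not needed. A minimal sketch, reusing the article's req_header dict; the helper name get_html_requests is my own:

```python
import requests

def get_html_requests(url, headers=None):
    # requests handles "Content-Encoding: gzip" on its own, so the returned
    # text is already decompressed HTML, ready to hand to BeautifulSoup.
    res = requests.get(url, headers=headers, timeout=5)
    res.raise_for_status()
    return res.text
```

For example: html = get_html_requests('http://jandan.net/ooxx/page-2000#comments', req_header).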
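And on the http-prefix point: instead of relying on the exception handler, the image URL can be normalized before downloading. A minimal sketch; normalize_url is a hypothetical helper, not part of the original script:

```python
def normalize_url(imgurl):
    # Protocol-relative links like "//.../xxx.jpg" get a scheme prepended;
    # links that already start with "http" are returned untouched, so we
    # never build a broken URL such as "http:http://...".
    if imgurl.startswith('//'):
        return 'http:' + imgurl
    return imgurl
```

With this in place, the download call in get_img2 would become requests.get(normalize_url(imgurl), stream=True, headers=req_header).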
