python爬虫 python爬虫

目前该代码只实现了对当前页面中出现的所有 url 里的图片的爬取，并没有做分页（第 N 页）的爬取。另外天狗网页实在是太慢了，建议大家换个网页爬取吧！话不多说，直接看代码：

# -*- coding:utf-8 -*-
"""Download the images of every gallery linked from the tngou index page.

Only galleries linked from the single index page are visited; pagination is
not implemented.  The script runs on both Python 2 and Python 3.
"""
import os
import re
from contextlib import closing

try:  # Python 3
    from urllib.request import urlopen, urlretrieve
except ImportError:  # Python 2, which the script was originally written for
    from urllib import urlopen, urlretrieve

INDEX_URL = "http://tnfs.tngou.net/"
SHOW_URL = 'http://www.tngou.net/tnfs/show/'
SAVE_ROOT = 'G:\\xxoo'

# Captures the gallery id that follows ".../tnfs/show/" in each link.
GALLERY_LINK_RE = re.compile(r'href="http://www.tngou.net/tnfs/show/(.*?)"')
# Captures an image address without its ".jpg" suffix.
# NOTE(review): the published listing showed this pattern prefixed with the
# hosting blog's own article URL, which would make the captured text a
# fragment that urlretrieve cannot download.  It is restored here to match
# the src attribute directly -- confirm against the live page markup.
IMAGE_SRC_RE = re.compile(r'src="(.*?)\.jpg"')


def _fetch(url):
    """Return the body of *url* as text, closing the connection afterwards."""
    with closing(urlopen(url)) as response:
        body = response.read()
    # read() yields bytes on Python 3, and the patterns above are text
    # patterns.  Assumes the site serves UTF-8 (TODO confirm); undecodable
    # bytes are replaced instead of aborting the crawl.
    return body.decode('utf-8', 'replace')


def ID(html=None):
    """Return the gallery ids linked from the index page.

    Args:
        html: Optional index page markup.  When omitted, the page at
            INDEX_URL is downloaded.

    Returns:
        A list with one id string per matching link, in page order.
    """
    if html is None:
        html = _fetch(INDEX_URL)
    return GALLERY_LINK_RE.findall(html)


def get_img(id, root=SAVE_ROOT):
    """Download every .jpg referenced by gallery *id* into ``root/id``.

    Images are saved as 1.jpg, 2.jpg, ... in the order they appear on the
    gallery page.

    Args:
        id: Gallery id as returned by ID().
        root: Directory under which the per-gallery folder is created.
    """
    html = _fetch(SHOW_URL + id)
    img_urls = IMAGE_SRC_RE.findall(html)

    folder = os.path.join(root, id)
    if os.path.exists(folder):
        print('OK, the "%s" file exists.' % folder)
    else:
        # makedirs also creates a missing root directory, which os.mkdir
        # would refuse to do.
        os.makedirs(folder)

    for index, img_url in enumerate(img_urls, 1):
        print(img_url)
        target = os.path.join(folder, '%s.jpg' % index)
        try:
            urlretrieve(img_url + '.jpg', target)
        except Exception as err:
            # Deliberately best-effort: report the failure and carry on with
            # the remaining images.
            print(err)


if __name__ == '__main__':
    for gallery_id in ID():
        print(gallery_id)
        print(type(gallery_id))
        get_img(gallery_id)

运行结果：