爬取网易云音乐
效果图
spider_music.py 主程序(入口)
# coding=gbk
from download import Download
from url_manager import Url_Manager
from html_parser import Html_Parser
from save import Save
from set_text_color import Set_Color


class Spider_Music():
    """Crawl a NetEase Cloud Music playlist and download the songs it lists.

    Wires together the collaborators imported at the top of this module:
    a downloader, a URL queue, an HTML parser, a file saver and a console
    colour helper.
    """

    def __init__(self):
        self.download = Download()
        self.url_manager = Url_Manager()
        self.html_parser = Html_Parser()
        self.save = Save()
        self.set_color = Set_Color()

    def craw(self, url, max_retries=3):
        """Download everything reachable from *url*.

        The start URL is queued under the placeholder name ``'temp'``; every
        URL the parser returns is queued in turn until the queue is empty.

        Args:
            url: Start URL (a playlist page).
            max_retries: How many times a URL whose download returned ``None``
                is re-queued before it is given up on.
        """
        failures = {}  # url string -> number of failed download attempts
        self.url_manager.addurl({'url': url, 'name': 'temp'})

        while self.url_manager.checknewurllength > 0:
            newurl = self.url_manager.geturl()

            # Skip songs that already exist on disk.
            if self.save.checkfile(newurl['name']):
                self.set_color.printDarkRed("{} 已下载!\n".format(newurl['name']))
                continue

            print("开始下载 {} {}".format(newurl['name'], newurl['url']))
            htmlcontent = self.download.download(newurl['url'])

            if htmlcontent['htmlcontents'] is None:
                # Download failed: move the URL back from "done" to "pending",
                # but only a bounded number of times so a dead link cannot
                # keep the loop alive forever.
                attempts = failures.get(newurl['url'], 0) + 1
                failures[newurl['url']] = attempts
                if attempts <= max_retries:
                    self.url_manager.delUrl(newurl)
                    self.url_manager.addurl(newurl)
                else:
                    print("下载失败 {} ".format(newurl['name']))
                # Never hand missing content to the parser.
                continue

            parsed = self.html_parser.parser(htmlcontent)
            if not parsed:
                # The parser may return None when it cannot read the page.
                print("解析失败 {} ".format(newurl['name']))
                continue

            newurls, result = parsed
            self.url_manager.addurls(newurls)
            self.save.save(result, newurl['name'])
            print("下载完成 {} ".format(newurl['name']))

        print("共下载{}首歌曲".format(self.save.count))

    def main(self):
        """Crawl the hard-coded playlist."""
        self.craw('https://music.163.com/#/playlist?id=2492536378')


spider = Spider_Music()
spider.main()
download.py负责下载
# coding=gbk
import re
import requests
from selenium import webdriver
import random
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options


class Download():
    """Fetch a playlist page (headless Chrome) or an mp3 file (requests).

    ``download(url)`` inspects the URL and returns a dict with two keys:
    ``identify`` (1 for a song URL, 2 for a playlist URL, False otherwise)
    and ``htmlcontents`` (the fetched payload, ``None`` when the fetch
    failed, ``''`` when the URL matched neither pattern).
    """

    # User-Agent pool; one entry is picked at random for each mp3 request.
    __uas = [
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0; Baiduspider-ads) Gecko/17.0 Firefox/17.0",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9b4) Gecko/2008030317 Firefox/3.0b4",
        "Mozilla/5.0 (Windows; U; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727; BIDUBrowser 7.6)",
        "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko",
        "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.99 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; Win64; x64; Trident/7.0; Touch; LCJB; rv:11.0) like Gecko",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
    ]

    # "ip:port" proxy strings. Stays empty while get_ip() is disabled
    # (no reliable free proxy source was found).
    __ips = []

    # Headers used when resolving the real mp3 address on music.163.com.
    headers = {
        # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        'Referer': 'http://music.163.com/',
        'Host': 'music.163.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
    }

    # Location of the chromedriver executable; override per installation.
    chromedriver_path = "D:\\tools\\chromedriver_win32\\chromedriver.exe"

    # Seconds before an HTTP request is abandoned.
    timeout = 30

    def __init__(self):
        self.url = ''
        # self.__ips = self.get_ip()

    def download(self, url):
        """Remember *url* and return the result dict built by ``patterns``."""
        self.url = url
        return self.patterns

    @property
    def patterns(self):
        """Classify ``self.url`` and fetch it with the matching strategy."""
        playlist = re.compile(r"playlist\?id=\d+")  # playlist page
        song = re.compile(r"song/media/outer/url\?id=\d+")  # mp3 download address

        res = {
            'identify': False,
            'htmlcontents': '',
        }
        if re.search(song, self.url):
            res['identify'] = 1
            res['htmlcontents'] = self.getmusic()  # mp3 bytes
        elif re.search(playlist, self.url):
            res['identify'] = 2
            res['htmlcontents'] = self.geturl()  # rendered page source
        return res

    # No reliable proxy source was found, so this is currently unused.
    def get_ip(self):
        """Scrape a free-proxy listing and return ``"ip:port"`` strings."""
        url = "https://www.kuaidaili.com/free/inha/1/"
        res = requests.get(url, timeout=self.timeout)
        soup = BeautifulSoup(res.text, 'html.parser')
        data = soup.find(id="list").find('tbody').find_all('tr')
        # NOTE(review): both patterns end in a literal space, which looks like
        # damage from HTML tags being stripped out of the original source;
        # confirm against the real page markup before re-enabling proxies.
        ip_compile = re.compile(r'(\d+\.\d+\.\d+\.\d+) ')  # IP address
        port_compile = re.compile(r'(\d+) ')  # port
        ip = re.findall(ip_compile, str(data))
        port = re.findall(port_compile, str(data))
        # Pair them up, e.g. 115.112.88.23:8080
        return [":".join(i) for i in zip(ip, port)]

    def getmusic(self):
        """Return the mp3 bytes for ``self.url``, or ``None`` on failure."""
        from urllib.parse import urlsplit

        try:
            url = self.getrealurl()
            headers = {
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                "Accept-Encoding": "gzip, deflate",
                "Accept-Language": "zh-CN,zh;q=0.9",
                # netloc is the host part for both http and https URLs;
                # str.strip('http://') strips a character set, not a prefix.
                "Host": urlsplit(url).netloc,
                # Vary the browser identity; choice() can reach every entry.
                "User-Agent": random.choice(self.__uas),
            }
            # Only use a proxy when one is configured: random.choice() on an
            # empty list raises and would make every download fail.
            proxies = None
            if self.__ips:
                ip = random.choice(self.__ips)
                proxies = {
                    'http': 'http://' + ip,
                    'https': 'http://' + ip,
                }
            res = requests.get(url, headers=headers, proxies=proxies,
                               timeout=self.timeout)
        except requests.RequestException as e:
            print(e)
            return None
        else:
            return res.content

    def getrealurl(self):
        """Follow redirects from ``self.url`` and return the final URL."""
        res = requests.get(self.url, headers=self.headers, timeout=self.timeout)
        return res.url

    def geturl(self):
        """Render ``self.url`` in headless Chrome and return the page source.

        The driver is switched into the ``contentFrame`` frame before the
        source is read. Returns ``None`` when anything goes wrong.
        """
        brower = None
        try:
            chrome_options = Options()
            chrome_options.add_argument('--headless')
            chrome_options.add_argument('--disable-gpu')
            # NOTE(review): passing the driver path positionally is rejected
            # by recent Selenium 4 releases; confirm the installed version.
            brower = webdriver.Chrome(self.chromedriver_path, options=chrome_options)
            brower.get(self.url)
            # "name" is the locator strategy string behind By.NAME; this form
            # works where find_element_by_name has been removed.
            brower.switch_to.frame(brower.find_element("name", "contentFrame"))
            return brower.page_source
        except Exception as e:
            # Best-effort boundary: report and let the caller retry.
            print(e)
            return None
        finally:
            # Always shut the browser down so chromedriver processes
            # do not pile up, one per playlist fetch.
            if brower is not None:
                brower.quit()

# d = Download()
#print(d.get_ip())
#res = d.download('http://music.163.com/song/media/outer/url?id=28160459.mp3')
#print(res['identify'])
#print(res['htmlcontents'])
html_parser.py负责网页内容解析
from bs4 import BeautifulSoup


class Html_Parser():
    """Turn a download result into (new URLs to crawl, payload to save)."""

    # Template for a song's direct download address; the placeholder is
    # filled with the query string of the song link (e.g. "id=123").
    baseurl = "http://music.163.com/song/media/outer/url?{}.mp3"

    def parser(self, res):
        """Dispatch on ``res['identify']``.

        Returns:
            ``(None, content)`` when ``identify`` is 1 (a song: nothing new
            to crawl, ``htmlcontents`` is the payload); otherwise whatever
            ``geturls`` extracts from ``res['htmlcontents']``.
        """
        if res.get('identify') == 1:
            return None, res.get('htmlcontents', False)
        return self.geturls(res['htmlcontents'])

    def geturls(self, htmlcontent):
        """Extract song links from a playlist page.

        Reads the first ``<table>``, skips its first row, and takes the link
        in the second cell of every remaining row.

        Returns:
            ``(songs, False)`` where *songs* is a list of
            ``{'url': ..., 'name': ...}`` dicts. The list is empty when the
            page cannot be parsed, so callers can always unpack the result.
        """
        newsurl = []
        try:
            soup = BeautifulSoup(htmlcontent, 'html.parser')
            songlist = soup.find('table').find_all('tr')[1:]
        except (AttributeError, TypeError) as e:
            # No content at all, or no <table> on the page.
            print(e)
            return newsurl, False

        for link in songlist:
            try:
                anchor = link.find_all('td')[1].find('a')
                url = self.baseurl.format(anchor['href'].split('?')[-1])
                name = anchor.find('b')['title']
            except (AttributeError, IndexError, KeyError, TypeError) as e:
                # One malformed row must not discard the rest of the playlist.
                print(e)
                continue
            newsurl.append({'url': url, 'name': name})
        return newsurl, False
url_manager.py URL 管理器
# coding=gbk
class Url_Manager():
    """Keep track of which URLs are still pending and which were handed out.

    Entries are stored as given (the crawler passes dicts), so plain lists
    are used rather than sets: dicts are not hashable.
    """

    def __init__(self):
        # Per-instance queues; as class attributes the lists would be shared
        # by every Url_Manager ever created.
        self.__newurl = []  # not yet downloaded
        self.__oldurl = []  # already handed out

    def addurl(self, url):
        """Queue *url* unless it is ``None`` or already known."""
        if url is None:
            return
        if self.checkurl(url):
            self.__newurl.append(url)

    def addurls(self, urls):
        """Queue every entry of *urls*; ``None`` is accepted and ignored."""
        if urls is None:
            return
        for url in urls:
            self.addurl(url)

    def geturl(self):
        """Pop the most recently queued URL and record it as handed out.

        Raises:
            IndexError: if nothing is pending (check ``checknewurllength``).
        """
        newurl = self.__newurl.pop()
        self.__oldurl.append(newurl)
        return newurl

    def delUrl(self, url):
        """Forget that *url* was handed out, so it can be queued again."""
        if url in self.__oldurl:
            self.__oldurl.remove(url)

    @property
    def checknewurllength(self):
        """Number of URLs still pending."""
        return len(self.__newurl)

    def checkurl(self, url):
        """Return True when *url* is neither pending nor already handed out."""
        return url not in self.__newurl and url not in self.__oldurl
save.py保存下载内容
# coding=gbk
import os


class Save():
    """Write downloaded songs to ``path`` as ``<name>.mp3`` files."""

    path = "./download/"  # target directory
    count = 0  # number of files written successfully

    # Characters that Windows does not allow in a file name.
    _FORBIDDEN = str.maketrans('', '', '\\/:*?"<>|')

    def __init__(self):
        self.mkdir(self.path)

    def save(self, contents, name):
        """Write *contents* (bytes) to the file derived from *name*.

        Does nothing when either argument is empty. Failures are printed
        rather than raised; ``count`` only grows on success.
        """
        if not (contents and name):
            return
        try:
            with open(self.remove_special_characters(name), 'wb') as f:
                f.write(contents)
        except (OSError, TypeError, ValueError) as e:
            print(e)
        else:
            self.count += 1

    # Create the directory the files are stored in.
    def mkdir(self, path):
        """Create *path* (and parents) if it does not exist yet."""
        # exist_ok avoids the gap between an exists() check and makedirs().
        os.makedirs(path, exist_ok=True)

    # Prevent downloading the same song twice.
    def checkfile(self, name):
        """Return True when the file for *name* already exists.

        The placeholder name ``'temp'`` (used for the playlist page itself)
        is never treated as downloaded.
        """
        if name == 'temp':
            return False
        return os.path.exists(self.remove_special_characters(name))

    # Make sure the file can be created on Windows.
    def remove_special_characters(self, string):
        """Return the full ``.mp3`` path for the song title *string*.

        Characters forbidden in Windows file names are dropped and
        surrounding whitespace is trimmed.
        """
        cleaned = string.translate(self._FORBIDDEN).strip()
        # Only trim the trailing separator: strip('/') would also eat the
        # leading slash of an absolute path and point somewhere else than
        # the directory mkdir() created.
        return '/'.join([self.path.rstrip('/'), cleaned]) + ".mp3"
set_text_color.py 设置 cmd 窗口显示颜色
# coding=gbk
# Reference: https://blog.csdn.net/wy_97/article/details/79663014
import ctypes
import sys


class Set_Color():
    """Print coloured text in a Windows CMD console.

    A colour code is two hexadecimal digits, each 0-f: the high digit is the
    background colour and the low digit the text colour. Foreground and
    background constants can be combined with ``|``.

    Where ``ctypes.windll`` is unavailable the colour calls do nothing and
    the ``print*`` methods simply write the text.
    """

    STD_INPUT_HANDLE = -10
    STD_OUTPUT_HANDLE = -11
    STD_ERROR_HANDLE = -12

    # Windows CMD text colours.
    FOREGROUND_BLACK = 0x00  # black.
    FOREGROUND_DARKBLUE = 0x01  # dark blue.
    FOREGROUND_DARKGREEN = 0x02  # dark green.
    FOREGROUND_DARKSKYBLUE = 0x03  # dark skyblue.
    FOREGROUND_DARKRED = 0x04  # dark red.
    FOREGROUND_DARKPINK = 0x05  # dark pink.
    FOREGROUND_DARKYELLOW = 0x06  # dark yellow.
    FOREGROUND_DARKWHITE = 0x07  # dark white.
    FOREGROUND_DARKGRAY = 0x08  # dark gray.
    FOREGROUND_BLUE = 0x09  # blue.
    FOREGROUND_GREEN = 0x0a  # green.
    FOREGROUND_SKYBLUE = 0x0b  # skyblue.
    FOREGROUND_RED = 0x0c  # red.
    FOREGROUND_PINK = 0x0d  # pink.
    FOREGROUND_YELLOW = 0x0e  # yellow.
    FOREGROUND_WHITE = 0x0f  # white.

    # Windows CMD background colours.
    # The two dark entries used to be named BACKGROUND_BLUE/BACKGROUND_GREEN
    # and were immediately overwritten by the bright definitions below.
    BACKGROUND_DARKBLUE = 0x10  # dark blue.
    BACKGROUND_DARKGREEN = 0x20  # dark green.
    BACKGROUND_DARKSKYBLUE = 0x30  # dark skyblue.
    BACKGROUND_DARKRED = 0x40  # dark red.
    BACKGROUND_DARKPINK = 0x50  # dark pink.
    BACKGROUND_DARKYELLOW = 0x60  # dark yellow.
    BACKGROUND_DARKWHITE = 0x70  # dark white.
    BACKGROUND_DARKGRAY = 0x80  # dark gray.
    BACKGROUND_BLUE = 0x90  # blue.
    BACKGROUND_GREEN = 0xa0  # green.
    BACKGROUND_SKYBLUE = 0xb0  # skyblue.
    BACKGROUND_RED = 0xc0  # red.
    BACKGROUND_PINK = 0xd0  # pink.
    BACKGROUND_YELLOW = 0xe0  # yellow.
    BACKGROUND_WHITE = 0xf0  # white.

    # Console output handle; None where the Windows API is not available,
    # so that the class can still be defined and used on other platforms.
    std_out_handle = (
        ctypes.windll.kernel32.GetStdHandle(STD_OUTPUT_HANDLE)
        if hasattr(ctypes, "windll") else None
    )

    def set_cmd_text_color(self, color, handle=False):
        """Set the console text attribute to *color*.

        Args:
            color: Colour code built from the constants above.
            handle: Console handle to use; the stdout handle when falsy.

        Returns:
            The result of ``SetConsoleTextAttribute``, or 0 when no console
            API / handle is available.
        """
        target = handle if handle else self.std_out_handle
        if target is None or not hasattr(ctypes, "windll"):
            return 0
        return ctypes.windll.kernel32.SetConsoleTextAttribute(target, color)

    def resetColor(self):
        """Set the text colour to FOREGROUND_GREEN after a coloured write.

        NOTE(review): the previous comment said "reset white", but the code
        has always selected green; confirm which one is intended.
        """
        self.set_cmd_text_color(self.FOREGROUND_GREEN)

    def resetDefault(self):
        """Set the text colour to red | green | blue (evaluates to 0x0f)."""
        self.set_cmd_text_color(self.FOREGROUND_RED | self.FOREGROUND_GREEN | self.FOREGROUND_BLUE)

    def _write(self, color, mess):
        """Write *mess* in *color*, then restore the colour via resetColor."""
        self.set_cmd_text_color(color)
        sys.stdout.write(mess)
        # Push the text out before the colour changes back; otherwise
        # buffered output could be emitted after the reset.
        sys.stdout.flush()
        self.resetColor()

    # ---- text colours -------------------------------------------------

    def printDarkBlue(self, mess):
        """Write *mess* in dark blue."""
        self._write(self.FOREGROUND_DARKBLUE, mess)

    def printDarkGreen(self, mess):
        """Write *mess* in dark green."""
        self._write(self.FOREGROUND_DARKGREEN, mess)

    def printDarkSkyBlue(self, mess):
        """Write *mess* in dark sky blue."""
        self._write(self.FOREGROUND_DARKSKYBLUE, mess)

    def printDarkRed(self, mess):
        """Write *mess* in dark red."""
        self._write(self.FOREGROUND_DARKRED, mess)

    def printDarkPink(self, mess):
        """Write *mess* in dark pink."""
        self._write(self.FOREGROUND_DARKPINK, mess)

    def printDarkYellow(self, mess):
        """Write *mess* in dark yellow."""
        self._write(self.FOREGROUND_DARKYELLOW, mess)

    def printDarkWhite(self, mess):
        """Write *mess* in dark white."""
        self._write(self.FOREGROUND_DARKWHITE, mess)

    def printDarkGray(self, mess):
        """Write *mess* in dark gray."""
        self._write(self.FOREGROUND_DARKGRAY, mess)

    def printBlue(self, mess):
        """Write *mess* in blue."""
        self._write(self.FOREGROUND_BLUE, mess)

    def printGreen(self, mess):
        """Write *mess* in green."""
        self._write(self.FOREGROUND_GREEN, mess)

    def printSkyBlue(self, mess):
        """Write *mess* in sky blue."""
        self._write(self.FOREGROUND_SKYBLUE, mess)

    def printRed(self, mess):
        """Write *mess* in red."""
        self._write(self.FOREGROUND_RED, mess)

    def printPink(self, mess):
        """Write *mess* in pink."""
        self._write(self.FOREGROUND_PINK, mess)

    def printYellow(self, mess):
        """Write *mess* in yellow."""
        self._write(self.FOREGROUND_YELLOW, mess)

    def printWhite(self, mess):
        """Write *mess* in white."""
        self._write(self.FOREGROUND_WHITE, mess)

    # ---- background + text combinations -------------------------------

    def printWhiteBlack(self, mess):
        """Write *mess* as black text on a white background."""
        self._write(self.FOREGROUND_BLACK | self.BACKGROUND_WHITE, mess)

    def printWhiteBlack_2(self, mess):
        """Write *mess* as black text on white, using the literal code 0xf0."""
        self._write(0xf0, mess)

    def printYellowRed(self, mess):
        """Write *mess* as red text on a yellow background."""
        self._write(self.BACKGROUND_YELLOW | self.FOREGROUND_RED, mess)

# ----------------------------------------------------------------------
"""
if __name__ == '__main__':print
printDarkBlue('printDarkBlue:暗蓝色文字\n')
printDarkGreen('printDarkGreen:暗绿色文字\n')
printDarkSkyBlue(u'printDarkSkyBlue:暗天蓝色文字\n')
printDarkRed(u'printDarkRed:暗红色文字\n')
printDarkPink(u'printDarkPink:暗粉红色文字\n')
printDarkYellow(u'printDarkYellow:暗黄色文字\n')
printDarkWhite(u'printDarkWhite:暗白色文字\n')
printDarkGray(u'printDarkGray:暗灰色文字\n')
printBlue(u'printBlue:蓝色文字\n')
printGreen(u'printGreen:绿色文字\n')
printSkyBlue(u'printSkyBlue:天蓝色文字\n')
printRed(u'printRed:红色文字\n')
printPink(u'printPink:粉红色文字\n')
printYellow(u'printYellow:黄色文字\n')
printWhite(u'printWhite:白色文字\n')
printWhiteBlack(u'printWhiteBlack:白底黑字输出\n')
printWhiteBlack_2(u'printWhiteBlack_2:白底黑字输出\n')
printYellowRed('printYellowRed:黄底红字输出\n')
"""
#c = Set_Color()
#c.printDarkRed(u'printDarkRed:暗红色文字\n')
推荐阅读
- 赠己诗
- 八、「料理风云」
- 西湖游
- 两短篇
- 9531
- NeuVector 会是下一个爆款云原生安全神器吗()
- S8大连侠盗勇士
- 使用协程爬取网页,计算网页数据大小
- 走向天空,走向云(小说)3
- 2018年7月11日|2018年7月11日 星期三 多云转晴(18)