BiliBili等网站极验滑动验证码的详细破解过程 python

基于selenium自动化的滑动验证码破解 1.环境配置(Linux)

python2.7
pip安装的库：selenium ， PIL
Chrome浏览器 , Chromedriver （点击查看详细安装过程）

2.示例网站（BiliBili)
https://passport.bilibili.com/login
3.代码整体流程图

文章图片

4.完整代码

# -*- coding: utf-8 -*- import logging import time import random import re import requests import urlparse import pdb from selenium import webdriver from selenium.webdriver.common.action_chains import ActionChains from PIL import Image from io import BytesIO class Bilibili(object): """docstring for Bilibili""" js="""var keys=document.cookie.match(/[^ =; ]+(?=\=)/g); if (keys) { for (var i = keys.length; i--; ) document.cookie=keys[i]+'=0; expires=' + new Date( 0).toUTCString() } """ def __init__(self): """构造函数""" super(Bilibili,self).__init__() self.browser=webdriver.Chrome("your chromedrive's path") self.browser.set_page_load_timeout(20) self.browser.implicitly_wait(10)def __del__(self): """析构函数""" if self.browser is not None: self.browser.quit()def logging(self,username,password):self.browser.get("https://passport.bilibili.com/login") dom_input_id = self.browser.find_element_by_id("login-username") dom_input_keyword = self.browser.find_element_by_id("login-passwd") dom_btn_log = self.browser.find_element_by_xpath('//*[@class="btn-box"]/a[1]') #pdb.set_trace() dom_input_id.send_keys(username) dom_input_keyword.send_keys(password)flag_success = False while not flag_success: image_full_bg = self.get_image("gt_cut_fullbg_slice") # 下载完整的验证图 image_bg = self.get_image("gt_cut_bg_slice") # 下载有缺口的验证图 diff_x = self.get_diff_x(image_full_bg, image_bg) #pdb.set_trace() track = self.get_track(diff_x) result = self.simulate_drag(track) print resultif u'验证通过' in result: flag_success = True elif u'出现错误:' in result: self.browser.execute_script('location.reload()') elif u'再' in result: time.sleep(4) continue elif u'吃' in result: time.sleep(5) else: breakif flag_success: time.sleep(random.uniform(1.5, 2)) self.browser.execute_script(self.js)def get_image(self,class_name): """ 下载并还原极验的验证图 Args: class_name: 验证图所在的html标签的class name Returns: 返回验证图 Errors: IndexError: list index out of range. ajax超时未加载完成，导致image_slices为空 """ image_slices = self.browser.find_elements_by_class_name(class_name) #pdb.set_trace() if len(image_slices) == 0: print 'No such a class' div_style=image_slices[0].get_attribute('style') #pdb.set_trace() image_url = re.findall("background-image: url\(\"(.*)\"\); background-position: (.*)px (.*)px; ",div_style)[0][0] image_url = image_url.replace("webp","jpg") image_filename = urlparse.urlsplit(image_url).path.split('/')[-1]location_list = list() for image_slice in image_slices: location = dict() location['x'] = int(re.findall("background-image: url\(\"(.*)\"\); background-position: (.*)px (.*)px; ", image_slice.get_attribute('style'))[0][4]) location['y'] = int(re.findall("background-image: url\(\"(.*)\"\); background-position: (.*)px (.*)px; ", image_slice.get_attribute('style'))[0][5]) location_list.append(location)headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"} response = requests.get(image_url,headers=headers)image = Image.open(BytesIO(response.content)) image = self.recover_image(image,location_list) return imagedef recover_image(self,image,location_list): """ 还原验证图像 Args: image: 打乱的验证图像（PIL.Image数据类型） location_list: 验证图像每个碎片的位置 Returns: 还原过后的图像 """ new_im = Image.new('RGB',(260,116)) im_list_upper = [] im_list_down = [] for location in location_list: if location['y'] == -58: im_list_upper.append(image.crop((abs(location['x']), 58, abs(location['x']) + 10, 116))) if location['y'] == 0: im_list_down.append(image.crop((abs(location['x']), 0, abs(location['x']) + 10, 58)))x_offset = 0 for im in im_list_upper: new_im.paste(im, (x_offset, 0)) x_offset += im.size[0]x_offset = 0 for im in im_list_down: new_im.paste(im, (x_offset, 58)) x_offset += im.size[0]return new_imdef get_diff_x(self,image1,image2): """ 计算验证图的缺口位置（x轴）两张原始图的大小都是相同的260*116，那就通过两个for循环依次对比每个像素点的RGB值，如果RGB三元素中有一个相差超过50则就认为找到了缺口的位置 Args: image1: 图像1 image2: 图像2 Returns: x_offset """ for x in range(0, 260): for y in range(0, 116): if not self.__is_similar(image1, image2, x, y): return xdef __is_similar(self, image1, image2, x_offset, y_offset): """ 判断image1, image2的[x, y]这一像素是否相似，如果该像素的RGB值相差都在50以内，则认为相似。 Args: image1: 图像1 image2: 图像2 x_offset: x坐标 y_offset: y坐标 Returns: boolean """ pixel1 = image1.getpixel((x_offset, y_offset)) pixel2 = image2.getpixel((x_offset, y_offset)) for i in range(0, 3): if abs(pixel1[i] - pixel2[i]) >= 50: return False return Truedef get_track(self, x_offset):track = list() length = x_offset - 6 x = random.randint(1,5) while length - x >4: track.append([x,0,0]) length=length - x x= random.randint(1,15)for i in range(length): if x_offset>47: track.append([1,0,random.randint(10,12)/100.0]) else: track.append([1, 0, random.randint(13, 14)/100.0]) return trackdef simulate_drag(self, track):dom_div_slider = self.browser.find_element_by_xpath('//*[@id="gc-box"]/div/div[3]/div[2]')ActionChains(self.browser).click_and_hold(on_element=dom_div_slider).perform()for x,y,z in track: ActionChains(self.browser).move_to_element_with_offset( to_element=dom_div_slider, xoffset=x+22, yoffset=y+22).perform() time.sleep(z)time.sleep(0.9) ActionChains(self.browser).release(on_element=dom_div_slider).perform() time.sleep(1) dom_div_gt_info = self.browser.find_element_by_class_name('gt_info_type') return dom_div_gt_info.textif __name__ == '__main__': bilibili=Bilibili() bilibili.logging('username','password')

5.滑动验证码破解思路

文章图片

我们主要有两大问题：
一、如何找到图中的缺口的位置
二、如何模拟人拖拽滑块，欺骗其人机判别机制，使我们验证成功的几率增大
第一个问题：

首先，我们的任务是得到需要验证的图，点击鼠标右键对出现的图进行检查，细心的同学会发现，整张图片是碎片化的，而且在对应的html标签里我们能发现图片的url地址，访问该地址
得到图片

文章图片

正好证明了我们刚开始的想法
但是，可以看到，图片是杂乱无章的，这时候怎么办呢？
在刚才图片的html标签中中出现了一个重要的参数 background-position 就是每个碎片对应杂乱无章的那张整图的某个位置，再在前端按一定的排列顺序拼接，我们只需要按照前端这个顺序一一把碎片拼接起来就得到了一张完整的图片
这样的图片有两张，一张是有缺口的，另一张是没有缺口的
我们将两张拼接好的图片进行RGB像素对比，最终就能找到缺口的位置
第二个问题：
我们分析下人滑动滑块时会出现的动作
首先，刚开始距离缺口很远，滑动得很快，快要接近缺口了，滑动减慢
最后还要确定一下是否对齐，所以要在原地停留1秒的样子
根据分析，我们得出大概的轨迹，再通过机器学习，得到成功率最高的拖拽算法。（由于篇幅问题，在此不再讲机器学习，后续将补充）
6.关键代码分析
图片的复原：

def recover_image(self,image,location_list):new_im = Image.new('RGB',(260,116)) #生成一张(260，116)新图 im_list_upper = [] im_list_down = [] #以列表形式储存按照一定的顺序储存图片碎片 for location in location_list: #截取对应位置的图，并添加到列表中 if location['y'] == -58: im_list_upper.append(image.crop((abs(location['x']), 58, abs(location['x']) + 10, 116))) if location['y'] == 0: im_list_down.append(image.crop((abs(location['x']), 0, abs(location['x']) + 10, 58))) x_offset = 0 for im in im_list_upper: new_im.paste(im, (x_offset, 0)) x_offset += im.size[0]x_offset = 0 for im in im_list_down: new_im.paste(im, (x_offset, 58)) x_offset += im.size[0] #将图片碎片组合到复制到新图片上 return new_im

为什么要生成一个(260,116)的新图，而不是其他大小的图，是因为在前端显示的图就是(260,116),要做到对应才能准确的将滑块滑到缺口上
滑块的拖动：

def simulate_drag(self, track):dom_div_slider = self.browser.find_element_by_xpath('//*[@id="gc-box"]/div/div[3]/div[2]') #定位到滑块这个元素ActionChains(self.browser).click_and_hold(on_element=dom_div_slider).perform() #模拟鼠标按住滑块for x,y,z in track: ActionChains(self.browser).move_to_element_with_offset( to_element=dom_div_slider, xoffset=x+22, yoffset=y+22).perform() #进行拖拽 time.sleep(z)time.sleep(0.9) ActionChains(self.browser).release(on_element=dom_div_slider).perform() time.sleep(1) dom_div_gt_info = self.browser.find_element_by_class_name('gt_info_type') return dom_div_gt_info.text #返回验证结果

为什么xoffset,yoffset要加上22，因为滑块的尺寸是(44,44),而ActionChains的作用是将滑块移动到距离滑块左上角(x,y)的位置上去，而滑块x的实际滑的距离是其中心到缺口的位置。（selenium键盘鼠标操作）
7.代码衍生
【BiliBili等网站极验滑动验证码的详细破解过程】Bilibili和一些其他使用极验验证码的网站只需要改下元素定位，破解验证码的思路是一样的。
后续还将研究不用slenium而直接抓包的方式，以及机器学习来破解验证码。（进阶学习）