携程酒店反爬

携程的反爬主要体现在列表页的翻页是通过 Ajax 加载的,需要调用携程的 API。抓包发现该接口需要传递几十个参数,一开始被吓到了;后来发现大部分参数可以省略,剩下的也都没有加密,直接调用就行了。
【携程酒店反爬】只需要传递7个参数:

def get_allpages(self, response):
    """Queue one POST request per result page of a city's hotel list.

    Reads the page count from the list page HTML, then calls ``self.crawl``
    once for every page number against the ``AjaxHotelList.aspx`` endpoint,
    sending only the seven form fields the endpoint was found to need.

    Args:
        response: Response for a city list page. Its ``url`` must contain
            ``/hotel/<pinyin><digits>?time=``; its ``save`` must be a
            mapping from that ``/hotel/<pinyin><digits>`` path to the city
            name (presumably UTF-8 bytes or text -- confirm with the
            caller that fills ``save``).

    Raises:
        IndexError: If ``response.url`` does not match the expected shape.
        KeyError: If ``response.save`` has no entry for the URL's path.
    """
    # Function-scope import so the block works on Python 2 (its original
    # target) and on Python 3 without touching the file's import section.
    try:
        from urllib import quote  # Python 2
    except ImportError:
        from urllib.parse import quote  # Python 3

    tree = etree.HTML(response.text)

    # A single pattern extracts the path, the city pinyin and the city id.
    # Indexing [0] keeps the original IndexError on a non-matching URL.
    path, city_py, city_id = re.findall(
        r'(/hotel/([a-z]+)(\d+))\?time=', response.url)[0]
    name = response.save[path]

    # Best effort: fall back to a single page when the pager is missing,
    # its text is not numeric, or the document could not be parsed.
    try:
        pages = int(tree.xpath(
            '//div[@class="c_page_list layoutfix"]'
            '/a[@rel="nofollow"]/text()')[0])
    except (AttributeError, IndexError, ValueError):
        pages = 1

    print(name)

    # Percent-encode the UTF-8 bytes of the city name. Decoding first keeps
    # the original validation that byte input really is UTF-8.
    name_text = name.decode('utf-8') if isinstance(name, bytes) else name
    name_code = quote(name_text.encode('utf-8'))

    # Build the user-agent source once; ``.random`` is still read per
    # request so every page gets its own User-Agent value.
    ua = UserAgent()

    for i in range(1, pages + 1):
        # The current timestamp makes every request URL distinct.
        url = ('http://hotels.ctrip.com/Domestic/Tool/AjaxHotelList.aspx'
               '?time={}'.format(time.time()))
        # StartTime, DepTime, cityCode, cityLat, cityLng, checkIn and
        # checkOut also appear in the captured request but are omitted.
        formdata = {
            "__VIEWSTATEGENERATOR": "DB1FBB6D",
            "cityName": name_code,
            "RoomGuestCount": "1,1,0",
            "operationtype": "NEWHOTELORDER",
            "cityId": city_id,
            "cityPY": city_py,
            "page": i,
        }
        headers = {'User-Agent': ua.random}
        self.crawl(
            url,
            method='POST',
            data=formdata,
            headers=headers,
            proxy=proxy_util.random_proxy(),
            retries=10,
            callback=self.response_parse,
            save={'name': name},
        )

    推荐阅读