携程的反爬主要体现在列表页的翻页是通过 Ajax 加载的,需要调用携程的 API。抓包发现,请求里要传递几十个参数,一开始被吓到了;后来发现大部分参数都可以省略,剩下的也都没有加密,直接调用就行了。
【携程酒店反爬】只需要传递7个参数:
def get_allpages(self, response):
    """Schedule one AJAX hotel-list request for every result page of a city.

    The city list page is parsed for its pager to learn how many result
    pages exist; one POST request per page is then queued through
    ``self.crawl`` against the ``AjaxHotelList.aspx`` endpoint, with
    ``self.response_parse`` as the callback.

    Args:
        response: The fetched city list page. ``response.url`` must contain
            ``/hotel/<pinyin><digits>?time=``, and ``response.save`` must be
            a mapping from that ``/hotel/<pinyin><digits>`` path to the
            city's display name.

    Raises:
        IndexError: If ``response.url`` does not match the expected pattern.
        KeyError: If the extracted path is missing from ``response.save``.
    """
    tree = etree.HTML(response.text)
    url_name = response.save

    # A single scan yields the lookup key, the city pinyin and the numeric
    # city id. Indexing ``[0]`` keeps the IndexError for unexpected URLs.
    url_e, city_py, city_id = re.findall(
        r'(/hotel/([a-z]+)(\d+))\?time=', response.url)[0]
    name = url_name[url_e]

    try:
        pages = int(tree.xpath(
            '//div[@class="c_page_list layoutfix"]/a[@rel="nofollow"]/text()'
        )[0])
    except (AttributeError, IndexError, ValueError):
        # Best effort: no parsed tree, no pager link, or non-numeric pager
        # text all mean we only request the first page.
        pages = 1

    print(name)

    # ``quote`` lives in different modules on Python 2 and Python 3.
    try:
        from urllib.parse import quote
    except ImportError:
        from urllib import quote
    # Percent-encode the UTF-8 bytes of the city name, whether it arrives as
    # a byte string or as text.
    raw_name = name if isinstance(name, bytes) else name.encode('utf-8')
    name_code = quote(raw_name)

    # Build the generator once; ``ua.random`` is still drawn per request.
    ua = UserAgent()
    for i in range(1, pages + 1):
        # A fresh timestamp is put in the query string of every request.
        url = 'http://hotels.ctrip.com/Domestic/Tool/AjaxHotelList.aspx?time={}'.format(time.time())
        # Only these seven fields are sent. StartTime, DepTime, cityCode,
        # cityLat, cityLng, checkIn and checkOut were previously present as
        # disabled entries and are intentionally left out.
        formdata = {
            "__VIEWSTATEGENERATOR": "DB1FBB6D",
            "cityName": name_code,
            "RoomGuestCount": "1,1,0",
            "operationtype": "NEWHOTELORDER",
            "cityId": city_id,
            "cityPY": city_py,
            "page": i,
        }
        headers = {
            'User-Agent': ua.random,
        }
        self.crawl(
            url,
            method='POST',
            data=formdata,
            headers=headers,
            proxy=proxy_util.random_proxy(),
            retries=10,
            callback=self.response_parse,
            save={'name': name},
        )