selenium + PhantomJS: Taobao Product Scraping in Practice

1. The result first

[image: screenshot of the scraped product data]

2. The approach

1. Drive the page with selenium, using XPath click actions to navigate, and grab the rendered page source (a hedged explicit-wait sketch follows below).
2. Parse the source with BeautifulSoup.
3. Use MySQL DDL to build the table structure automatically, then insert the scraped rows in one pass.
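Before the full script, here is a minimal, hedged sketch of the explicit-wait pattern for step 1. The search-box id 'q' comes from the script below; the 'm-itemlist' class name for the result grid is an assumption about Taobao's markup and should be verified in the browser's devtools:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

browser = webdriver.PhantomJS()
browser.get('http://www.taobao.com')

# Wait up to 10 s for the search box instead of sleeping a fixed time
wait = WebDriverWait(browser, 10)
box = wait.until(EC.presence_of_element_located((By.ID, 'q')))
box.send_keys('keyboard')  # hypothetical search keyword
box.send_keys(Keys.ENTER)

# 'm-itemlist' is an assumed class for the result grid; check it yourself
wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'm-itemlist')))
html = browser.page_source  # now safe to hand off to BeautifulSoup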

3. The source code
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from bs4 import BeautifulSoup
import pymysql

# selenium + PhantomJS + BeautifulSoup + pymysql

class Dandan(object):
    def __init__(self):
        self.url = 'http://www.taobao.com'
        # Fill in your own MySQL connection details below
        self.conn = pymysql.Connect(host='x', user='xx', password='xx',
                                    port=3306, database='xx', charset='utf8')
        self.browser = webdriver.PhantomJS()

    def mysql(self):
        # Create the target table if it does not exist yet
        cursor = self.conn.cursor()
        self.dataname = input('Enter the table name: ')
        sql = 'CREATE TABLE IF NOT EXISTS %s(' \
              'ID INT(10) NOT NULL PRIMARY KEY AUTO_INCREMENT,' \
              'A TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,' \
              'B VARCHAR(255),' \
              'C VARCHAR(255),' \
              'D VARCHAR(255),' \
              'E VARCHAR(255)) ENGINE=INNODB DEFAULT CHARSET=utf8'
        cursor.execute(sql % self.dataname)
        print('Table created!')

    def get_url(self):
        browser = self.browser
        browser.set_window_size(900, 900)
        browser.get(self.url)

        # Type the keyword into the search box and submit
        input_a = browser.find_element_by_id('q')
        inputname = input('Enter the product you want: ')
        input_a.send_keys(inputname)
        input_a.send_keys(Keys.ENTER)
        time.sleep(4)

        cursor = self.conn.cursor()
        try:
            i = 0
            while True:
                i += 1
                # Scroll down step by step so lazily loaded items render
                for page in range(0, 6000, 1000):
                    browser.execute_script('window.scrollBy(0,{})'.format(page))
                    time.sleep(3)
                html = browser.page_source
                soup = BeautifulSoup(html, 'lxml')
                price = [p.get_text() for p in
                         soup.find_all('div', class_='price g_price g_price-highlight')]
                pay = [p.get_text() for p in soup.find_all('div', class_='deal-cnt')]
                store = [s.get_text().strip().replace('\n\n', '') for s in
                         soup.find_all('div', class_='row row-3 g-clearfix')]
                for a, b, c in zip(store, price, pay):
                    print('\n\nShop/city: {}\nPrice: {}\nBuyers: {}'.format(a, b, c))
                    sql = "INSERT INTO %s(B,C,D) VALUES('%s','%s','%s')"
                    cursor.execute(sql % (self.dataname, a, b, c))
                    self.conn.commit()
                    print('Row inserted!')
                if i == 3:  # how many pages to scrape
                    cursor.close()
                    self.conn.close()
                    print('Database connection closed')
                    browser.quit()
                    print('Browser closed')
                    break
                # Click the "next page" button
                browser.find_element_by_xpath(
                    "//div[@id='mainsrp-pager']/div/div/div/ul/li[8]/a/span").click()
                print('Next page')
                time.sleep(5)
        except Exception as e:
            print(e)


if __name__ == '__main__':
    d = Dandan()
    d.mysql()    # create the table first
    d.get_url()  # then scrape and insert
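A note on the INSERT: building the statement with % interpolation works, but letting pymysql bind the values avoids quoting problems and SQL injection. A sketch of that variant for the loop body above (the table name cannot be bound as a parameter, so it is still formatted in):

# Inside the "for a, b, c" loop of get_url above:
sql = 'INSERT INTO {}(B, C, D) VALUES (%s, %s, %s)'.format(self.dataname)
cursor.execute(sql, (a, b, c))  # pymysql escapes the three values itself
self.conn.commit()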
