Easily Scrape the Data You Want with a Python Web Crawler

import urllib.request
from bs4 import BeautifulSoup
import time
import pymysql
def headers_request(url):
    # Build a Request object that carries a browser User-Agent so the site
    # does not reject the crawler outright
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'
    }
    request = urllib.request.Request(url, headers=headers)
    return request
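As a quick sanity check, the helper can be run against one of the search URLs that main() below builds; this sketch only inspects the Request object it returns (the keyword python and page number 1 are example values plugged into the URL template):

req = headers_request('https://search.51job.com/list/010000,000000,0000,00,9,99,python,2,1.html')
print(req.full_url)
print(req.get_header('User-agent'))  # urllib stores the header name as 'User-agent'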

# Parse one result page and save every job posting it contains
def parse_content(content, db):
    # Build the soup object
    soup = BeautifulSoup(content, 'lxml')
    # Find the div that wraps all job rows
    odivbox = soup.find('div', id='resultList')
    # Grab every job row; the first .el div is the table header, so skip it
    odiv_list = odivbox.find_all('div', class_='el')[1:]
    # print(len(odiv_list))
    for odiv in odiv_list:
        # Job title
        jobname = odiv.select('.t1 > span > a')[0]['title']
        # Company name
        company = odiv.select('.t2 > a')[0]['title']
        # Work location
        area = odiv.select('.t3')[0].string
        # Monthly salary
        salary = odiv.select('.t4')[0].string
        # Publish date
        publish_time = odiv.select('.t5')[0].string
        # print(salary, publish_time)
        # Collect the fields into a dict
        item = {
            '职位名称': jobname,
            '公司名称': company,
            '工作地点': area,
            '职位月薪': salary,
            '发布时间': publish_time
        }
        # To save to a text file instead:
        # string = str(item) + '\n'
        # fp.write(string)
        # Save to MySQL
        save_to_mysql(db, item)
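To make the selectors above concrete, here is a minimal, self-contained sketch run against a hypothetical fragment shaped the way the code expects the 51job result list to look (the markup is illustrative only, not the real page):

from bs4 import BeautifulSoup

html = """
<div id="resultList">
  <div class="el">header row</div>
  <div class="el">
    <p class="t1"><span><a title="Python Developer">Python Developer</a></span></p>
    <span class="t2"><a title="Example Co.">Example Co.</a></span>
    <span class="t3">Beijing</span>
    <span class="t4">1-1.5万/月</span>
    <span class="t5">09-16</span>
  </div>
</div>
"""

soup = BeautifulSoup(html, 'lxml')
odivbox = soup.find('div', id='resultList')
for odiv in odivbox.find_all('div', class_='el')[1:]:   # skip the header row
    print(odiv.select('.t1 > span > a')[0]['title'],    # Python Developer
          odiv.select('.t2 > a')[0]['title'],           # Example Co.
          odiv.select('.t3')[0].string)                 # Beijing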

def save_to_mysql(db, item):
    # Get a cursor
    cur = db.cursor()
    # Build the SQL statement
    # (a parameterized query via cur.execute(sql, args) would be safer against quoting issues)
    sql = """insert into work(jobname,company,area,salary,publish_time)
             values('%s','%s','%s','%s','%s')""" % (
        item['职位名称'], item['公司名称'], item['工作地点'],
        item['职位月薪'], item['发布时间'])
    # print(sql)
    try:
        cur.execute(sql)
        # Commit the transaction
        db.commit()
    except Exception as e:
        # print(e)
        # Roll back on error
        db.rollback()
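The INSERT statement assumes a work table already exists. Below is a minimal setup sketch that matches the column names used above; the column types and lengths are assumptions, and the connection values are placeholders mirroring the ones used in main():

import pymysql

db = pymysql.connect(host="xxxx", user="xxxx", password="xxxxxx",
                     db="xx", charset='utf8')  # placeholders, as in main()
cur = db.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS work (
        id INT AUTO_INCREMENT PRIMARY KEY,
        jobname VARCHAR(200),
        company VARCHAR(200),
        area VARCHAR(100),
        salary VARCHAR(100),
        publish_time VARCHAR(50)
    ) DEFAULT CHARSET = utf8
""")
db.commit()
db.close()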

def main():
    # fp = open('work.txt', 'w', encoding='utf8')
    # Connect to the database
    db = pymysql.connect(host="xxxx", user="xxxx", password="xxxxxx",
                         db="xx", port=xxxx, charset='utf8')
    # Ask the user for the search keyword
    keyword = input('Enter the keyword to search for: ')
    # Ask the user for the start and end page numbers to crawl
    start_page = int(input('Enter the start page: '))
    end_page = int(input('Enter the end page: '))
    # URL template to fill in with the keyword and page number
    url = 'https://search.51job.com/list/010000,000000,0000,00,9,99,{},2,{}.html'
    # Crawl page by page
    for page in range(start_page, end_page + 1):
        print('Crawling page %s......' % page)
        # Build the page URL
        url_page = url.format(keyword, page)
        # print(url_page)
        # Build the request object
        request = headers_request(url_page)
        # Send the request and read the response (the site serves gbk-encoded pages)
        content = urllib.request.urlopen(request).read().decode('gbk')
        # Parse the page
        parse_content(content, db)
        print('Finished crawling page %s' % page)
        time.sleep(2)
    # fp.close()
    db.close()

if __name__ == '__main__':
    main()
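After a run finishes, a quick way to confirm that rows were actually written is to count them. This is just a sketch; the connection values are placeholders to be replaced with the same ones used in main():

import pymysql

db = pymysql.connect(host="xxxx", user="xxxx", password="xxxxxx",
                     db="xx", charset='utf8')  # placeholders, as in main()
cur = db.cursor()
cur.execute("SELECT COUNT(*) FROM work")
print('rows saved:', cur.fetchone()[0])
db.close()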
