通过高德地图抓起所有城市的银行信息

目标:通过高德地图的搜索接口,抓取每个城市的所有银行的分行信息
思路:1. 在本地mysql中存储有全国各城市名称、编码

  1. 将城市编码读取到数组中
  2. 按照数据读取每个编码,组拼URL,通过POST请求访问接口
  3. 获取xml后解析出我们需要的数据,插入到mysql中
第一步,定义访问接口的基本参数
file_name='result.txt'# write result to this fileurl_header='http://restapi.amap.com/v3/place/text?&keyword=&types=160100&'url_end='&citylimit=true&&output=xml&offset=20&page=1&key=c787ae8e49424a657127c3ed64cfe053&extensions=base'url_amap='city='each_page_rec=20# results that displays in one pagexml_file='tmp.xml'# xml filen name

第二步,建立本地数据库访问请求,获取数据库中的所有城市编码
首先,在本地mysql中建立一张region表,可以从网上down一份全国各地省市区编码表,结构如下图:
通过高德地图抓起所有城市的银行信息
文章图片
省市区编码表.png 为了方便大家,我将表的结构及数据导出为sql语句,直接复制到mysql中执行即可,链接如下 http://www.jianshu.com/p/0b9b0e3cda5f
def getallcity(): cityarr = [] connection = pymysql.connect(host='127.0.0.1', user='root', passwd='123456', port=3306, db='icoachu', charset="utf8") cursor = connection.cursor() sql = "select * from region where parent_id in (select id from region where parent_id=0)" try: cursor.execute(sql) rows = cursor.fetchall() for row in rows: cityarr.append(row[0]) return cityarr finally: cursor.close() connection.close() return cityarr

关于如何访问本地mysql的,比较简单,此处不做说明,需要强调的是在try 语句中,一定要在finally中关闭cursor及connection。
第三步,通过接口访问获取html数据,并将数据写入到文件中
# get html by url and save the data to xml file def gethtml(url): page = urllib.request.urlopen(url) html = page.read() # print(html)try: # open xml file and save data to it with open(xml_file, 'wb+') as xml_file_handle: xml_file_handle.write(html) except IOError as err: print "IO error: " + str(err) return -1 return 0

第四步,获取xml格式的数据之后,解析相关字段,并插入到mysql中
# phrase data from xml def parsexml(): total_rec = 1# record number# open xml file and get data record try: with open(file_name, 'a') as file_handle: dom = minidom.parse(xml_file) root = dom.getElementsByTagName("response")# The function getElementsByTagName returns NodeList.for node in root: total_rec = node.getElementsByTagName('count')[0].childNodes[0].nodeValuepois = node.getElementsByTagName("pois") for poi in pois[0].getElementsByTagName('poi'): branch_id = poi.getElementsByTagName("id")[0].childNodes[0].nodeValue branch_name = poi.getElementsByTagName("name")[0].childNodes[0].nodeValue branch_type = poi.getElementsByTagName("type")[0].childNodes[0].nodeValue bank_type = poi.getElementsByTagName("typecode")[0].childNodes[0].nodeValue pname = poi.getElementsByTagName("pname")[0].childNodes[0].nodeValue cityname = poi.getElementsByTagName("cityname")[0].childNodes[0].nodeValue aname = poi.getElementsByTagName("adname")[0].childNodes[0].nodeValue # address = poi.getElementsByTagName("address")[0].childNodes[0].nodeValue # biz_type = poi.getElementsByTagName("biz_type")[0].childNodes[0].nodeValue # tel = poi.getElementsByTagName("tel")[0].childNodes[0].nodeValue # distance = poi.getElementsByTagName("distance")[0].childNodes[0].nodeValue arr = branch_type.split('; ') bank_name = arr[-1] sql = "insert into bankinfo(branch_id, branch_name, branch_type, bank_name, bank_type, pname, cityname, aname) values('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" % ( branch_id, branch_name.replace('(', '').replace(')', ''), branch_type, bank_name, bank_type, pname, cityname, aname)connection = pymysql.connect(host='127.0.0.1', user='root', passwd='123456', port=3306, db='icoachu', charset="utf8") cursor = connection.cursor() try: print(sql) cursor.execute(sql) connection.commit() if cursor.rowcount != 1: raise Exception("数据插入失败%s", sql) finally: connection.close() cursor.close()except IOError as err: print "IO error: " + str(err)return total_rec

第五步,在主函数中实现处理相关调用
if __name__ == '__main__': cityarr = getallcity() for cityId in cityarr: url = r'%scity=%s%s' % (url_header, cityId, url_end) if gethtml(url) == 0: total_record_str = parsexml() total_record = int(str(total_record_str)) if (total_record % each_page_rec) != 0: page_number = total_record / each_page_rec + 2 else: page_number = total_record / each_page_rec + 1for each_page in frange(2, float(page_number)): print 'parsing page ' + str(each_page) + ' ... ...' url = url.replace('page=' + str(each_page - 1), 'page=' + str(each_page)) print(url) gethtml(url) parsexml() else: print 'error: fail to get xml from amap'

完整的代码如下
# coding:utf-8# 目标:通过高德地图的搜索接口,抓取每个城市的所有银行的分行信息 # 思路:1. 在本地mysql中存储有全国各城市名称、编码 #2. 将城市编码读取到数组中 #3. 按照数据读取每个编码,组拼URL,通过POST请求访问接口 #4. 获取xml后解析出我们需要的数据,插入到mysql中import urllib import xml.dom.minidom as minidom import string import urllib.request import pymysqlfile_name = 'result.txt'# write result to this file url_header = 'http://restapi.amap.com/v3/place/text?&keyword=&types=160100&' url_end = '&citylimit=true&&output=xml&offset=20&page=1&key=c787ae8e49424a657127c3ed64cfe053&extensions=base' url_amap = 'city=' each_page_rec = 20# results that displays in one page xml_file = 'tmp.xml'# xml filen name# get html by url and save the data to xml file def gethtml(url): page = urllib.request.urlopen(url) html = page.read() # print(html)try: # open xml file and save data to it with open(xml_file, 'wb+') as xml_file_handle: xml_file_handle.write(html) except IOError as err: print "IO error: " + str(err) return -1 return 0# phrase data from xml def parsexml(): total_rec = 1# record number# open xml file and get data record try: with open(file_name, 'a') as file_handle: dom = minidom.parse(xml_file) root = dom.getElementsByTagName("response")# The function getElementsByTagName returns NodeList.for node in root: total_rec = node.getElementsByTagName('count')[0].childNodes[0].nodeValuepois = node.getElementsByTagName("pois") for poi in pois[0].getElementsByTagName('poi'): branch_id = poi.getElementsByTagName("id")[0].childNodes[0].nodeValue branch_name = poi.getElementsByTagName("name")[0].childNodes[0].nodeValue branch_type = poi.getElementsByTagName("type")[0].childNodes[0].nodeValue bank_type = poi.getElementsByTagName("typecode")[0].childNodes[0].nodeValue pname = poi.getElementsByTagName("pname")[0].childNodes[0].nodeValue cityname = poi.getElementsByTagName("cityname")[0].childNodes[0].nodeValue aname = poi.getElementsByTagName("adname")[0].childNodes[0].nodeValue # address = poi.getElementsByTagName("address")[0].childNodes[0].nodeValue # biz_type = poi.getElementsByTagName("biz_type")[0].childNodes[0].nodeValue # tel = poi.getElementsByTagName("tel")[0].childNodes[0].nodeValue # distance = poi.getElementsByTagName("distance")[0].childNodes[0].nodeValue arr = branch_type.split('; ') bank_name = arr[-1] sql = "insert into bankinfo(branch_id, branch_name, branch_type, bank_name, bank_type, pname, cityname, aname) values('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" % ( branch_id, branch_name.replace('(', '').replace(')', ''), branch_type, bank_name, bank_type, pname, cityname, aname)connection = pymysql.connect(host='127.0.0.1', user='root', passwd='123456', port=3306, db='icoachu', charset="utf8") cursor = connection.cursor() try: print(sql) cursor.execute(sql) connection.commit() if cursor.rowcount != 1: raise Exception("数据插入失败%s", sql) finally: connection.close() cursor.close()except IOError as err: print "IO error: " + str(err)return total_recdef frange(start, stop, step=1): i = start while i < stop: yield i i += stepdef getallcity(): cityarr = [] connection = pymysql.connect(host='127.0.0.1', user='root', passwd='123456', port=3306, db='icoachu', charset="utf8") cursor = connection.cursor() sql = "select * from region where parent_id in (select id from region where parent_id=0)" try: cursor.execute(sql) rows = cursor.fetchall() for row in rows: cityarr.append(row[0]) return cityarr finally: cursor.close() connection.close() return cityarrif __name__ == '__main__': cityarr = getallcity() for cityId in cityarr: url = r'%scity=%s%s' % (url_header, cityId, url_end) if gethtml(url) == 0: total_record_str = parsexml() total_record = int(str(total_record_str)) if (total_record % each_page_rec) != 0: page_number = total_record / each_page_rec + 2 else: page_number = total_record / each_page_rec + 1for each_page in frange(2, float(page_number)): print 'parsing page ' + str(each_page) + ' ... ...' url = url.replace('page=' + str(each_page - 1), 'page=' + str(each_page)) print(url) gethtml(url) parsexml() else: print 'error: fail to get xml from amap'

【通过高德地图抓起所有城市的银行信息】数据库中数据如下:
通过高德地图抓起所有城市的银行信息
文章图片
查询结果.png

    推荐阅读