爬取 App Store 应用信息

#!/usr/bin/env python
# encoding=utf8
"""Scrape app names, URLs and version details from App Store web pages.

Entry points:

* ``parse_appstore_page``  - one chart page -> JSON file
* ``parse_genre_page``     - genre index page -> JSON file, one entry per genre
* ``parse_genre_content``  - one genre page -> list of app dicts
* ``parse_detail_page``    - one app page -> version / date / minimum iOS

All parsers rely on fixed element ids and class names in the fetched HTML.
Unless stated otherwise, a page whose layout differs makes the chained
``find`` calls hit ``None`` and raise ``AttributeError``.
"""
import io
import json
import re
import sys

import bs4
import requests
from bs4 import BeautifulSoup

# Genre names that parse_genre_page skips.
gIgnoreGenreList = ['报刊杂志', '贴纸', '商品指南']

# Seconds to wait for a response before giving up; without a timeout
# requests.get() can block indefinitely.
_REQUEST_TIMEOUT = 30

# Last path segment of a URL, up to (not including) the '?' of the query.
_APP_ID_RE = re.compile(r"[^/]+(?!.*/)(?=[\?]+)")
# "iOS 8.0" -> "8.0"
_SYSTEM_VERSION_RE = re.compile(r"iOS.(\d+\.\d+)")
# "版本 10.5.0" -> "10.5.0".  Written with an escaped backslash instead of a
# raw literal so the same unicode pattern is valid on Python 2 and Python 3.
_APP_VERSION_RE = re.compile(u"版本\\s(.+)")


def _fetch_soup(url):
    """Download *url* and return the body parsed with BeautifulSoup.

    Raises:
        requests.RequestException: on timeout, connection failure, or a
            4xx/5xx response.
    """
    response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser',
                         from_encoding='utf-8')


def _save_json(data, out_file):
    """Write *data* to *out_file* as one line of UTF-8 encoded JSON."""
    print("save result to file :%s" % out_file)
    text = json.dumps(data, ensure_ascii=False)
    if isinstance(text, bytes):
        # Python 2 returns a byte string when the payload is pure ASCII,
        # but io.open() in text mode only accepts unicode.
        text = text.decode("utf-8")
    with io.open(out_file, "w", encoding="utf-8") as f:
        f.write(text)
        f.write(u"\n")


# https://www.apple.com/cn/itunes/charts/paid-apps/
# https://www.apple.com/cn/itunes/charts/free-apps/
def parse_appstore_page(cate_url, out_file):
    """Collect the apps listed on a chart page and save them as JSON.

    Args:
        cate_url: URL of the chart page (see the example URLs above).
        out_file: Path of the JSON file to write; nothing is written when
            it is empty.

    The saved value is a list of ``{"app_name": ..., "app_detail_url": ...}``.
    """
    soup = _fetch_soup(cate_url)
    grid = soup.find("div", id="main").find("section",
                                            class_="section apps grid")

    app_result = []
    for child in grid.find("ul").children:
        # .children also yields the text nodes between the <li> tags.
        if not isinstance(child, bs4.element.Tag):
            continue
        app_info = child.find("h3").find("a")
        app_result.append({
            "app_name": app_info.string,
            "app_detail_url": app_info.get("href"),
        })

    if out_file:
        _save_json(app_result, out_file)


# https://itunes.apple.com/cn/genre/ios/id36?mt=8
def parse_genre_page(genre_url, limit=10, out_file="genre_result.txt"):
    """Walk the genre index page, crawl each genre, and save the result.

    Args:
        genre_url: URL of the genre index page (example above).
        limit: Maximum number of apps to collect per genre; passed through
            to ``parse_genre_content``.
        out_file: Path of the JSON file to write; nothing is written when
            it is empty.

    The saved value is a list of
    ``{"name": ..., "url": ..., "app_list": [...]}``.
    Genres named in ``gIgnoreGenreList`` are skipped.
    """
    soup = _fetch_soup(genre_url)
    nav = (soup.find("div", id="main")
           .find("div", id="content")
           .find("div", id="genre-nav"))
    columns = nav.find("div", class_="grid3-column").find_all(
        "ul", recursive=False)

    cate_result = []
    for one_ul in columns:
        for cate in one_ul.children:
            # Skip the text nodes between <li> tags: calling .find("a") on
            # a string is str.find() and returns an int, not a tag.
            if not isinstance(cate, bs4.element.Tag):
                continue
            cate_info = cate.find("a")
            if cate_info is None:
                continue

            cate_name = cate_info.string
            if cate_name in gIgnoreGenreList:
                print("ingore cate :" + cate_name)
                continue

            cate_url = cate_info.get("href")
            print("processing genre %s." % cate_name)
            cate_result.append({
                "name": cate_name,
                "url": cate_url,
                "app_list": parse_genre_content(cate_url, limit),
            })
            # NOTE(review): the original source lost its indentation, so the
            # nesting level of this `break` is an assumption.  As placed it
            # stops after the first accepted genre of each column, which
            # looks like a debugging shortcut - confirm or remove.
            break

    if out_file:
        _save_json(cate_result, out_file)


# https://itunes.apple.com/cn/genre/ios-导航/id6010?mt=8
def parse_genre_content(content_url, limit=-1):
    """Return the apps listed on one genre page.

    Args:
        content_url: URL of the genre page (example above).
        limit: Maximum number of apps to return; -1 means no limit.

    Returns:
        A list of dicts with ``app_name``, ``app_detail_url`` and ``app_id``,
        plus the keys produced by ``parse_detail_page`` when the detail page
        could be parsed.
    """
    soup = _fetch_soup(content_url)
    grid = (soup.find("div", id="main")
            .find("div", id="content")
            .find("div", id="selectedgenre")
            .find("div", class_="grid3-column"))

    app_result = []
    for column in grid.find_all("div"):
        app_ul = column.find("ul")
        if app_ul is None:
            # find_all("div") is recursive and may return divs without a list.
            continue
        for app in app_ul.children:
            if not isinstance(app, bs4.element.Tag):
                continue
            if limit > -1 and len(app_result) >= limit:
                # Limit reached: stop scanning the remaining columns too.
                return app_result

            app_info = app.find("a")
            if app_info is None:
                continue
            app_url = app_info.get("href")

            one_app = {
                "app_name": app_info.string,
                "app_detail_url": app_url,
                "app_id": parse_appid(app_url),
            }
            app_detail = parse_detail_page(app_url)
            if app_detail is not None:
                print(app_detail)
                one_app.update(app_detail)
            app_result.append(one_app)

    return app_result


# https://itunes.apple.com/cn/app/高德地图-精准导航-出行必备/id461703208?mt=8
def parse_detail_page(detail_url):
    """Extract version information from one app detail page.

    Args:
        detail_url: URL of the app detail page (example above).

    Returns:
        ``{"update_date": ..., "latest_version": ..., "system_version": ...}``
        or ``None`` when an expected element is missing from the page.

    Raises:
        AttributeError: if the version or iOS label text does not match the
            format expected by ``parse_appver`` / ``parse_systver``.
    """
    soup = _fetch_soup(detail_url)
    main = soup.find("main", class_="is-app-theme")
    if main is None:
        return None

    whats_new = main.find(
        "section",
        class_="l-content-width section section--bordered whats-new")
    # App information list.
    information = main.find(
        "dl",
        class_=["information-list", "information-list--app",
                "medium-columns"])
    if whats_new is None or information is None:
        return None

    # Version number and last update time.
    time_tag = whats_new.find("time")
    version_tag = whats_new.find("p")
    # The label holding the minimum supported iOS version is read from the
    # fourth <div> of the information list.
    rows = information.select("div")
    if time_tag is None or version_tag is None or len(rows) < 4:
        return None
    spans = rows[3].select("div > span")
    if not spans:
        return None

    app_detail = {}
    app_detail["update_date"] = time_tag.string
    app_detail["latest_version"] = parse_appver(version_tag.string)
    app_detail["system_version"] = parse_systver(spans[0].string)
    return app_detail


def parse_appid(detail_url):
    """Return the last path segment of *detail_url*, before its '?' query.

    Example: ``".../id461703208?mt=8"`` -> ``"id461703208"``.

    Raises:
        AttributeError: if the URL has no such segment (e.g. no '?').
    """
    return _APP_ID_RE.search(detail_url).group(0)


def parse_systver(version_label):
    """Return the ``major.minor`` number that follows "iOS" in the label.

    Raises:
        AttributeError: if the label contains no such version.
    """
    return _SYSTEM_VERSION_RE.search(version_label).group(1)


def parse_appver(version_label):
    """Return the version text from a label such as "版本 10.5.0".

    Raises:
        AttributeError: if the label does not have that form.
    """
    return _APP_VERSION_RE.search(version_label).group(1)


if __name__ == '__main__':
    if sys.version_info[0] < 3:
        # Python 2 only: make implicit str/unicode conversions use UTF-8 so
        # that printing and comparing the Chinese genre names works.
        reload(sys)  # noqa: F821 - builtin on Python 2
        sys.setdefaultencoding('utf8')

    # Other usage examples:
    #   parse_appstore_page(
    #       'https://www.apple.com/cn/itunes/charts/free-apps/',
    #       'free-apps.txt')
    #   parse_appstore_page(
    #       'https://www.apple.com/cn/itunes/charts/paid-apps/',
    #       'paid-apps.txt')
    parse_genre_page('https://itunes.apple.com/cn/genre/ios/id36?mt=8',
                     1, 'appstore-genre.txt')

    推荐阅读