- 首页 > it技术 > >
爬取appstore应用信息
#!/usr/bin/env python
# encoding=utf8
import json
import sys

import bs4
import requests
from bs4 import BeautifulSoup
import re

# Genre names that parse_genre_page skips. They are unicode literals so the
# membership test against BeautifulSoup's unicode text compares text with
# text on both Python 2 and Python 3.
gIgnoreGenreList = [u'报刊杂志', u'贴纸', u'商品指南']


# https://www.apple.com/cn/itunes/charts/paid-apps/
# https://www.apple.com/cn/itunes/charts/free-apps/
def parse_appstore_page(cate_url, out_file):
    """Scrape an Apple chart page and save the listed apps as JSON.

    The page is fetched with ``requests`` and parsed with BeautifulSoup.
    Apps are read from the ``<ul>`` inside the ``section`` with class
    ``"section apps grid"`` under ``div#main``; each list item contributes
    the text and ``href`` of its ``h3 > a`` link.

    Args:
        cate_url: URL of the chart page to download.
        out_file: Path of the file to write. The result is written as one
            JSON array followed by a newline. An empty (or ``None``) value
            skips writing.

    Returns:
        None. The only outputs are the progress message and the file.

    Raises:
        AttributeError: if the page does not contain the expected
            ``div#main`` / section / ``ul`` structure.
    """
    html = requests.get(cate_url).content
    soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')

    main_div = soup.find("div", id="main")
    section = main_div.find("section", class_="section apps grid")
    app_items = section.find("ul").children

    app_result = []
    for child in app_items:
        # .children also yields text nodes; only element tags are entries.
        if not isinstance(child, bs4.element.Tag):
            continue
        heading = child.find("h3")
        app_info = heading.find("a") if heading is not None else None
        if app_info is None:
            # Entry without the expected h3 > a link: nothing to record.
            continue
        app_result.append({
            "app_name": app_info.string,
            "app_detail_url": app_info.get("href"),
        })

    if out_file:
        print("save result to file :%s" % out_file)
        result_string = json.dumps(app_result, ensure_ascii=False)
        # "with" guarantees the handle is closed even if a write fails.
        with open(out_file, "w") as f:
            f.write(result_string)
            f.write('\n')


# https://itunes.apple.com/cn/genre/ios/id36?mt=8
def parse_genre_page(genre_url, limit=10, out_file="genre_result.txt"):
    """Scrape the App Store genre index and the apps of each genre.

    Genre links are read from the top-level ``<ul>`` lists found under
    ``div#main > div#content > div#genre-nav > div.grid3-column``. Genres
    whose name is in ``gIgnoreGenreList`` are skipped. For every other genre
    the apps are collected with ``parse_genre_content``.

    Args:
        genre_url: URL of the genre index page.
        limit: Maximum number of apps to collect per genre; passed through
            to ``parse_genre_content``.
        out_file: Path of the file to write. The result is written as one
            JSON array followed by a newline. An empty (or ``None``) value
            skips writing.

    Returns:
        None. The only outputs are the progress messages and the file.

    Raises:
        AttributeError: if the page does not contain the expected
            container elements.
    """
    html = requests.get(genre_url).content
    soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')

    container = soup.find("div", id="main")
    container = container.find("div", id="content")
    container = container.find("div", id="genre-nav")
    container = container.find("div", class_="grid3-column")

    # Only the direct <ul> children are genre lists.
    ul_list = container.find_all("ul", recursive=False)

    cate_result = []
    for one_ul in ul_list:
        if not isinstance(one_ul, bs4.element.Tag):
            continue
        for cate in one_ul.children:
            # Text nodes are str subclasses whose .find() is str.find and
            # returns an int, so they must be filtered out before use.
            if not isinstance(cate, bs4.element.Tag):
                continue
            cate_info = cate.find("a")
            if cate_info is None:
                continue
            cate_name = cate_info.string
            if cate_name in gIgnoreGenreList:
                print("ingore cate :" + cate_name)
                continue

            cate_url = cate_info.get("href")
            print("processing genre %s." % cate_name)
            cate_result.append({
                "name": cate_name,
                "url": cate_url,
                "app_list": parse_genre_content(cate_url, limit),
            })
            # NOTE(review): the original indentation of this break was lost.
            # It is kept directly after the append, so only the first
            # non-ignored genre of each list is processed. Confirm whether
            # this was a debugging shortcut that should be removed.
            break

    if out_file:
        print("save result to file :%s" % out_file)
        result_string = json.dumps(cate_result, ensure_ascii=False)
        # "with" guarantees the handle is closed even if a write fails.
        with open(out_file, "w") as f:
            f.write(result_string)
            f.write('\n')


# https://itunes.apple.com/cn/genre/ios-导航/id6010?mt=8
# 返回一个数组 [{app_name:xxxx, app_detail_url:xxxxx}]
# limit 为app返回数量,-1 为不限制
def parse_genre_content(content_url, limit=-1):
    """Collect the apps listed on one App Store genre page.

    Apps are read from the ``<ul>`` inside every ``<div>`` found under
    ``div#main > div#content > div#selectedgenre > div.grid3-column``. For
    each app the detail page is fetched with ``parse_detail_page`` and its
    fields are merged into the entry.

    Args:
        content_url: URL of the genre page.
        limit: Maximum number of apps to return; ``-1`` means no limit.

    Returns:
        A list of dicts with the keys ``app_name``, ``app_detail_url`` and
        ``app_id``, plus whatever keys ``parse_detail_page`` returned.

    Raises:
        AttributeError: if the page does not contain the expected
            container elements.
    """
    html = requests.get(content_url).content
    soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')

    container = soup.find("div", id="main")
    container = container.find("div", id="content")
    container = container.find("div", id="selectedgenre")
    container = container.find("div", class_="grid3-column")

    app_result = []
    count = 0
    for column in container.find_all("div"):
        if not isinstance(column, bs4.element.Tag):
            continue
        app_ul = column.find("ul")
        if app_ul is None:
            # A div without an app list has nothing to contribute.
            continue
        for app in app_ul.children:
            # .children also yields text nodes; only element tags are apps.
            if not isinstance(app, bs4.element.Tag):
                continue
            if limit > -1 and count >= limit:
                # Limit reached: no later column can add anything either.
                return app_result

            app_info = app.find("a")
            if app_info is None:
                continue
            app_url = app_info.get("href")
            one_app = {
                "app_name": app_info.string,
                "app_detail_url": app_url,
                "app_id": parse_appid(app_url),
            }

            app_detail = parse_detail_page(app_url)
            if app_detail is not None:
                print(app_detail)
                one_app.update(app_detail)

            app_result.append(one_app)
            count += 1
    return app_result


# https://itunes.apple.com/cn/app/高德地图-精准导航-出行必备/id461703208?mt=8
# {latest_version:x.x.x, update_date:xxxxxxx, system_version:iOS 7.0}
def parse_detail_page(detail_url):
    """Extract version information from an App Store app detail page.

    Args:
        detail_url: URL of the app detail page.

    Returns:
        A dict with the keys ``update_date``, ``latest_version`` and
        ``system_version``. A value is ``None`` when it cannot be read from
        the page. The function returns ``None`` when the page lacks the
        expected ``main.is-app-theme`` element, "what's new" section or
        information list.
    """
    html = requests.get(detail_url).content
    soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')

    result = soup.find("main", class_="is-app-theme")
    if result is None:
        return None

    whats_new = result.find("section", class_="l-content-width section section--bordered whats-new")
    information = result.find("dl", class_=["information-list", "information-list--app", "medium-columns"])
    if whats_new is None or information is None:
        return None

    # Version number and last update date come from the "what's new" section.
    time_tag = whats_new.find("time")
    version_tag = whats_new.find("p")
    update_date = time_tag.string if time_tag is not None else None
    latest_version = parse_appver(version_tag.string) if version_tag is not None else None

    # The minimum supported iOS version is read from the first span of the
    # fourth <div> of the information list.
    rows = information.select("div")
    spans = rows[3].select("div > span") if len(rows) > 3 else []
    system_version = parse_systver(spans[0].string) if spans else None

    return {
        "update_date": update_date,
        "latest_version": latest_version,
        "system_version": system_version,
    }


def parse_appid(detail_url):
    """Return the last path segment before the query string of *detail_url*.

    For ``.../id461703208?mt=8`` this is ``"id461703208"``. Returns ``None``
    when the URL is empty or has no ``?`` after its last path segment.
    """
    # A run of non-slash characters with no later "/" that is directly
    # followed by "?".
    match = re.search(r"[^/]+(?!.*/)(?=[\?]+)", detail_url or "")
    return match.group(0) if match else None


def parse_systver(version_label):
    """Return the ``major.minor`` number that follows "iOS" in the label.

    Returns ``None`` when the label is empty or contains no such number.
    """
    match = re.search(r"iOS.(\d+\.\d+)", version_label or "")
    return match.group(1) if match else None


def parse_appver(version_label):
    """Return the version text from a label such as ``"版本 10.5.0"``.

    Returns ``None`` when the label is empty or does not have that form.
    """
    # The backslash is escaped instead of using a raw string because the
    # ur"" prefix does not exist on Python 3; the pattern text is unchanged.
    match = re.search(u"版本\\s(.+)", version_label or u"")
    return match.group(1) if match else None


if __name__ == '__main__':
    if sys.version_info[0] == 2:
        # Python 2 only: switch the implicit str/unicode codec to UTF-8,
        # presumably so non-ASCII app names can be compared and written.
        # Neither reload() nor sys.setdefaultencoding() exists on Python 3.
        reload(sys)  # noqa: F821
        sys.setdefaultencoding('utf8')

    # Alternative entry points for the top-chart pages:
    # parse_appstore_page('https://www.apple.com/cn/itunes/charts/free-apps/', 'free-apps.txt')
    # parse_appstore_page('https://www.apple.com/cn/itunes/charts/paid-apps/', 'paid-apps.txt')
    parse_genre_page('https://itunes.apple.com/cn/genre/ios/id36?mt=8', 1, 'appstore-genre.txt')
推荐阅读