Downloading sounding data from the University of Wyoming
My skills are limited, so corrections and discussion are always welcome! The University of Wyoming shares data from upper-air sounding stations around the world and also computes many derived indices from each profile, which is very useful. I recently needed this data, so for now I have written a quick, unoptimized script to download it.
If you need to download in bulk, you can try a proxy-pool approach, routing requests through rotating proxies and other measures that avoid triggering the site's anti-scraping defenses; a minimal sketch is shown below, before the full script.
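The sketch below only illustrates the proxy idea and is not part of the original script: requests accepts a proxies mapping per call, so rotating proxies simply means picking a different entry for each request. The proxy addresses here are placeholders that you would replace with entries from your own pool.

import random
import requests

# Placeholder proxy entries; substitute addresses from your own proxy pool.
PROXIES = [
    {'http': 'http://127.0.0.1:8080'},
    {'http': 'http://127.0.0.1:8081'},
]

url = ('http://weather.uwyo.edu/cgi-bin/sounding?region=seasia'
       '&TYPE=TEXT%3ALIST&YEAR=2018&MONTH=07&FROM=0312&TO=0312&STNM=54511')
# Each request goes out through a randomly chosen proxy.
resp = requests.get(url, proxies=random.choice(PROXIES), timeout=30)
print(resp.status_code)

The full download script follows.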
import os
import datetime
import calendar
import random
from io import StringIO

import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

# Example request URLs:
# http://weather.uwyo.edu/cgi-bin/sounding?region=naconf&TYPE=TEXT%3ALIST&YEAR=2018&MONTH=07&FROM=0312&TO=0312&STNM=72558
# http://weather.uwyo.edu/cgi-bin/sounding?region=seasia&TYPE=TEXT%3ALIST&YEAR=2018&MONTH=07&FROM=0312&TO=0312&STNM=54511

# Rotate through several User-Agent strings so requests look less uniform.
USER_AGENTS = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
]


def main(start_date, end_date, sta_id=54511):
    # Walk through the date range month by month and download every sounding.
    allmonths = pd.date_range(start_date, end_date, freq='M')
    for vdate in allmonths.astype(object):
        handle_html(vdate.year, vdate.month, sta_id)
        print('Finished this month: --->>>', vdate)


def set_url(date, sta_id):
    # One request per month: FROM the 1st at 00Z TO the last day at 12Z.
    day_nums = calendar.monthrange(date.year, date.month)[1]
    return ('http://weather.uwyo.edu/cgi-bin/sounding?region=seasia'
            '&TYPE=TEXT%3ALIST&YEAR={year}&MONTH={month}'
            '&FROM=0100&TO={day_end}12&STNM={sta_id}').format(
        year=date.year,
        month=str(date.month).zfill(2),
        day_end=day_nums,
        sta_id=str(sta_id),
    )


def handle_html(year, month, sta_id=54511):
    now_date = datetime.date(year, month, 1)
    url = set_url(now_date, sta_id)
    header = {'User-Agent': random.choice(USER_AGENTS)}
    content = requests.get(url, headers=header).content
    soup_html(content)


def soup_html(html_page):
    soup = BeautifulSoup(html_page, 'html.parser')
    # Each sounding on the page is an <h2> title followed by two <pre> blocks:
    # the profile table and the derived-index listing.
    headers_1 = soup.find_all('h2')
    details_data = soup.find_all('pre')
    d1s = details_data[::2]   # profile tables
    d2s = details_data[1::2]  # derived indices
    for iheader, vheader in enumerate(headers_1):
        convert_one_time(vheader, d1s[iheader], d2s[iheader])


def convert_one_time(h1, d1, d2, filedir=None):
    # The <h2> text ends with the observation time, e.g. "00Z 01 Jul 2018".
    h1_list = h1.string.split()
    obser_time = pd.to_datetime(' '.join(h1_list[-4:]), format='%HZ %d %b %Y')
    if filedir is None:
        filedir = os.path.join(homedir, 'data', obser_time.strftime('%Y%m'))
    if not os.path.exists(filedir):
        try:
            os.makedirs(filedir)
        except OSError:
            pass
    filename1 = 'sounding_details_{}_{}.csv'.format(
        h1_list[1], obser_time.strftime('%Y%m%d%H'))
    filename2 = 'sounding_indices_{}_{}.csv'.format(
        h1_list[1], obser_time.strftime('%Y%m%d%H'))

    # Profile table: fixed-width text, 11 columns of 7 characters each.
    # NOTE: stripping '-' blanks the dashed separator lines, but it also removes
    # the minus sign from negative values such as sub-zero temperatures.
    detail_data = pd.read_fwf(
        StringIO(d1.string.replace('-', '')),
        widths=[7 for _ in range(11)], skiprows=[1, 4]
    )
    # Merge the variable-name and unit rows into single column labels.
    detail_data.columns = [
        '{}_{}'.format(i, j) for i, j in zip(detail_data.iloc[0], detail_data.iloc[1])
    ]
    detail_data.drop([0, 1], axis=0, inplace=True)
    detail_data.to_csv(os.path.join(filedir, filename1), index=False)

    # Derived indices: "name: value" lines, transposed into a single row.
    index_data = pd.read_csv(
        StringIO(d2.string), delimiter=':', names=['variable', 'value']).T
    index_data.columns = index_data.iloc[0]
    index_data.drop(['variable'], axis=0, inplace=True)
    index_data.to_csv(os.path.join(filedir, filename2), index=False)


if __name__ == '__main__':
    homedir = os.path.dirname(os.path.realpath(__file__))
    print(homedir)
    datadir = os.path.join(homedir, 'data')
    if not os.path.exists(datadir):
        try:
            os.makedirs(datadir)
        except OSError:
            pass
    # main('2000-01-01', '2018-07-01')
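The __main__ block above only prepares the data directory; the download call itself is left commented out. One possible way to run it, kept as a sketch (the date range, station, and month folder are illustrative), is to append something like the following to the end of the `if __name__ == '__main__':` block, where `datadir` is already defined:

import glob

# Download soundings for the default station 54511 from January through July 2018.
# freq='M' in main() yields month-end stamps, so the end date must reach the
# final day of the last month you want included.
main('2018-01-01', '2018-07-31', sta_id=54511)

# List the profile CSVs written for July 2018; each sounding also gets a
# matching sounding_indices_*.csv holding the derived indices.
for path in glob.glob(os.path.join(datadir, '201807', 'sounding_details_*.csv')):
    print(path, pd.read_csv(path).shape)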