hd1

【hd1】获取汉典字符地址列表

# -*- coding: utf-8 -*-
"""Collect the URLs of the character pages on zdic.net (Handian).

The script downloads the radical index page, pulls every run of three
percent-escapes out of it (the percent-encoded radicals, per the original
author's notes), requests the character listing for each radical, and
writes one absolute character-page URL per line to
``zdurlfile_jibenjieshi`` in the current working directory.
"""
import re
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup  # noqa: F401 -- imported by the original script, unused here

SITE_ROOT = "http://www.zdic.net/"
RADICAL_INDEX_URL = "http://www.zdic.net/z/jbs/"
RADICAL_QUERY_URL = "http://www.zdic.net/z/jbs/bs/?bs="
OUTPUT_PATH = "zdurlfile_jibenjieshi"

# Three consecutive percent-escapes, e.g. "%E4%B8%80".
RADICAL_PATTERN = re.compile(r"(?:%[^%']{2}){3}")
# Site-relative links to character pages, e.g. "/z/15/js/4E00.htm".
CHAR_PAGE_PATTERN = re.compile(r"/z/[^']*?\.htm")


def _fetch_text(url, referer=None):
    """Download *url* and return its body decoded as UTF-8.

    Args:
        url: Absolute URL to request.
        referer: Optional value for the ``Referer`` request header.

    Returns:
        The response body as ``str``.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    request = urllib.request.Request(url)
    if referer is not None:
        request.add_header("Referer", referer)
    # The context manager closes the connection even if read()/decode() raises.
    with urllib.request.urlopen(request) as response:
        return response.read().decode("utf8")


# ``with`` guarantees the output file is flushed and closed even when a
# request fails part-way through the crawl.
with open(OUTPUT_PATH, "w", encoding="utf-8") as outfile:
    index_html = _fetch_text(RADICAL_INDEX_URL)

    for radical in RADICAL_PATTERN.findall(index_html):
        # The per-radical listing is requested with the index page as Referer,
        # exactly as the original script did.
        listing_html = _fetch_text(
            RADICAL_QUERY_URL + radical, referer=RADICAL_INDEX_URL
        )

        for char_path in CHAR_PAGE_PATTERN.findall(listing_html):
            # char_path already starts with "/"; urljoin avoids the doubled
            # slash ("http://www.zdic.net//z/...") that plain concatenation
            # with SITE_ROOT produced.
            outfile.write(urllib.parse.urljoin(SITE_ROOT, char_path) + "\n")
            print(char_path)

    推荐阅读