基于python实现垂直爬虫系统的方法详解

【基于python实现垂直爬虫系统的方法详解】html_downloader

from urllib import request


def download(url):
    """Fetch *url* over HTTP and return the raw response body.

    Args:
        url: Absolute URL string, or None.

    Returns:
        The response body as bytes, or None when *url* is None or the
        server answered with a non-200 status.

    Raises:
        urllib.error.URLError: on network/DNS failures (propagated,
        matching the original behavior).
    """
    if url is None:
        # Original used a bare `return`; normalized to an explicit None.
        return None
    # `with` closes the HTTP response even on early return — the original
    # never closed it, leaking the socket.
    with request.urlopen(url) as response:
        if response.getcode() != 200:
            return None
        return response.read()

html_outputer
# Module-level buffer of parsed-article dicts collected during a crawl.
data_list = []


def collect_data(data):
    """Buffer one parsed-article dict (keys: url, title, datetime, visitcount)."""
    if data is None:
        # Ignore empty parse results rather than storing None entries.
        return
    data_list.append(data)


def output_html():
    """Dump every collected record into output.html as one HTML table row each.

    NOTE(review): the HTML tags in the original write() literals were
    stripped by the article scraper (leaving '' % item[...] calls); the
    table markup below is the conventional reconstruction — confirm
    against the original tutorial if exact markup matters.
    """
    # `with` + explicit utf-8 replaces the original's bare open()/close(),
    # which broke on non-ASCII titles under a non-utf-8 default locale.
    with open('output.html', 'w', encoding='utf-8') as fout:
        fout.write('<html>')
        fout.write('<body>')
        fout.write('<table>')
        for item in data_list:
            fout.write('<tr>')
            fout.write('<td>%s</td>' % item['url'])
            fout.write('<td>%s</td>' % item['title'])
            fout.write('<td>%s</td>' % item['datetime'])
            fout.write('<td>%s</td>' % item['visitcount'])
            fout.write('</tr>')
        fout.write('</table>')
        fout.write('</body>')
        fout.write('</html>')

html_parser
import re
from urllib.parse import urljoin

from bs4 import BeautifulSoup


def get_new_urls(page_url, soup):
    """Collect absolute article URLs from *soup* matching /<digits>/<digits>/<word>/page.htm.

    Args:
        page_url: URL the page was fetched from; base for resolving relative hrefs.
        soup: parsed BeautifulSoup document.

    Returns:
        A set of absolute URL strings.
    """
    # The scraped source read `href=https://www.it610.com/article/re.compile(...)`;
    # the injected site URL is scraper damage — the real argument is the regex.
    new_urls = set()
    links = soup.find_all('a', href=re.compile(r'/\d+/\d+/\w+/page\.htm'))
    for link in links:
        new_full_url = urljoin(page_url, link['href'])
        new_urls.add(new_full_url)
    return new_urls


def get_new_data(page_url, soup):
    """Extract title/datetime/visitcount/url from an article page.

    Returns an empty dict when the page has no article title node
    (i.e. it is not an article page).
    """
    res_data = {}  # scraper had injected a URL before `{}` — restored
    title_node = soup.find('h1', class_='arti-title')
    if title_node is None:
        return res_data
    res_data['title'] = title_node.get_text()
    # Guard the secondary nodes: the original dereferenced them blindly
    # and raised AttributeError on pages missing either span.
    datetime_node = soup.find('span', class_='arti-update')
    res_data['datetime'] = datetime_node.get_text() if datetime_node else ''
    visitcount_node = soup.find('span', class_='WP_VisitCount')
    res_data['visitcount'] = visitcount_node.get_text() if visitcount_node else ''
    res_data['url'] = page_url
    return res_data


def parse(page_url, html_cont):
    """Parse downloaded HTML into (new_urls, new_data); None on empty input.

    Args:
        page_url: URL the content was fetched from.
        html_cont: raw HTML (bytes or str) as returned by the downloader.

    Returns:
        Tuple (set of follow-up URLs, dict of extracted fields), or None
        when either argument is None.
    """
    if page_url is None or html_cont is None:
        return None
    # from_encoding only applies when html_cont is bytes (the downloader
    # returns bytes); BeautifulSoup ignores it for str input.
    soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
    new_urls = get_new_urls(page_url, soup)
    new_data = get_new_data(page_url, soup)  # injected URL removed here too
    return new_urls, new_data

spider_main
# NOTE(review): the scraped article pasted this entire module twice;
# the duplicate copy has been removed.
import urls_manager, html_downloader, \
    html_parser, html_outputer


def craw(root_url):
    """Crawl up to 10 pages breadth-first starting from *root_url*.

    Drives the four collaborating modules: URL bookkeeping
    (urls_manager), fetching (html_downloader), extraction
    (html_parser) and report generation (html_outputer).
    """
    count = 1
    urls_manager.add_new_url(root_url)
    # 启动爬虫循环 -> start the crawl loop
    while urls_manager.has_new_url():
        new_url = urls_manager.get_new_url()
        print('craw %d : %s' % (count, new_url))
        html_cont = html_downloader.download(new_url)
        if html_cont is None:
            # download() returns None on non-200 responses; the original
            # fed that into parse() and crashed unpacking its None result.
            continue
        new_urls, new_data = html_parser.parse(new_url, html_cont)
        urls_manager.add_new_urls(new_urls)
        if new_data:
            html_outputer.collect_data(new_data)
        if count == 10:
            break
        count = count + 1
    html_outputer.output_html()


if __name__ == '__main__':
    root_url = 'http://news.zzuli.edu.cn/'
    craw(root_url)

test_64
import re

from bs4 import BeautifulSoup

# NOTE(review): the tags inside this literal were stripped by the article
# scraper; this is the canonical BeautifulSoup demo document the visible
# text ("three little sisters", Elsie/Lacie/Tillie, class 'title'/'story',
# href http://example.com/lacie) comes from — confirm against the original.
html_doc = """<html><head><title>The Dormouse's story - 锐客网</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>
"""

soup = BeautifulSoup(html_doc, 'html.parser')

# 获取所有链接 -> fetch every anchor
print('获取所有链接')
links = soup.find_all('a')
for link in links:
    print(link.name, link['href'], link.get_text())

# Exact-match lookup by href value.
print('获取lacie链接')
link_node = soup.find('a', href='http://example.com/lacie')
print(link_node.name, link_node['href'], link_node.get_text())

# Regex lookup: the scraped source showed an injected site URL before
# re.compile — the real argument is just the compiled pattern.
print('正则匹配')
link_node = soup.find('a', href=re.compile(r'ill'))
print(link_node.name, link_node['href'], link_node.get_text())

# Class-based lookup (class_ avoids the `class` keyword clash).
print('获取P段落文字')
p_node = soup.find('p', class_='title')
print(p_node.name, p_node.get_text())

urls_manager
# Pending URLs not yet crawled.
new_urls = set()
# URLs already handed out for crawling.
old_urls = set()


def add_new_url(url):
    """Queue *url* for crawling unless it is None or was seen before."""
    if url is None:
        return
    unseen = url not in new_urls and url not in old_urls
    if unseen:
        new_urls.add(url)


def add_new_urls(urls):
    """Queue each URL from an iterable; no-op for None or an empty batch."""
    if not urls:
        return
    for candidate in urls:
        add_new_url(candidate)


def get_new_url():
    """Pop an arbitrary pending URL, mark it as seen, and return it."""
    url = new_urls.pop()
    old_urls.add(url)
    return url


def has_new_url():
    """Return True while at least one URL is still pending."""
    return bool(new_urls)

总结 本篇文章就到这里了,希望能够给你带来帮助,也希望您能够多多关注脚本之家的更多内容!

    推荐阅读