Notes on web scraping with Python, part 1: Baidu Tieba
A key step in going deeper into natural language processing is collecting text and corpora from the web, so I have started experimenting with web crawlers.
I am beginning with the simplest targets that require no simulated login, such as Baidu Tieba and Douban.
The Firefox browser: compared with Chrome, I have recently found Firefox especially convenient for this. No extra packet-capture tool is needed; it monitors traffic in real time and the output is very readable.
Press F12 to open the network monitor:
Inspector: browse the whole page's HTML, with search support
Console: the parameters, responses, and cookies of GET/POST requests
Every parameter and record is shown in great detail.
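To confirm that the values read off the network monitor actually work, here is a minimal request sketch; the URL and the User-Agent string are placeholders, so copy the real values Firefox shows for your own session:

import urllib2

url = 'http://tieba.baidu.com/p/3138733512'  # placeholder thread URL
user_agent = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:53.0) '
              'Gecko/20100101 Firefox/53.0')
request = urllib2.Request(url, headers={'User-Agent': user_agent})
print urllib2.urlopen(request).getcode()  # 200 on success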
The Baidu Tieba spider
# -*- coding: utf-8 -*-
import urllib2
import re


class Tool:
    # Strips Tieba's HTML markup out of post content, leaving plain text.
    removeImg = re.compile('<img.*?>| {7}')             # images and Tieba's blank padding
    removeAddr = re.compile('<a.*?>|</a>')              # link tags
    replaceLine = re.compile('<tr>|<div>|</div>|</p>')  # tags that should become a newline
    replaceBR = re.compile('<br><br>|<br>')             # explicit line breaks
    replaceTD = re.compile('<td>')                      # table cells become tabs
    replacePara = re.compile('<p.*?>')                  # paragraph openers
    removeExtraTag = re.compile('<.*?>')                # whatever markup is left

    def replace(self, x):
        x = re.sub(self.removeImg, "", x)
        x = re.sub(self.removeAddr, "", x)
        x = re.sub(self.replaceLine, "\n", x)
        x = re.sub(self.replaceTD, "\t", x)
        x = re.sub(self.replacePara, "\n", x)
        x = re.sub(self.replaceBR, "\n", x)
        x = re.sub(self.removeExtraTag, "", x)
        return x.strip()
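
# A quick sanity check of Tool.replace on a made-up markup fragment:
#   Tool().replace('<div>line one<br>line two<td>cell</td><img src="x.png"></div>')
#   returns 'line one\nline two\tcell'
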
class BaiduTieba:
    def __init__(self, url, seelz, floortag=1):
        self.url = url
        self.seeLz = '?see_lz=' + str(seelz)
        self.tool = Tool()
        self.file = None
        self.defaultTitle = u"百度贴吧"
        self.floortag = floortag  # whether to prepend a floor marker to each post
        self.floor = 1
    def getPageContent(self, pagenum):
        url = self.url + self.seeLz + '&pn=' + str(pagenum)
        user_agent = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:53.0) '
                      'Gecko/20100101 Firefox/53.0')
        headers = {'User-Agent': user_agent}
        try:
            request = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(request)
            content = response.read().decode('utf-8')
            return content
        except urllib2.URLError, e:
            if hasattr(e, 'reason'):  # check whether the error carries a reason attribute
                print e.reason
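
    # With see_lz=1 (original-poster-only view), page 2 of a thread is fetched
    # from a URL of the form (thread id made up):
    #   http://tieba.baidu.com/p/3138733512?see_lz=1&pn=2
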
    def get_title(self, pagenum=1):
        content = self.getPageContent(pagenum)
        # the thread title sits in an <h3 class="core_title_txt ..."> node
        pattern_title = re.compile(r'<h3 class="core_title_txt.*?>(.*?)</h3>', re.S)
        title = re.search(pattern_title, content)
        if title:
            return title.group(1).strip()
        else:
            return None
    def get_author(self, pagenum=1):
        content = self.getPageContent(pagenum)
        # the original poster's name sits in an <a class="p_author_name ..."> node
        pattern_author = re.compile(r'<a.*?class="p_author_name.*?>(.*?)</a>', re.S)
        author = re.search(pattern_author, content)
        if author:
            return author.group(1).strip()
        else:
            return None
    def get_reply_page(self, pagenum=1):
        content = self.getPageContent(pagenum)
        # the reply count and total page count are two <span> nodes
        # inside the <li class="l_reply_num"> element
        pattern_page = re.compile(
            r'<li class="l_reply_num.*?<span.*?>(.*?)</span>.*?<span.*?>(.*?)</span>', re.S)
        totalpage = re.search(pattern_page, content)
        if totalpage:
            return totalpage.group(1).strip(), totalpage.group(2).strip()
        else:
            return None
    def getContent(self, pagenum):
        content = self.getPageContent(pagenum)
        # each floor's body is a <div id="post_content_..."> node
        pattern_content = re.compile(r'<div id="post_content_.*?>(.*?)</div>', re.S)
        items = re.findall(pattern_content, content)
        floor = 1
        contents = []
        for item in items:
            tempContent = '\n' + self.tool.replace(item) + '\n'
            if self.floortag == 1:
                # prepend the floor marker so each entry is one complete floor
                tempContent = 'Floor ' + str(floor) + ' ' + '-' * 20 + tempContent
            contents.append(tempContent.encode('utf-8'))
            floor += 1
        return contents
    def writedata2File(self, contents):
        for item in contents:
            print 'Writing the content of floor ' + str(self.floor)
            self.file.write(item)
            self.floor += 1
    def newFile(self, title):
        if title:
            self.file = open(title + '.txt', 'w+')
        else:
            self.file = open(self.defaultTitle + '.txt', 'w+')
    def start_spider(self, pagenum=1):
        # fetch the basic information from the first page
        title = self.get_title(pagenum)
        author = self.get_author(pagenum)
        self.newFile(title)
        totalpage = self.get_reply_page(pagenum)
        if not totalpage:
            print 'Could not parse the reply and page counts'
            return
        totalcontent = []
        for i in range(1, int(totalpage[1]) + 1):  # the second group is the page count string
            totalcontent += self.getContent(i)
        try:
            self.writedata2File(totalcontent)
        except IOError, e:
            print 'Exception while writing the file: ' + e.message
        finally:
            print 'Finished writing the file'
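
Test run: a minimal driver sketch; the thread id below is a placeholder, so substitute any real Tieba thread URL before running.

if __name__ == '__main__':
    baseURL = 'http://tieba.baidu.com/p/3138733512'  # placeholder thread URL
    spider = BaiduTieba(baseURL, seelz=1, floortag=1)
    spider.start_spider()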
Tips:
1. re.search(...).group(0, 1, 2, 3, ...): group(0) is the entire match; group(n) is the n-th capture group.
2. The user_agent and other header fields should be taken from the GET request parameters shown in Firefox's network monitor.
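
A quick illustration of tip 1:

import re
m = re.search(r'(\d+)-(\d+)', 'pages 12-34')
print m.group(0)  # '12-34' : the whole match
print m.group(1)  # '12'    : first capture group
print m.group(2)  # '34'    : second capture group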