红海战役与战狼2影评分析

一、数据爬取 使用 pyspider 从豆瓣抓取两部电影的影评,并存入 MySQL 数据库。
pyspider脚本:

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-03-10 21:44:35
# Project: douban_movie

from pyspider.libs.base_handler import *
from pyspider.database.mysql.pymysql import SQL

# Comment-list URL of the Douban subject being crawled. The first placeholder
# is the offset of the first comment on the page, the second the page size.
COMMENTS_URL = 'https://movie.douban.com/subject/26861685/comments?start=%d&limit=%d'
PAGE_SIZE = 20


class Handler(BaseHandler):
    """Crawl the short comments of one Douban movie and store them in MySQL.

    Flow: ``on_start`` fetches the first comment page, ``getCount`` reads the
    total number of comments and schedules one request per page,
    ``detail_page`` extracts the comment texts, and ``on_result`` inserts each
    extracted comment into the ``movie_comment`` table.
    """

    crawl_config = {
        'headers': {
            'Connection': 'keep-alive',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh; q=0.8',
            'content-type': 'application/x-www-form-urlencoded',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
        }
    }

    def on_start(self):
        """Request the first comment page to discover the total comment count."""
        self.crawl(COMMENTS_URL % (0, PAGE_SIZE), callback=self.getCount)

    def getCount(self, response):
        """Schedule one crawl per comment page.

        The count is parsed from the text of the active tab's ``<span>`` by
        dropping its first three characters and its last one.
        NOTE(review): this assumes the label keeps that exact shape (a
        three-character prefix and one closing character around the number);
        confirm against the live page.
        """
        comment_count = int(response.doc('.is-active > span').text()[3:-1])
        # Ceiling division: a trailing partial page must be crawled too.
        # Plain truncation skipped the last comments whenever the total was
        # not a multiple of PAGE_SIZE.
        page_count = (comment_count + PAGE_SIZE - 1) // PAGE_SIZE
        for page in range(page_count):
            self.crawl(COMMENTS_URL % (page * PAGE_SIZE, PAGE_SIZE),
                       callback=self.detail_page)

    @config(priority=2)
    def detail_page(self, response):
        """Return one ``{"comment": text}`` dict per comment paragraph on the page."""
        return [{"comment": item.text()}
                for item in response.doc('.comment > p').items()]

    def on_result(self, result):
        """Insert every extracted comment into the ``movie_comment`` table.

        Called with whatever a callback returned; empty or missing results
        are ignored.
        """
        if not result:
            return
        sql = SQL()
        for item in result:
            sql.insert('movie_comment', **item)

需要在/usr/lib/python2.7/site-packages/pyspider/database/mysql/下放入以下文件
文件名:pymysql.py
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# This file is itself named pymysql.py and lives inside a package. Under
# Python 2 implicit relative imports, ``import pymysql`` would resolve to this
# very module instead of the PyMySQL library, so force absolute imports.
from __future__ import absolute_import, print_function

from six import itervalues
import pymysql
import pymysql.cursors


class SQL():
    """Minimal MySQL helper used by the crawler to insert rows.

    ``self.connection`` is True only when the connection was established;
    ``insert`` is a no-op returning False otherwise.
    """

    # Initialise the database connection.
    def __init__(self, host='localhost', user='root', password='123456',
                 db='data_analyse', charset='utf8'):
        """Open the connection and a dict cursor.

        All parameters default to the values that were previously hard-coded,
        so ``SQL()`` behaves as before. Connection failures are reported on
        stdout and leave ``self.connection`` False instead of raising.
        NOTE(review): credentials in source are a security risk; prefer
        passing them in from configuration.
        """
        self.connection = False
        # Defined up front so the attributes exist even when connecting fails.
        self.conn = None
        self.cursor = None
        try:
            self.conn = pymysql.connect(host=host,
                                        user=user,
                                        password=password,
                                        db=db,
                                        charset=charset,
                                        cursorclass=pymysql.cursors.DictCursor)
            self.cursor = self.conn.cursor()
            self.cursor.execute("set names utf8")
            self.connection = True
        except Exception as e:
            # Best-effort by design: report the cause rather than crash.
            print("Cannot Connect To Mysql!", e)

    def escape(self, string):
        """Quote ``string`` as a MySQL identifier (table or column name).

        Embedded backticks are doubled so the name cannot break out of the
        quoting.
        """
        return '`%s`' % ('%s' % string).replace('`', '``')

    # Insert a row into the database.
    def insert(self, tablename=None, **values):
        """Insert one row into ``tablename`` and commit.

        Column names come from the keyword names, and values are passed as
        query parameters so the driver escapes them. With no keyword
        arguments a row made entirely of column defaults is written.

        Returns True on success, False when not connected or when the
        statement fails (the failed transaction is rolled back).
        """
        if not self.connection:
            return False

        tablename = self.escape(tablename)
        if values:
            _keys = ",".join(self.escape(k) for k in values)
            _values = ",".join(['%s'] * len(values))
            sql_query = "insert into %s (%s) values (%s)" % (tablename, _keys, _values)
        else:
            # MySQL has no "DEFAULT VALUES" clause; an empty column/value
            # list is its spelling for "all defaults".
            sql_query = "replace into %s () values ()" % tablename

        try:
            if values:
                self.cursor.execute(sql_query, list(itervalues(values)))
            else:
                self.cursor.execute(sql_query)
            self.conn.commit()
            return True
        except Exception as e:
            print('An Error Occured: ', e)
            try:
                self.conn.rollback()
            except Exception:
                # The connection may already be unusable; nothing more to do.
                pass
            return False

    def close(self):
        """Close the cursor and connection if they were opened."""
        if self.cursor is not None:
            self.cursor.close()
            self.cursor = None
        if self.conn is not None:
            self.conn.close()
            self.conn = None
        self.connection = False

红海战役与战狼2影评分析
文章图片
image.png 【红海战役与战狼2影评分析】二、数据清洗
后面要用到 jieba 分词,需要先把评论中的标点符号都去掉。

    推荐阅读