spark|happybase的使用

一、启动thrift 因为用到了happybase，需要先在服务器上启动thrift服务。若要在关闭终端后让thrift继续运行，命令如下：

nohup hbase thrift -p 9090 start

二、读取hbase的代码
class GetHbase(object):
    """Read rows published on or after a start date from an HBase table.

    The table is reached through the ``happybase`` client; both
    ``happybase`` and the ``TimeMatch`` date-parsing helper must be
    available in the enclosing module.
    """

    def __init__(self, hostname, table_name, start_date):
        """Store the connection target and the date filter.

        Args:
            hostname: Host passed to ``happybase.Connection``.
            table_name: Name of the HBase table to scan.
            start_date: Inclusive lower bound compared against each row's
                ``cont:pubDate`` value.
        """
        self.hostname = hostname
        self.table_name = table_name
        self.start_date = start_date

    def getdata(self):
        """Scan the whole table and return the rows at or after start_date.

        Returns:
            A list of dicts, one per qualifying row, with the keys
            ``pubDate``, ``author``, ``content`` and ``Timedict`` (the
            result of ``TimeMatch`` applied to the publication date).

        Raises:
            KeyError: If a row has no ``cont:pubDate`` column, or a
                qualifying row has no ``cont:author`` column.
        """
        connection = happybase.Connection(self.hostname, autoconnect=False)
        connection.open()
        try:
            # Single-argument print(...) calls behave the same under the
            # Python 2 print statement and the Python 3 print function.
            print("已成功连接到Hbase")
            print("准备连接到表%s" % (self.table_name,))
            table = connection.table(self.table_name)
            # With no arguments, scan() iterates over every row and column.
            scanner = table.scan()
            print("已成功连接到Hbase中表%s" % (self.table_name,))
            print("开始读取%s之后的数据" % (self.start_date,))
            # The scanner is lazy, so it must be fully consumed before the
            # connection is closed in the finally clause.
            return self._collect(scanner)
        finally:
            connection.close()

    def _collect(self, rows):
        """Filter ``(row_key, data)`` pairs and build the result records.

        Rows older than ``self.start_date`` and rows without a
        ``cont:content`` column are skipped.
        """
        records = []
        for _row_key, data in rows:
            pub_date = data['cont:pubDate']
            # NOTE(review): plain ``<`` comparison; presumably dates are
            # stored in a lexicographically sortable text format - confirm.
            if pub_date < self.start_date:
                continue
            # Skip rows that have no content instead of reusing the
            # content of a previously read row.
            if 'cont:content' not in data:
                continue
            records.append({
                'pubDate': pub_date,
                'author': data['cont:author'],
                'content': data['cont:content'],
                # Parsed date parts (week number, month number, ...).
                'Timedict': TimeMatch(pub_date),
            })
        return records

三、调用 mydata为读出的数据
# Read the matching rows; host, table_name and start_date must already be
# defined by the calling script.
mydata = GetHbase(host, table_name, start_date).getdata()

    推荐阅读