python词频统计函数利用python进行词频统计 _词频

如何用python和jieba分词，统计词频？
#! python3
# -*- coding: utf-8 -*-
import os, codecs
import jieba
from collections import Counter
def get_words(txt):
    """Segment ``txt`` with jieba and print the 100 most frequent words.

    Single-character tokens and the CRLF line-break token are ignored.
    Each printed row holds the word (left-padded so short words line up),
    a bar of ``*`` characters (one per three occurrences) and the count.

    Args:
        txt: The text to analyse.
    """
    # Count every token in one pass; a plain assignment here would leave
    # every frequency stuck at 1.
    c = Counter(
        word for word in jieba.cut(txt)
        if len(word) > 1 and word != '\r\n'
    )
    print('常用词频度统计结果')
    for k, v in c.most_common(100):
        # Two spaces per missing character, since a CJK character occupies
        # two terminal columns; negative multipliers yield an empty string.
        padding = '  ' * (5 - len(k))
        print('%s%s %s%d' % (padding, k, '*' * (v // 3), v))
if __name__ == '__main__':
    # Read the whole sample file as UTF-8 text, then print its word
    # frequencies.
    with codecs.open('19d.txt', 'r', 'utf8') as f:
        txt = f.read()
    get_words(txt)
如何用python统计一个txt文件中各个单词出现的次数
1、首先，定义一个变量，保存要统计的英文文章。
2、接着，定义两个数组，保存文章中的单词，以及各单词的词频。
3、从文章中分割出所有的单词，保存在数组中。
4、然后，计算文章中单词的总数，保存在变量中。
5、用for循环，统计文章中各单词的词频。
6、最后，输出文章中各单词的词频。
7、运行程序，电脑会自动统计输入文章中各单词的词频。
如何用python实现英文短文的双词频统计
简单版：
#!/usr/bin/env python3
import re
import jieba
from collections import Counter
# Count English words and the remaining (Chinese / symbol) tokens of one file.
fname = 'counttest.txt'
with open(fname) as f:
    s = f.read()

# An English word: one or more letters, an optional hyphen, then any
# further letters (so "co-op" is a single word).
pattern = re.compile(r'[a-zA-Z]+\-?[a-zA-Z]*')
english_words = Counter(pattern.findall(s))
# Whatever is left after removing the English words is segmented by jieba.
other_words = Counter(jieba.cut(pattern.sub('', s)))

# The explicit "+" matters: adjacent literals are joined before "*" applies,
# which would repeat the whole header instead of just the dash.
print('\n英文单词统计结果：\n' + '-'*17)
print('\n'.join(['{}: {}'.format(i, j) for i, j in english_words.most_common()]))
print('\n中文及符号统计结果：\n' + '-'*19)
print('\n'.join(['{}: {}'.format(i, j) for i, j in other_words.most_common()]))
复杂版：
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function, division, unicode_literals
import sys, re, time, os, jieba
from collections import Counter
from datetime import datetime
class WordCounter(object):
    """Count English words and the remaining tokens of a text file.

    The file is read into memory in one go. English words go into the first
    counter; everything else (Chinese text, digits, punctuation, whitespace)
    goes into the second one, either as jieba tokens or as single characters.

    How to use:
        w = WordCounter('a.txt', 'b.txt')
        w.run()
    """

    # An English word: one or more letters, an optional hyphen, then any
    # further letters.
    _WORD_RE = re.compile(r'[a-zA-Z]+\-?[a-zA-Z]*')

    def __init__(self, from_file, to_file=None, coding=None, jieba_cut=None):
        """Validate the input file and set up empty counters.

        Args:
            from_file: Path of the file to read.
            to_file: Path of the file the result is written to. When it is
                None (or one of the strings 'None', 'Null', 'none', 'null')
                ``run`` prints the result instead.
            coding: Text encoding of ``from_file``. When None, chardet
                guesses it from the first 10000 bytes of the file.
            jieba_cut: Any value other than None enables jieba segmentation
                of the non-English remainder.

        Raises:
            Exception: If ``from_file`` is not an existing file.
        """
        if not os.path.isfile(from_file):
            raise Exception('No such file: 文件不存在')
        self.f1 = from_file
        self.filesize = os.path.getsize(from_file)
        self.f2 = to_file
        if coding is None:
            try:
                import chardet
            except ImportError:
                # NOTE(review): this installs a package at run time as a
                # side effect; kept for backward compatibility.
                os.system('pip install chardet')
                print('-'*70)
                import chardet
            with open(from_file, 'rb') as f:
                # chardet may report no encoding (e.g. for an empty file);
                # fall back to UTF-8 so decoding does not fail on None.
                coding = chardet.detect(f.read(10000))['encoding'] or 'utf-8'
        self.coding = coding
        self._c = [Counter(), Counter()]
        self.jieba = jieba_cut is not None

    def run(self):
        """Count the input file, then write or print the result.

        The elapsed time, formatted with one decimal and an ``s`` suffix,
        is stored in ``self.cost``.
        """
        start = time.time()
        self.count_direct(self.f1)
        if self.f2 not in ['None', 'Null', 'none', 'null', None]:
            with open(self.f2, 'wb') as f:
                f.write(self.result.encode(self.coding))
        else:
            print('\nEnglish words:\n' + '-'*15)
            print(self.result)
        self.cost = '{:.1f}'.format(time.time() - start) + 's'

    def count_direct(self, from_file):
        """Read ``from_file`` fully into memory and add its counts."""
        with open(from_file, 'rb') as f:
            data = f.read()
        # Parse once and feed each partial counter into its running total.
        for total, part in zip(self._c, self.parse(data)):
            total.update(part)

    def parse(self, line):
        """Decode ``line`` and split it into two counters.

        Args:
            line: Raw bytes read from the input file.

        Returns:
            A tuple ``(english, rest)``: a Counter of English words and a
            Counter of the remaining text, as jieba tokens when segmentation
            is enabled and as single characters otherwise.
        """
        text = line.decode(self.coding)
        # Re-join words that were hyphenated across a line break.
        text = re.sub(r'\-\n', '', text)
        english_words = self._WORD_RE.findall(text)
        rest = self._WORD_RE.sub('', text)
        # Count only the remainder, so English letters do not show up again
        # in the second counter.
        ex = Counter(jieba.cut(rest)) if self.jieba else Counter(rest)
        return Counter(english_words), ex

    def flush(self):
        """Discard all counts collected so far."""
        self._c = [Counter(), Counter()]

    @property
    def counter(self):
        """The two Counter objects: English words, then everything else."""
        return self._c

    @property
    def result(self):
        """The counts as text, identical to what is written to the file.

        English words come first, one ``word: count`` per line in
        descending frequency, followed by a header and the second counter.
        """
        sections = [
            '\n'.join('{}: {}'.format(i, j) for i, j in c.most_common())
            for c in self._c
        ]
        tip = '\n\n中文及符号统计结果:\n' + '-'*15 + '\n'
        return tip.join(sections)
def humansize(size):
    """Convert a byte count into a string with a unit.

    The value is integer-divided by 1024 until it drops below 1024 or the
    largest unit (``T``) is reached.

    >>> humansize(1024) == '1 KB'
    True
    >>> humansize(1000) == '1000 B'
    True
    >>> humansize(1024*1024) == '1 M'
    True
    >>> humansize(1024*1024*1024*2) == '2 G'
    True

    Args:
        size: Size in bytes.

    Returns:
        A string such as ``'1 KB'``.
    """
    units = ['B', 'KB', 'M', 'G', 'T']
    for unit in units[:-1]:
        if size < 1024:
            return '{} {}'.format(size, unit)
        size = size // 1024
    # Anything this large stays in the last unit instead of being divided
    # one more time.
    return '{} {}'.format(size, units[-1])
def main():
    """Command-line entry point.

    Expected arguments: ``from_file [to_file] [coding=NAME] [jieba_cut=VALUE]``.
    Without ``to_file`` the result is printed. Any ``key=value`` argument
    whose key is ``coding`` or ``jieba_cut`` overrides the default of that
    option; the value is passed on as a string.
    """
    if len(sys.argv) < 2:
        print('Usage: python wordcounter.py from_file to_file')
        sys.exit(1)
    from_file = sys.argv[1]
    # to_file is optional; unpacking sys.argv[1:3] would fail without it.
    to_file = sys.argv[2] if len(sys.argv) > 2 else None
    args = {'coding': None, 'jieba_cut': 1}
    for arg in sys.argv:
        for key in args:
            found = re.search(r'{}=(.+)'.format(key), arg)
            if found:
                args[key] = found.group(1)
    w = WordCounter(from_file, to_file, **args)
    w.run()
if __name__ == '__main__':
    # Run the module's doctests first, then the command-line entry point.
    import doctest
    doctest.testmod()
    main()
更复杂的：如果是比较大的文件，建议采用多进程，详情百度：多进程读取大文件并统计词频 jaket5219999
如何用python对文章中文分词并统计词频
1、全局变量在函数中使用时需要加入global声明
2、获取网页内容存入文件时的编码为ascii，进行正则匹配时需要decode为GB2312；当匹配到的中文写入文件时，需要encode成GB2312再写入文件。
3、中文字符匹配过滤正则表达式为ur'[一-龥]+'，使用findall找到所有的中文字符存入分组
4、KEY，Value值可以使用dict存储，排序后可以使用list存储
5、字符串处理使用split分割，然后使用index截取字符串，判断哪些是名词和动词
6、命令行使用需要导入os,os.system(cmd)
python词频统计函数的介绍就聊到这里吧，感谢你花时间阅读本站内容，更多关于利用python进行词频统计、python词频统计函数的信息别忘了在本站进行查找喔。

python词频统计函数利用python进行词频统计

推荐阅读

wa，鞋湿了

酸辣洋姜怎么做

这个重要提醒事关每个人！

ipod|库克：乔布斯教会了我们所有人如何飞翔

鱼可以怎么做好吃

直播电影需要电脑什么配置，直播电影需要版权吗

没有证据医疗事故怎样处理

文件无法粉碎删除解决方法

热水器哪个牌子好汉诺威电热水器哪家售后服务好

月子中心多少钱一月东莞十月后月子中心价格

遇到11升威能热水器显示f2怎么办,按步骤来轻松解决

信用卡逾期了银行要起诉怎么办

想换台手机,iPhoneX和iPhoneXR,选哪个比较好？

宝马m2雷霆版马力多大宝马m2和宝马m2雷霆版有什么区别

归园田居其三东晋陶渊明归园田居其三

Android|Android 利用OpenCV制作人脸检测APP

爱普生680kii清零

支付宝沙箱小程序报未授权，支付宝支付沙箱环境

头像上传jquery，头像上传失败请检查网络抖音

有胃寒但是有点上火要怎么办呢

python词频统计函数 利用python进行词频统计

推荐阅读

python词频统计函数利用python进行词频统计