java代码词频 java词频统计( 二 )


SetString set = new TreeSetString();
for(String word: rawWords){
set.add(word);
}
Iterator ite = set.iterator();
ListString wordsList = new ArrayListString();
ListInteger freqList = new ArrayListInteger();
//多少个字符串未知,所以用list来保存先
while(ite.hasNext()){
String word = (String) ite.next();
int count = 0;//统计相同字符串的个数
for(String str: rawWords){
if(str.equals(word)){
count++;
}
}
wordsList.add(word);
freqList.add(count++);
}
//存入数组当中
words = wordsList.toArray(new String[0]);
wordFreqs = new int[freqList.size()];
for(int i = 0; ifreqList.size(); i++){
wordFreqs[i] = freqList.get(i);
}
}
//根据词频 , 将词数组和词频数组进行降序排序
public void sort() {
class Word{
private String word;
private int freq;
public Word(String word, int freq){
this.word = word;
this.freq = freq;
}
}
//注意:此处排序 , 1)首先按照词频降序排列,2)如果词频相同,按照字母降序排列,
//如 'abc''ab' 'aa'
class WordComparator implements Comparator{
public int compare(Object o1, Object o2) {
Word word1 = (Word) o1;
Word word2 = (Word) o2;
if(word1.freqword2.freq){
return 1;
}else if(word1.freqword2.freq){
return -1;
}else{
int len1 = word1.word.trim().length();
int len2 = word2.word.trim().length();
【java代码词频 java词频统计】String min = len1len2? word2.word: word1.word;
String max = len1len2? word1.word: word2.word;
for(int i = 0; imin.length(); i++){
if(min.charAt(i)max.charAt(i)){
return 1;
}
}
return 1;
}
}
}
List wordList = new ArrayListWord();
for(int i = 0; iwords.length; i++){
wordList.add(new Word(words[i], wordFreqs[i]));
}
Collections.sort(wordList, new WordComparator());
for(int i = 0; iwordList.size(); i++){
Word wor = (Word) wordList.get(i);
words[i] = wor.word;
wordFreqs[i] = wor.freq;
}
}
//将排序结果输出
/**
 * Prints the number of distinct words, then one line per word consisting of its
 * frequency immediately followed by the word, in the current array order.
 */
public void printResult() {
    System.out.println("Total " + words.length + " different words in the content!");
    for (int i = 0; i < words.length; i++) {
        // The empty string forces string concatenation; no separator is emitted.
        System.out.println(wordFreqs[i] + "" + words[i]);
    }
}
//测试类的功能
/**
 * Demo entry point: builds an {@code Article} and runs its steps in sequence:
 * split into words, count frequencies, sort, and print.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    final Article article = new Article();

    article.splitWord();
    article.countWordFreq();
    article.sort();
    article.printResult();
}
}
-----------------------
Total 99 different words in the content!
5and
4the
4i
4a
3as
2with
2who
2to
2time
2sverak
2son
2s
2old
2of
2it
2in
2his
2czech
1zdenek
1year
1wrote
1writing
1won
1whining
1while
1wanted
1walked
1ve
1values
1though
1this
1these
1that
1than
1taking
1subtitles
1spend
1some
1so
1seen
1script
1saw
1russian
1richest
1remain
1rather
1production
1plays
1oscar
1one
1not
1more
1m
1likely
1life
1language
1kolya
1jan
1is
1increasingly
1impacted
1if
1higher
1high
1he
1golden
1globe
1foreign
1for
1five
1finds
1films
1film
1father
1english
1ends
1dramas
1directed
1delight
1days
1couple
1confirmed
1comparable
1characters

推荐阅读