tf-idf代码java tfjs _String

把测试集里的文本用tfidf算法提取出关键词，求个java代码//直接粘贴就行。
import java.awt.BorderLayout;
import java.awt.EventQueue;
import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;
import javax.swing.BorderFactory;
import javax.swing.JButton;
import javax.swing.JFileChooser;
import javax.swing.JFrame;
import javax.swing.JOptionPane;
import javax.swing.JPanel;
import javax.swing.JScrollPane;
import javax.swing.JSplitPane;
import javax.swing.JTextArea;
import javax.swing.border.EtchedBorder;
import javax.swing.filechooser.FileFilter;
/**
 * Small Swing tool for keyword-based sentence extraction.
 *
 * <p>The user imports a text file; the text is split into punctuation-terminated
 * sentences, and the two lower panes show the sentences that contain at least one
 * keyword from each of two hard-coded keyword lists. Both result panes can be
 * exported to a single text file.
 */
public class Application2 extends JFrame implements Cloneable {

    /**
     * Characters that end a sentence: ASCII {@code . , ? !} followed by the
     * ideographic full stop (U+3002) and the full-width {@code ! ? ,}
     * (U+FF01, U+FF1F, U+FF0C). Written as escapes so the class does not depend
     * on the source-file encoding.
     */
    private static final String SENTENCE_DELIMITERS = ".,?!\u3002\uFF01\uFF1F\uFF0C";

    private final JTextArea input;
    private final JTextArea output1;
    private final JTextArea output2;
    private final JButton open;
    private final JButton save;
    private final String[] keyWords1;
    private final String[] keyWords2;

    /** Builds the window: input pane on top, two result panes below, buttons at the bottom. */
    public Application2() {
        this.setDefaultCloseOperation(EXIT_ON_CLOSE);
        this.setSize(800, 700);
        this.setLayout(new BorderLayout());
        keyWords1 = new String[] {"那么", "还是", "sdf"};
        keyWords2 = new String[] {"所以", "而且"};

        input = new JTextArea();
        output1 = new JTextArea();
        output2 = new JTextArea();
        JPanel ip = titledPanel(input, "输入文本");
        JPanel o1p = titledPanel(output1, "以下为");
        JPanel o2p = titledPanel(output2, "以下为");

        JSplitPane split1 = new JSplitPane(JSplitPane.HORIZONTAL_SPLIT, o1p, o2p);
        split1.setDividerLocation(350);
        JSplitPane split2 = new JSplitPane(JSplitPane.VERTICAL_SPLIT, ip, split1);
        split2.setDividerLocation(300);
        this.add(split2, BorderLayout.CENTER);

        open = new JButton("导入");
        open.addActionListener(new ActionListener() {
            @Override
            public void actionPerformed(ActionEvent e) {
                importFile();
            }
        });
        save = new JButton("导出");
        save.addActionListener(new ActionListener() {
            @Override
            public void actionPerformed(ActionEvent e) {
                exportFile();
            }
        });

        JPanel buttonPane = new JPanel();
        buttonPane.add(open);
        buttonPane.add(save);
        this.add(buttonPane, BorderLayout.SOUTH);
    }

    /**
     * Splits the current content of the input pane into sentences.
     *
     * @return the punctuation-terminated fragments of the input text, in order
     * @see #splitSentences(String)
     */
    public String[] getSentences() {
        return splitSentences(input.getText());
    }

    /**
     * Collects the sentences that contain at least one of the given keywords.
     *
     * @param key keywords to look for
     * @param sentences candidate sentences
     * @return the matching sentences, each followed by a newline, in input order
     */
    public String getKeySentences(String[] key, String[] sentences) {
        return selectKeySentences(key, sentences);
    }

    /**
     * Splits {@code text} into fragments that each end with a delimiter from
     * {@link #SENTENCE_DELIMITERS}. A newline discards the unterminated fragment
     * before it, and text after the last delimiter is likewise not returned.
     *
     * @param text the text to split
     * @return the fragments, each including its terminating delimiter
     */
    static String[] splitSentences(String text) {
        List<String> sentences = new ArrayList<String>();
        int start = 0;
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            if (c == '\n') {
                start = i + 1;
            } else if (SENTENCE_DELIMITERS.indexOf(c) >= 0) {
                sentences.add(text.substring(start, i + 1));
                start = i + 1;
            }
        }
        return sentences.toArray(new String[0]);
    }

    /**
     * Returns every sentence that contains at least one keyword; a sentence is
     * emitted once even if several keywords match.
     *
     * @param key keywords to look for
     * @param sentences candidate sentences
     * @return the matching sentences, each followed by {@code '\n'}
     */
    static String selectKeySentences(String[] key, String[] sentences) {
        StringBuilder result = new StringBuilder();
        for (String sentence : sentences) {
            for (String keyword : key) {
                if (sentence.contains(keyword)) {
                    result.append(sentence).append('\n');
                    break;
                }
            }
        }
        return result.toString();
    }

    /**
     * Reads a whole text file, normalising every line ending to {@code '\n'}.
     *
     * @param file the file to read (decoded with the platform default charset)
     * @return the file content, each line followed by {@code '\n'}
     * @throws IOException if the file cannot be opened or a read error occurs
     */
    static String readText(File file) throws IOException {
        StringBuilder text = new StringBuilder();
        try (Scanner sc = new Scanner(file)) {
            while (sc.hasNextLine()) {
                text.append(sc.nextLine()).append('\n');
            }
            // Scanner swallows read errors; surface them instead of returning partial text.
            IOException failure = sc.ioException();
            if (failure != null) {
                throw failure;
            }
        }
        return text.toString();
    }

    /** Lets the user pick a file, loads it into the input pane and refreshes both result panes. */
    private void importFile() {
        JFileChooser chooser = newTextFileChooser();
        if (chooser.showOpenDialog(this) != JFileChooser.APPROVE_OPTION) {
            return;
        }
        File file = chooser.getSelectedFile();
        if (file == null) {
            return;
        }
        try {
            input.setText(readText(file));
        } catch (IOException e) {
            showError("无法读取文件", e);
            return;
        }
        String[] sentences = getSentences();
        output1.setText(getKeySentences(keyWords1, sentences));
        output2.setText(getKeySentences(keyWords2, sentences));
    }

    /** Lets the user pick a target file and writes both result panes to it, first pane first. */
    private void exportFile() {
        JFileChooser chooser = newTextFileChooser();
        if (chooser.showSaveDialog(this) != JFileChooser.APPROVE_OPTION) {
            return;
        }
        File file = chooser.getSelectedFile();
        if (file == null) {
            return;
        }
        try (PrintWriter pw = new PrintWriter(file)) {
            pw.print(output1.getText());
            pw.print(output2.getText());
            // PrintWriter never throws on write; checkError() flushes and reports failures.
            if (pw.checkError()) {
                throw new IOException("failed to write " + file);
            }
        } catch (IOException e) {
            showError("无法写入文件", e);
        }
    }

    /** Creates a single-selection chooser rooted at the working directory with a .txt filter. */
    private static JFileChooser newTextFileChooser() {
        JFileChooser chooser = new JFileChooser(".");
        chooser.setMultiSelectionEnabled(false);
        chooser.addChoosableFileFilter(new TextFileFilter());
        return chooser;
    }

    /** Wraps a text area in a scroll pane inside a panel with an etched, titled border. */
    private static JPanel titledPanel(JTextArea area, String title) {
        JPanel panel = new JPanel(new BorderLayout());
        panel.add(new JScrollPane(area), BorderLayout.CENTER);
        panel.setBorder(BorderFactory.createTitledBorder(
                BorderFactory.createEtchedBorder(EtchedBorder.LOWERED), title));
        return panel;
    }

    /** Reports an I/O failure to the user in a modal dialog. */
    private void showError(String message, IOException cause) {
        JOptionPane.showMessageDialog(
                this, message + "\n" + cause.getMessage(), "错误", JOptionPane.ERROR_MESSAGE);
    }

    /** Accepts directories and files named {@code <something>.txt}. */
    private static final class TextFileFilter extends FileFilter {
        @Override
        public boolean accept(File file) {
            if (file.isDirectory()) {
                return true;
            }
            String name = file.getName();
            // At least one character must precede the ".txt" extension.
            return name.length() >= 5 && name.endsWith(".txt");
        }

        @Override
        public String getDescription() {
            return "文本文件";
        }
    }

    public static void main(String... args) {
        EventQueue.invokeLater(new Runnable() {
            @Override
            public void run() {
                new Application2().setVisible(true);
            }
        });
    }
}
详解 one-hot 和 TF-IDF：one-hot 和 TF-IDF 是提取文本特征最为常见的方法，下文主要介绍它们的主要思想以及优缺点。
1.1 one-hot编码
one-hot 编码，又称独热编码、一位有效编码。其方法是使用N位状态寄存器来对N个状态进行编码，每个状态都有它独立的寄存器位，并且在任意时候，其中只有一位有效。举个例子，假设我们有三个样本（行），每个样本有三个特征（列）：
上表中我们已经对每个特征进行了普通的数字编码：我们的feature_1有两种可能的取值，比如是男/女，这里男用0表示，女用1表示。那么one-hot编码是怎么做的呢？
我们再拿feature_2来说明：这里feature_2有4种取值（状态），我们就用4个状态位来表示这个特征，one-hot编码就是保证每个样本中的单个特征只有1位处于状态1，其他的都是0。
对于两种状态、三种状态、甚至更多状态都是这样表示，所以我们可以得到这些样本特征的新表示：
one-hot编码将每个状态位都看成一个特征。于是我们可以得到它们的特征向量分别为:
1.2 one-hot在提取文本特征上的应用
one-hot在特征提取上属于词袋模型（bag of words）。关于如何使用one-hot抽取文本特征向量，我们通过以下例子来说明。假设我们的语料库中有三段话：
　　我爱中国
　　爸爸妈妈爱我
　　爸爸妈妈爱中国
我们首先对语料库分离并获取其中所有的词，然后对每个词进行编号：
　　1 我； 2 爱； 3 爸爸； 4 妈妈； 5 中国
然后使用one-hot对每段话提取特征向量：
因此我们得到了最终的特征向量为
优缺点分析：
优点：
缺点：
sklearn实现one hot encode
注意：假如要进行编码的数据没有出现在对应列中将会出现错误
TF-IDF是信息检索（IR）中最常用的一种文本表示法。算法的思想很简单，就是统计每个词出现的词频（TF），然后再为其附上一个权值参数（IDF）。举个例子：
现在假设我们要统计一篇文档中的前10个关键词，应该怎么下手？首先想到的是统计一下文档中每个词出现的频率（TF），词频越高，这个词就越重要。但是统计完你可能会发现你得到的关键词基本都是“的”、“是”、“为”这样没有实际意义的词（停用词），这个问题怎么解决呢？你可能会想到为每个词都加一个权重，像这种“停用词”就加一个很小的权重（甚至是置为0），这个权重就是IDF。下面再来看看公式：
优缺点分析
优点：简单快速，结果比较符合实际
缺点：单纯考虑词频，忽略了词与词的位置信息以及词与词之间的相互关系。
sklearn 实现 tfidf
Android Studio运行Java文件，一直显示一个在Gradle build using task[我的项目名]
特征抽取完后，因为每个词语对实体的贡献度不同，所以需要对这些词语赋予不同的权重。计算词项在向量中的权重的方法是TF-IDF。
它表示TF（词频）和IDF（倒文档频率）的乘积。
求TF-IDF算法的C或java源码。之前写过的，请加分。
#include <map>
#include <set>
#include <string>
#include <iostream>
#include <fstream>
#include <vector>
#include <cmath>
#include <algorithm>
using namespace std;

// Maps a lowercased word to its IDF value. While GenerateIDF() is running it
// temporarily holds the number of corpus documents that contain the word.
map<string, float> IDFTable;
// One scored word of the analysed text.
struct Words{
string wd;      // the word itself
float freq;     // term frequency: occurrences / total counted words (set in GenerateTF)
float weight;   // TF-IDF weight: freq * idf (set in GenerateTF)
};
bool cmp(Words w1,Wordsw2)
{
return w1.weightw2.weight;
}
mapstring,int WordTable;
vectorWords WordList;
char Comment[]=",.!\"?;:()";
int totalText=0;
// Returns true when every character of cs is an ASCII digit.
// An empty string has no non-digit character and therefore also yields true.
bool IsAllNumber(const std::string &cs)
{
    for (std::string::size_type i = 0; i < cs.length(); i++)
    {
        if (cs[i] < '0' || cs[i] > '9')
            return false;
    }
    return true;
}
// Returns true when cs consists only of spaces and tabs (or is empty).
bool Isblank(const std::string &cs)
{
    for (std::string::size_type i = 0; i < cs.length(); i++)
    {
        if (cs[i] != ' ' && cs[i] != '\t')
            return false;
    }
    return true;
}
// Returns a copy of cs with the ASCII letters A-Z converted to lower case;
// every other character is left untouched.
std::string ToLower(std::string cs)
{
    for (std::string::size_type i = 0; i < cs.length(); i++)
    {
        if (cs[i] >= 'A' && cs[i] <= 'Z')
            cs[i] += ('a' - 'A');
    }
    return cs;
}
void readFile(string fname,setstring wds)
{
ifstream fin(fname.c_str());
string word;
wds.clear();
while (!fin.eof())
{
finword;
for (int i=0;Comment[i]!=0;i)
{
int pos;
while((pos=word.find(Comment[i]))!=-1)
{
word.replace(pos,1,"");
}
}
//the world;
if(!IsAllNumber(word)!Isblank(word))
{
wds.insert(ToLower(word));
}
/*totalwords;
*/
}
fin.close();
}
// Builds IDFTable from the fixed seven-file corpus.
// First pass: for every word, count the number of corpus files containing it
// (readFile yields each word once per file). Second pass: replace that count
// df with idf = log(totalText / (df + 1)), where totalText is the file count.
void GenerateIDF()
{
    const int fileCount = 7;
    const std::string files[fileCount] = {"curious.txt",
                                          "erotic.txt",
                                          "fall.txt",
                                          "hands.txt",
                                          "water.txt",
                                          "wifi.txt",
                                          "young.txt"};
    totalText = 0;
    IDFTable.clear();   // start from a clean table so a second call does not mix counts with IDF values

    std::set<std::string> wds;
    for (int i = 0; i < fileCount; i++)
    {
        readFile(files[i], wds);
        for (std::set<std::string>::const_iterator it = wds.begin(); it != wds.end(); ++it)
        {
            // operator[] value-initialises a missing entry to 0 before the increment.
            IDFTable[*it] += 1;
        }
        totalText++;
    }

    for (std::map<std::string, float>::iterator iter = IDFTable.begin(); iter != IDFTable.end(); ++iter)
    {
        iter->second = std::log((float)totalText / (iter->second + 1.0));
    }
}
int GenerateTF(){
ifstream fin("Test.txt");
string word;
int textwords=0;
while (!fin.eof())
{
finword;
for (int i=0;Comment[i]!=0;i)
{
int pos;
while((pos=word.find(Comment[i]))!=-1)
{
word.replace(pos,1,"");
}
}
if(!IsAllNumber(word)!Isblank(word))
{
//wds.insert(ToLower(word));
textwords;
ToLower(word);
mapstring,int::iterator it;
if((it=WordTable.find(word))!=WordTable.end())
{
it-second;
}
else
{
WordTable[word]=1;
}
}
}
fin.close();
//计算频率
for (mapstring,int::iterator it=WordTable.begin();it!=WordTable.end();it)
{
Words wd;
wd.wd=it-first;
wd.freq=(float)(it-second)/textwords;
float idf=0;
mapstring,float::iterator iter;
if((iter=IDFTable.find(wd.wd))!=IDFTable.end())
{
idf=iter-second;
}
else
idf=log((float)totalText);
wd.weight=wd.freq*idf;
WordList.push_back(wd);
}
return textwords;
}
void GenerateSort()
{
sort(WordList.begin(),WordList.end(),cmp);
}
int main(){
GenerateIDF();
int txtwd=GenerateTF();
GenerateSort();
int topnum=10;
cout"Total Words: "txtwd" Top "topnum":\n";
cout"Wrod\t\tWeight\n";
for (int i=0;itopnum;i)
{
coutWordList[i].wd"\t\t"WordList[i].weightendl;
}
}
java 如何统计txt文本中的总词数不是总字数呀 TF–IDF 公式中需要用到词频(TF)=某个词在文章中出现的次数
词频(TF)=某个词在文章中出现的次数/文章的总词数
或者：
词频(TF)=某个词在文章中出现的次数/该文出现次数最多的词的出现次数
逆文档率：
逆文档率(IDF)=log(语料库的文档总数/(包含该词的文档数+1))
TF-IDF：
TF-IDF=词频(TF)*逆文档率(IDF)
TF-IDF与一个词在文档中的出现次数成正比，与该词在整个语言中的出现次数成反比。
【tf-idf代码java tfjs】tf-idf代码java的介绍就聊到这里吧，感谢你花时间阅读本站内容，更多关于tfjs、tf-idf代码java的信息别忘了在本站进行查找喔。

tf-idf代码java tfjs

推荐阅读

白色念珠菌性阴炎怎么引起的白色念珠菌阴道炎原因是什么

时间频域分析,频域分析法的概念以及应用优势体现在哪几个方面

操作系统|操作系统学习笔记（操作系统基础知识）

白茶在冰箱里存放多久

张一山评价张一山父母

什么是功率回路

酸奶盒子属于什么垃圾类酸奶盒子属于什么垃圾

青蟹吃多了有什么害处

干干净净是什么意思干干净净的含义

长期吃抗精神病药物会损伤智力吗？

毛笔为什么会开叉？毛笔应该如何清洗？

甘肃医保服务平台入口网址甘肃医保服务平台入口

精灵宝可梦暖暖猪进化精灵宝可梦水君的技能

家用储物柜价格表家用储物柜价格

mysql 存储过程异常 mysql的存错过程

婴儿缺钙宝宝缺钙有哪些症状

梦见老师是什么预兆梦到老师的意思

什么是互联网成瘾综合症

idea编译整个项目，MyEclipse 怎样手动编译整个项目

附项目 2023年南昌西湖区妇幼保健院春季入园体检流程