weka[10] - SimpleKmeans

classification的算法还有一些,不过还是打算先进入clustering的阶段。后续再回去补。
这一篇主要看看kmeans。kmeans是最简单的一种聚类算法,很清晰的EM思路。他的主要缺陷是聚类个数无法确定(靠人为设定),受初始中心点影响较大。
下面直接看看代码:国际惯例,看看buildCluster

// Fragment of SimpleKMeans.buildClusterer(Instances): setup phase.
// Validates the input, resets iteration state, optionally imputes missing
// values, and allocates the per-attribute statistics arrays.
getCapabilities().testWithFail(data); // reject data this clusterer cannot handle
m_Iterations = 0;
m_ReplaceMissingFilter = new ReplaceMissingValues();
Instances instances = new Instances(data); // work on a copy of the input
instances.setClassIndex(-1); // clustering is unsupervised: ignore any class attribute
if (!m_dontReplaceMissing) {
  // replace missing values (by attribute means/modes, per the filter's contract)
  m_ReplaceMissingFilter.setInputFormat(instances);
  instances = Filter.useFilter(instances, m_ReplaceMissingFilter);
}
m_FullMissingCounts = new int[instances.numAttributes()]; // missing-value count per attribute
if (m_displayStdDevs) {
  m_FullStdDevs = new double[instances.numAttributes()]; // std dev per numeric attribute
}
m_FullNominalCounts = new int[instances.numAttributes()][0]; // value counts per nominal attribute

m_FullMissingCounts就是统计每个属性的缺失值
// Fragment of buildClusterer: whole-data-set statistics.
// moveCentroid(0, ..., false) computes the mean/median/mode of every
// attribute over the full data (without registering a real centroid).
m_FullMeansOrMediansOrModes = moveCentroid(0, instances, false);
for (int i = 0; i < instances.numAttributes(); i++) {
  m_FullMissingCounts[i] = instances.attributeStats(i).missingCount;
  if (instances.attribute(i).isNumeric()) {
    if (m_displayStdDevs) {
      m_FullStdDevs[i] = Math.sqrt(instances.variance(i));
    }
    if (m_FullMissingCounts[i] == instances.numInstances()) {
      // every value is missing for this attribute
      m_FullMeansOrMediansOrModes[i] = Double.NaN; // mark missing as mean
    }
  } else {
    m_FullNominalCounts[i] = instances.attributeStats(i).nominalCounts;
    if (m_FullMissingCounts[i] > m_FullNominalCounts[i][Utils.maxIndex(m_FullNominalCounts[i])]) {
      // "missing" occurs more often than the most frequent real value
      m_FullMeansOrMediansOrModes[i] = -1; // mark missing as most common value
    }
  }
}

moveCentroid就是更新每个簇的中心点(后面详细看)。下面在做的事情,就是计算每个特征的mean或者mode(出现次数最多的那个值)。它考虑了:如果属性是连续的,那么顺便计算标准差;如果属性是离散的并且missing出现得最多,那么mode就标记为-1。
// Fragment of buildClusterer: bookkeeping before centroid initialization.
m_ClusterCentroids = new Instances(instances, m_NumClusters); // holder for the centroid instances
int[] clusterAssignments = new int[instances.numInstances()]; // cluster index assigned to each instance
if (m_PreserveOrder)
  m_Assignments = clusterAssignments; // expose assignments in original instance order
m_DistanceFunction.setInstances(instances);
Random RandomO = new Random(getSeed()); // seeded RNG -> reproducible initial centroids
int instIndex;
HashMap initC = new HashMap(); // set of already-chosen centroids (duplicate detection)
DecisionTableHashKey hk = null;
Instances initInstances = null;
// When preserving order, the selection loop below must not shuffle the
// working data, so centroids are drawn from a copy instead.
if (m_PreserveOrder)
  initInstances = new Instances(instances);
else
  initInstances = instances;

clusterAssignments就是每个样本的簇号。下面的RandomO就是为了初始化中心点。
// Fragment of buildClusterer: random initial-centroid selection.
// A partial back-to-front shuffle: each pass picks a uniformly random
// instance from the not-yet-visited prefix and swaps it to the back.
for (int j = initInstances.numInstances() - 1; j >= 0; j--) {
  instIndex = RandomO.nextInt(j + 1); // random index in [0, j]
  hk = new DecisionTableHashKey(initInstances.instance(instIndex),
                                initInstances.numAttributes(), true);
  if (!initC.containsKey(hk)) { // skip duplicates of an already-chosen centroid
    m_ClusterCentroids.add(initInstances.instance(instIndex));
    initC.put(hk, null);
  }
  initInstances.swap(j, instIndex); // remove the pick from the candidate range
  if (m_ClusterCentroids.numInstances() == m_NumClusters) {
    break; // enough centroids chosen
  }
}

可以看到,这里每次随机得到一个instIndex,然后把对应的样本作为一个中心点,直到中心点个数等于m_NumClusters。这里还用到了一个HashMap,来判断随机得到的中心点之前是否已经被选择过。到目前为止,只是为迭代做一些准备工作。


// Fragment of buildClusterer: the main k-means loop (EM-style).
// May end with fewer clusters than requested if duplicate centroids were
// skipped during initialization or clusters become empty while iterating.
m_NumClusters = m_ClusterCentroids.numInstances();
// removing reference
initInstances = null;
int i;
boolean converged = false;
int emptyClusterCount;
Instances[] tempI = new Instances[m_NumClusters]; // member instances of each cluster
m_squaredErrors = new double[m_NumClusters];
m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0];
m_ClusterMissingCounts = new int[m_NumClusters][instances.numAttributes()];
while (!converged) {
  emptyClusterCount = 0;
  m_Iterations++;
  converged = true;
  // Assignment step: put each instance in its nearest centroid's cluster.
  // Any assignment change means we have not converged yet.
  for (i = 0; i < instances.numInstances(); i++) {
    Instance toCluster = instances.instance(i);
    int newC = clusterProcessedInstance(toCluster, true);
    if (newC != clusterAssignments[i]) {
      converged = false;
    }
    clusterAssignments[i] = newC;
  }
  // update centroids: rebuild the centroid set from the new memberships
  m_ClusterCentroids = new Instances(instances, m_NumClusters);
  for (i = 0; i < m_NumClusters; i++) {
    tempI[i] = new Instances(instances, 0);
  }
  for (i = 0; i < instances.numInstances(); i++) {
    tempI[clusterAssignments[i]].add(instances.instance(i));
  }
  for (i = 0; i < m_NumClusters; i++) {
    if (tempI[i].numInstances() == 0) {
      // empty cluster: no centroid to recompute
      emptyClusterCount++;
    } else {
      moveCentroid(i, tempI[i], true);
    }
  }
  if (m_Iterations == m_MaxIterations)
    converged = true; // hard stop at the iteration cap
  if (emptyClusterCount > 0) {
    // drop empty clusters; the effective number of clusters shrinks
    m_NumClusters -= emptyClusterCount;
    if (converged) {
      // compact tempI so only the non-empty clusters remain
      Instances[] t = new Instances[m_NumClusters];
      int index = 0;
      for (int k = 0; k < tempI.length; k++) {
        if (tempI[k].numInstances() > 0) {
          t[index++] = tempI[k];
        }
      }
      tempI = t;
    } else {
      tempI = new Instances[m_NumClusters];
    }
  }
  if (!converged) {
    // reset per-cluster statistics for the next pass
    m_squaredErrors = new double[m_NumClusters];
    m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0];
  }
}

这个while就是开始循环啦!clusterProcessedInstance就是给样本找簇号,把这个点分到最近中心点那个簇里面去。收敛的标准是,clusterAssignments没有改变。
tempI存放的是每个cluster的数据,用在后面moveCentroid更新中心点集。如果出现空簇,那么emptyClusterCount加1,并且在本轮循环中从m_NumClusters中被减去,完成更新。
最后看看moveCentroid

/**
 * Recomputes the centroid of one cluster from its member instances.
 *
 * Per the comments in the loop below: with Euclidean distance the centroid
 * coordinate is the attribute mean, with Manhattan distance it is the
 * median, and for nominal attributes it is always the mode.
 *
 * @param centroidIndex     index of the cluster being updated
 * @param members           the instances belonging to this cluster
 * @param updateClusterInfo if true, refresh per-cluster missing/nominal
 *                          counts and append the centroid to m_ClusterCentroids
 * @return the centroid's attribute values
 */
protected double[] moveCentroid(int centroidIndex, Instances members, boolean updateClusterInfo) {
  double[] vals = new double[members.numAttributes()];
  // used only for Manhattan Distance
  Instances sortedMembers = null;
  int middle = 0;
  boolean dataIsEven = false;
  if (m_DistanceFunction instanceof ManhattanDistance) {
    middle = (members.numInstances() - 1) / 2; // index of the (lower) median
    dataIsEven = ((members.numInstances() % 2) == 0);
    if (m_PreserveOrder) {
      sortedMembers = members; // NOTE(review): uses members directly here — confirm order safety
    } else {
      sortedMembers = new Instances(members);
    }
  }
  for (int j = 0; j < members.numAttributes(); j++) {
    // in case of Euclidian distance the centroid is the mean point
    // in case of Manhattan distance the centroid is the median point
    // in both cases, if the attribute is nominal, the centroid is the mode
    if (m_DistanceFunction instanceof EuclideanDistance || members.attribute(j).isNominal()) {
      vals[j] = members.meanOrMode(j);
    } else if (m_DistanceFunction instanceof ManhattanDistance) {
      // singleton special case
      if (members.numInstances() == 1) {
        vals[j] = members.instance(0).value(j);
      } else {
        vals[j] = sortedMembers.kthSmallestValue(j, middle + 1);
        if (dataIsEven) {
          // even member count: median = average of the two middle values
          vals[j] = (vals[j] + sortedMembers.kthSmallestValue(j, middle + 2)) / 2;
        }
      }
    }
    if (updateClusterInfo) {
      m_ClusterMissingCounts[centroidIndex][j] = members.attributeStats(j).missingCount;
      m_ClusterNominalCounts[centroidIndex][j] = members.attributeStats(j).nominalCounts;
      if (members.attribute(j).isNominal()) {
        // missing occurs more often than the most frequent nominal value
        if (m_ClusterMissingCounts[centroidIndex][j] > m_ClusterNominalCounts[centroidIndex][j][Utils.maxIndex(m_ClusterNominalCounts[centroidIndex][j])]) {
          vals[j] = Instance.missingValue(); // mark mode as missing
        }
      } else {
        // all member values missing for this numeric attribute
        if (m_ClusterMissingCounts[centroidIndex][j] == members.numInstances()) {
          vals[j] = Instance.missingValue(); // mark mean as missing
        }
      }
    }
  }
  if (updateClusterInfo)
    m_ClusterCentroids.add(new Instance(1.0, vals));
  return vals;
}

这个很好懂,就是给定一个簇,如何更新它的中心点。具体如何更新,看这三行注释:
//in case of Euclidian distance the centroid is the mean point //in case of Manhattan distance the centroid is the median point //in both cases, if the attribute is nominal, the centroid is the mode

SimpleKmeans就到这里了!!


    推荐阅读