SpringBoot整合并使用Java实现“孤立森林”异常数据过滤算法 SpringBoot

实现背景和意义

在物联网应用中，大多数数据都是由传感器采集产生的，农业物联网更是如此。并且在农业物联网中，传感器的采集环境更加极端，十分容易发生数据采集异常事件。这些异常输入随传输协议进入数据库，必然会对本系统的数据库产生污染，影响应用可靠性。所以应该对异常数据进行过滤，避免其对应用可靠性产生负面影响。
在我自己的智能水培项目中，物联网的环境数据由温度、湿度、PH值、EC值、CO2浓度、光照强度组成。按时间划分，每一条数据就是这些属性的合集，所以我们可以将环境数据集看成多维数据集。
孤立森林算法

该算法在详细讨论算法的文章里已经说明得很清楚了，这里不做赘述，只讲一下自己的理解作为总结。
孤立森林算法的实质是对“容易离群的点”的过滤。在多维数据中，利用各项属性比对大小来构建树，测试异常点时使用该值进行树的遍历，越是异常的点，它的遍历深度就越低，以此来判断该点是否离群。当然若要增加判断可靠性，就需要多创建树，并遍历所有的树，累计所有的深度，计算平均值，设定评分系统来评定该点是否是所谓的异常点。
Java实现孤立森林

项目中用到的二维数组DenseMatrix64F的依赖：

<dependency>
    <groupId>com.googlecode.efficient-java-matrix-library</groupId>
    <artifactId>ejml</artifactId>
    <version>0.25</version>
</dependency>

项目中的HistoryData为我自己构建的类，是物联网中的历史数据类，贴出代码仅供参考。

/**
 * One historical environment reading.
 *
 * <p>Every attribute is stored as a raw {@code String}; this class performs no parsing or
 * validation. It is a plain mutable bean with a no-argument constructor and a getter/setter
 * pair per field.
 */
public class HistoryData {

    private String time;
    private String CO2;
    private String EC;
    private String LED;
    private String LIGHT;
    private String PH;
    private String RH;
    private String TEMP;
    private String days;

    /** Creates an instance with every field left {@code null}. */
    public HistoryData() {
    }

    public String getTime() {
        return time;
    }

    public void setTime(String time) {
        this.time = time;
    }

    public String getCO2() {
        return CO2;
    }

    public void setCO2(String CO2) {
        this.CO2 = CO2;
    }

    public String getEC() {
        return EC;
    }

    public void setEC(String EC) {
        this.EC = EC;
    }

    public String getLED() {
        return LED;
    }

    public void setLED(String LED) {
        this.LED = LED;
    }

    public String getLIGHT() {
        return LIGHT;
    }

    public void setLIGHT(String LIGHT) {
        this.LIGHT = LIGHT;
    }

    public String getPH() {
        return PH;
    }

    public void setPH(String PH) {
        this.PH = PH;
    }

    public String getRH() {
        return RH;
    }

    public void setRH(String RH) {
        this.RH = RH;
    }

    public String getTEMP() {
        return TEMP;
    }

    public void setTEMP(String TEMP) {
        this.TEMP = TEMP;
    }

    public String getDays() {
        return days;
    }

    public void setDays(String days) {
        this.days = days;
    }

    /**
     * Renders all fields as {@code HistoryData{name='value', ...}}; a {@code null} field is
     * printed as the text {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("HistoryData{");
        text.append("time='").append(time).append('\'');
        text.append(", CO2='").append(CO2).append('\'');
        text.append(", EC='").append(EC).append('\'');
        text.append(", LED='").append(LED).append('\'');
        text.append(", LIGHT='").append(LIGHT).append('\'');
        text.append(", PH='").append(PH).append('\'');
        text.append(", RH='").append(RH).append('\'');
        text.append(", TEMP='").append(TEMP).append('\'');
        text.append(", days='").append(days).append('\'');
        text.append('}');
        return text.toString();
    }
}

IsoForest类 —用来加载数据并构建整个“森林”即IForest

import com.example.smartf.model.HistoryData; import com.example.smartf.tool.influxdb.InfluxdbDao; import org.ejml.data.DenseMatrix64F; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Component; import javax.annotation.PostConstruct; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Random; @Component public class IsoForest {@Autowired private InfluxdbDao influxdbDao; public static IForest iForest; public static IForest getiForest() { return iForest; }public static void setiForest(IForest iForest) { IsoForest.iForest = iForest; } //本项目内容：historyData为环境数据类集合，将它构建成二维数组DenseMatrix64F public DenseMatrix64F loadData(List historyData) throws IOException {DenseMatrix64F data = https://www.it610.com/article/new DenseMatrix64F(historyData.size(),7); for (int i = 0; i < historyData.size(); i++){ data.set(i,0,Double.parseDouble(historyData.get(i).getCO2())); data.set(i,1,Double.parseDouble(historyData.get(i).getEC())); data.set(i,2,Double.parseDouble(historyData.get(i).getLED())); data.set(i,3,Double.parseDouble(historyData.get(i).getLIGHT())); data.set(i,4,Double.parseDouble(historyData.get(i).getPH())); data.set(i,5,Double.parseDouble(historyData.get(i).getRH())); data.set(i,6,Double.parseDouble(historyData.get(i).getTEMP())); }return data; } //按照最大样本数来切割数据集 public DenseMatrix64F getSubSample(DenseMatrix64F dataSet,int subSampleCount){ int features = dataSet.numCols; DenseMatrix64F subSample = new DenseMatrix64F(subSampleCount,features); for (int i = 0; i < subSampleCount; i++){ for (int j = 0; j < features; j++){ subSample.set(i,j,dataSet.get(i,j)); } }return subSample; }public IForest train(List historyData) throws IOException { DenseMatrix64F dataSet = loadData(historyData); int rows = dataSet.numRows; //树的数量 int numTrees = 50; //计算列数 int numFeatures = dataSet.numCols; //设置样本的最大数量，即构建树的最大样本数 int maxSamples = 65536; //样本数量 int subSampleSize = Math.min(65536,rows); 
//log2（n）为树的深度 int maxLength = (int) Math.ceil(bottomChanging(subSampleSize,2)); //list即森林 List iTrees = new ArrayList(); for (int i = 0; i < numTrees; i++){ //按照规定样本数量，取出样本 DenseMatrix64F subSample = getSubSample(dataSet, subSampleSize); //使用样本生成一个树 ITree iTree = growTree(subSample, maxLength, numFeatures, 0); //森林中加入这颗刚刚生成的树 iTrees.add(iTree); }return new IForest(iTrees,maxSamples); }public ITree growTree(DenseMatrix64F data,int maxLength,int numFeatures,int currentLength){ //如果当前长度大于标准树深度，或样本数据量为1时则停止构造树，返回叶子节点 if (currentLength >= maxLength || data.numRows <= 1){ return new ITreeLeaf(data.numRows); }Random random = new Random(); //随机选择一个属性 int feature = random.nextInt(numFeatures); int rows = data.numRows; //随机选择一条数据 int randomRow = random.nextInt(rows); double splitPoint = data.get(randomRow,feature); List rightList = new ArrayList(); List leftList = new ArrayList(); for(int i = 0; i < rows; i++){ //利用该属性，比较大小，使用list记录左子树、右子树 if(data.get(i,feature) >= splitPoint){ rightList.add(i); } else { leftList.add(i); } }DenseMatrix64F left = new DenseMatrix64F(leftList.size(), numFeatures); DenseMatrix64F right = new DenseMatrix64F(rightList.size(), numFeatures); //构建左子树 for (int i = 0; i < leftList.size(); i++){ for(int j = 0; j < numFeatures; j++){ left.set(i,j,data.get(i,j)); } } //构建右子树 for (int i = 0; i < rightList.size(); i++){ for(int j = 0; j < numFeatures; j++){ right.set(i,j,data.get(i,j)); } } //设置左子树，右子树，分割属性的大小和分割属性的下标 return new ITreeBranch(growTree(left,maxLength,numFeatures,currentLength+1), growTree(right,maxLength,numFeatures,currentLength+1), splitPoint,feature); } //用于计算所需构建树的深度 public double bottomChanging(int x,int bottom){ double log = Math.log10(x) / Math.log10(bottom); return log; }public void changeIsoForest (String starttime, String endtime , String deviceID)throws IOException{ List historyData = https://www.it610.com/article/influxdbDao.findByTime(starttime,endtime,deviceID); System.out.println(historyData.size()); iForest = 
this.train(historyData); } //虽然@Component注解将这个类作为Spring的Bean，但是并不会调用构造方法来生成Bean //所以需要加上@PostConstruct来让Spring执行构造方法来构建森林 @PostConstruct public void IsoForest() throws IOException { //记录构件树的开始时间，计算运行时间 long start = System.currentTimeMillis(); //使用influxDB操作类来获取环境数据 List historyData = influxdbDao.findByTime("2019-10-29 16:21:00","2019-12-07 05:35:00","6af6188e14aa"); System.out.println(historyData.size()); IsoForest isoForest = new IsoForest(); iForest = isoForest.train(historyData); long elapse = System.currentTimeMillis() - start; System.out.println("花费时间" + elapse / 1000.0 + "s"); } }

IForest类 —描述森林的类，完成评估异常点的功能

import com.example.smartf.model.HistoryData; import org.ejml.data.DenseMatrix64F; import java.io.IOException; import java.util.ArrayList; import java.util.List; public class IForest{ //树的列表 List iTrees; //样本的数量，即一棵树的节点数 int maxSamples; public IForest(List iTrees, int maxSamples) { this.iTrees = iTrees; this.maxSamples = maxSamples; } //判断一条数据是否异常，x为行数为1的二维数组，本质上是一条数据 public Integer predict(DenseMatrix64F x){ if(iTrees.size() == 0 || iTrees == null){ throw new IllegalArgumentException("请训练后再预测"); }double sum = 0; for(int i = 0; i < iTrees.size(); i++){ sum += pathLengh(x,iTrees.get(i),0); } //计算得分参考算法计算公式，分子为当前节点遍历的平均长度，分母为遍历森林中节点的平均长度 double exponent = -(sum/iTrees.size())/cost(maxSamples); double score = Math.pow(2,exponent); //设置得分阈值，大于0.51分则标记为异常点 if(score > 0.51){ return -1; } else { return 1; } } //使用这一条数据遍历树，得到深度 public double pathLengh(DenseMatrix64F x, ITree tree, int path_length){ String simpleName = tree.getClass().getSimpleName(); if(simpleName.equals("ITreeLeaf")){ ITreeLeaf leaf = (ITreeLeaf) tree; int size = leaf.getSize(); return path_length + cost(size); }ITreeBranch iTreeBranch = (ITreeBranch)tree; int splitAttr = iTreeBranch.getSplitAttr(); double splitValue = https://www.it610.com/article/iTreeBranch.getSplitValue(); double value = x.get(0, splitAttr); if(value < splitValue){ ITree left = iTreeBranch.getLeft(); return pathLengh(x,left,path_length + 1); } else { ITree right = iTreeBranch.getRight(); return pathLengh(x,right,path_length + 1); }} public double getHi(int i){ double constantValue = 0.5772156649; return Math.log(i) + constantValue; }public double cost(int n){ double hi = getHi(n-1); if(n <= 1){ return 1.0; } double cost = 2 * hi - 2*(n-1)/n; return cost; } }

ITree —描述节点的抽象类

/**
 * Common base type for the nodes of an isolation tree. It declares no members of its own;
 * {@code ITreeBranch} and {@code ITreeLeaf} extend it.
 */
abstract class ITree {
}

ITreeBranch —描述节点中的非叶子节点，拥有左子树右子树，该节点的分割属性

class ITreeBranch extends ITree{ ITree left; ITree right; //分割属性的值 double splitValue; //分割属性的下标，在遍历的时候比较大小需要用到，遍历时也需要比较相同属性的大小 int splitAttr; public ITreeBranch(ITree left,ITree right,double splitValue,int splitAttr){ this.left = left; this.right = right; this.splitValue = https://www.it610.com/article/splitValue; this.splitAttr = splitAttr; }public ITree getLeft() { return left; }public void setLeft(ITree left) { this.left = left; }public ITree getRight() { return right; }public void setRight(ITree right) { this.right = right; }public double getSplitValue() { return splitValue; }public void setSplitValue(double splitValue) { this.splitValue = splitValue; }public int getSplitAttr() { return splitAttr; }public void setSplitAttr(int splitAttr) { this.splitAttr = splitAttr; } }

ITreeLeaf —描述节点中的叶子节点，只记录当前节点的集合数量

/**
 * Leaf node of an isolation tree; it stores only a row count.
 */
class ITreeLeaf extends ITree {

    /** Row count recorded for this leaf. */
    int size;

    public ITreeLeaf(int size) {
        this.size = size;
    }

    public int getSize() {
        return this.size;
    }

    public void setSize(int size) {
        this.size = size;
    }
}

实际效果：