rminer: An R Machine Learning Library

Preface: When doing data mining work, we often draw on algorithms that others have already packaged. R offers a great many ready-to-use algorithm packages: whenever you want to use an algorithm, you can call the corresponding package.
There is a catch, though: if you need several different algorithms, you have to call different functions from many different packages, which gets tedious.
Hence this article. rminer integrates many algorithms, and simply changing one parameter switches to the algorithm you need, sparing you the hassle of juggling packages.
Here is a brief introduction.
In rminer, models are trained mainly through the fit() function; changing its model argument trains a different model. Below are some examples, covering both model training and parameter settings.
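Before the longer examples, here is a minimal sketch of the basic workflow (assuming rminer is installed; "ACC", the accuracy metric, is just one of the many metric names accepted by mmetric):

library(rminer) # install.packages("rminer") if needed
data(iris)
# train a decision tree; swapping model="rpart" for, e.g., "ksvm" or
# "randomForest" changes the algorithm without touching the rest of the code:
M=fit(Species~.,iris,model="rpart")
P=predict(M,iris) # predictions on the training data
print(mmetric(iris$Species,P,"ACC")) # classification accuracy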

### dontrun is used when the execution of the example requires some computational effort.

### simple regression (with a formula) example.
x1=rnorm(200,100,20); x2=rnorm(200,100,20)
y=0.7*sin(x1/(25*pi))+0.3*sin(x2/(25*pi))
M=fit(y~x1+x2,model="mlpe")
new1=rnorm(100,100,20); new2=rnorm(100,100,20)
ynew=0.7*sin(new1/(25*pi))+0.3*sin(new2/(25*pi))
P=predict(M,data.frame(x1=new1,x2=new2,y=rep(NA,100)))
print(mmetric(ynew,P,"MAE"))

### simple classification example.
## Not run:
data(iris)
M=fit(Species~.,iris,model="rpart")
plot(M@object); text(M@object) # show model
P=predict(M,iris)
print(mmetric(iris$Species,P,"CONF"))
print(mmetric(iris$Species,P,"ALL"))
mgraph(iris$Species,P,graph="ROC",TC=2,main="versicolor ROC",
       baseline=TRUE,leg="Versicolor",Grid=10)

M2=fit(Species~.,iris,model="ctree")
plot(M2@object) # show model
P2=predict(M2,iris)
print(mmetric(iris$Species,P2,"CONF"))

# ctree with different setup:
# (ctree_control is from the party package)
M3=fit(Species~.,iris,model="ctree",controls=party::ctree_control(testtype="MonteCarlo"))
plot(M3@object) # show model
## End(Not run)

### simple binary classification example with cv.glmnet and xgboost
## Not run:
data(sa_ssin_2)
H=holdout(sa_ssin_2$y,ratio=2/3)
# cv.glmnet:
M=fit(y~.,sa_ssin_2[H$tr,],model="cv.glmnet",task="cla") # pure classes
P=predict(M,sa_ssin_2[H$ts,])
cat("1st prediction, class:",as.character(P[1]),"\n")
cat("Confusion matrix:\n")
print(mmetric(sa_ssin_2[H$ts,]$y,P,"CONF")$conf)

M2=fit(y~.,sa_ssin_2[H$tr,],model="cv.glmnet") # probabilities
P2=predict(M2,sa_ssin_2[H$ts,])
L=M2@levels
cat("1st prediction, prob:",L[1],"=",P2[1,1],",",L[2],"=",P2[1,2],"\n")
cat("Confusion matrix:\n")
print(mmetric(sa_ssin_2[H$ts,]$y,P2,"CONF")$conf)
cat("AUC of ROC curve:\n")
print(mmetric(sa_ssin_2[H$ts,]$y,P2,"AUC"))

M3=fit(y~.,sa_ssin_2[H$tr,],model="cv.glmnet",nfolds=3) # use 3 folds instead of 10
plot(M3@object) # show cv.glmnet object
P3=predict(M3,sa_ssin_2[H$ts,])

# xgboost:
M4=fit(y~.,sa_ssin_2[H$tr,],model="xgboost",verbose=1) # nrounds=2, show rounds:
P4=predict(M4,sa_ssin_2[H$ts,])
print(mmetric(sa_ssin_2[H$ts,]$y,P4,"AUC"))
M5=fit(y~.,sa_ssin_2[H$tr,],model="xgboost",nrounds=3,verbose=1) # nrounds=3, show rounds:
P5=predict(M5,sa_ssin_2[H$ts,])
print(mmetric(sa_ssin_2[H$ts,]$y,P5,"AUC"))
## End(Not run)

### classification example with discrete classes, probabilities and holdout
## Not run:
data(iris)
H=holdout(iris$Species,ratio=2/3)
M=fit(Species~.,iris[H$tr,],model="ksvm",task="class")
M2=fit(Species~.,iris[H$tr,],model="ksvm",task="prob")
P=predict(M,iris[H$ts,])
P2=predict(M2,iris[H$ts,])
print(mmetric(iris$Species[H$ts],P,"CONF"))
print(mmetric(iris$Species[H$ts],P2,"CONF"))
print(mmetric(iris$Species[H$ts],P,"CONF",TC=1))
print(mmetric(iris$Species[H$ts],P2,"CONF",TC=1))
print(mmetric(iris$Species[H$ts],P2,"AUC"))

### exploration of some rminer classification models:
models=c("lda","naiveBayes","kknn","randomForest","cv.glmnet","xgboost")
for(m in models)
{
  cat("model:",m,"\n")
  M=fit(Species~.,iris[H$tr,],model=m)
  P=predict(M,iris[H$ts,])
  print(mmetric(iris$Species[H$ts],P,"AUC")[[1]])
}
## End(Not run)

### classification example with hyperparameter selection
### note: for regression, similar code can be used
### SVM
## Not run:
data(iris)
# large list of SVM configurations:
# SVM with kpar="automatic" sigma rbfdot kernel estimation and default C=1:
# note: each execution can lead to different M@mpar due to sigest stochastic nature:
M=fit(Species~.,iris,model="ksvm")
print(M@mpar) # model hyperparameters/arguments
# same thing, explicit use of mparheuristic:
M=fit(Species~.,iris,model="ksvm",search=list(search=mparheuristic("ksvm")))
print(M@mpar) # model hyperparameters

# SVM with C=3, sigma=2^-7
M=fit(Species~.,iris,model="ksvm",C=3,kpar=list(sigma=2^-7))
print(M@mpar)
# SVM with different kernels:
M=fit(Species~.,iris,model="ksvm",kernel="polydot",kpar="automatic")
print(M@mpar)
# fit already has a scale argument, thus the only way to fix scale of "tanhdot"
# is to use the special search argument with the "none" method:
s=list(smethod="none",search=list(scale=2,offset=2))
M=fit(Species~.,iris,model="ksvm",kernel="tanhdot",search=s)
print(M@mpar)
# heuristic: 10 grid search values for sigma, rbfdot kernel
# (fdebug is used only for more verbose output):
s=list(search=mparheuristic("ksvm",10)) # advised "heuristic10" usage
M=fit(Species~.,iris,model="ksvm",search=s,fdebug=TRUE)
print(M@mpar)
# same thing, uses older search="heuristic10" that works for fewer rminer models
M=fit(Species~.,iris,model="ksvm",search="heuristic10",fdebug=TRUE)
print(M@mpar)
# identical search under a different and explicit code:
s=list(search=2^seq(-15,3,2))
M=fit(Species~.,iris,model="ksvm",search=s,fdebug=TRUE)
print(M@mpar)

# uniform design "UD" for sigma and C, rbfdot kernel, two levels of grid search,
# under an exponential (2^x) search scale:
M=fit(Species~.,iris,model="ksvm",search="UD",fdebug=TRUE)
print(M@mpar)
M=fit(Species~.,iris,model="ksvm",search="UD1",fdebug=TRUE)
print(M@mpar)

# now the more powerful search argument is used for modeling SVM:
# grid 3 x 3 search:
s=list(smethod="grid",search=list(sigma=2^c(-15,-5,3),C=2^c(-5,0,15)),convex=0,
       metric="AUC",method=c("kfold",3,12345))
print(s)
M=fit(Species~.,iris,model="ksvm",search=s,fdebug=TRUE)
print(M@mpar)
# identical search with different argument smethod="matrix"
s$smethod="matrix"
s$search=list(sigma=rep(2^c(-15,-5,3),times=3),C=rep(2^c(-5,0,15),each=3))
print(s)
M=fit(Species~.,iris,model="ksvm",search=s,fdebug=TRUE)
print(M@mpar)
# search for best kernel (only works for kpar="automatic"):
s=list(smethod="grid",search=list(kernel=c("rbfdot","laplacedot","polydot","vanilladot")),
       convex=0,metric="AUC",method=c("kfold",3,12345))
print(s)
M=fit(Species~.,iris,model="ksvm",search=s,fdebug=TRUE)
print(M@mpar)
# search for best parameters of "rbfdot" or "laplacedot" (which use same kpar):
s$search=list(kernel=c("rbfdot","laplacedot"),sigma=2^seq(-15,3,5))
print(s)
M=fit(Species~.,iris,model="ksvm",search=s,fdebug=TRUE)
print(M@mpar)

### randomForest
# search for mtry and ntree
s=list(smethod="grid",search=list(mtry=c(1,2,3),ntree=c(100,200,500)),
       convex=0,metric="AUC",method=c("kfold",3,12345))
print(s)
M=fit(Species~.,iris,model="randomForest",search=s,fdebug=TRUE)
print(M@mpar)

### rpart
# simpler way to tune cp in 0.01 to 0.9 (10 searches):
s=list(search=mparheuristic("rpart",n=10,lower=0.01,upper=0.9),method=c("kfold",3,12345))
M=fit(Species~.,iris,model="rpart",search=s,fdebug=TRUE)
print(M@mpar)

# same thing but with more lines of code
# note: this code can be adapted to tune other rpart parameters,
# while mparheuristic only tunes cp
# a vector list needs to be used for the search$search parameter
lcp=vector("list",10) # 10 grid values for the complexity cp
names(lcp)=rep("cp",10) # same cp name
scp=seq(0.01,0.9,length.out=10) # 10 values from 0.01 to 0.9
for(i in 1:10) lcp[[i]]=scp[i] # cycle needed due to [[]] notation
s=list(smethod="grid",search=list(control=lcp),
       convex=0,metric="AUC",method=c("kfold",3,12345))
M=fit(Species~.,iris,model="rpart",search=s,fdebug=TRUE)
print(M@mpar)

### ctree
# simpler way to tune mincriterion in 0.1 to 0.99:
mint=c("kfold",3,123) # internal validation method
s=list(search=mparheuristic("ctree",n=8,lower=0.1,upper=0.99),method=mint)
M=fit(Species~.,iris,model="ctree",search=s,fdebug=TRUE)
print(M@mpar)

# same thing but with more lines of code
# note: this code can be adapted to tune other ctree parameters,
# while mparheuristic only tunes mincriterion
# a vector list needs to be used for the search$search parameter
lmc=vector("list",9) # 9 grid values for the mincriterion
smc=seq(0.1,0.99,length.out=9)
for(i in 1:9) lmc[[i]]=party::ctree_control(mincriterion=smc[i])
s=list(smethod="grid",search=list(controls=lmc),method=mint,convex=0)
M=fit(Species~.,iris,model="ctree",search=s,fdebug=TRUE)
print(M@mpar)

### some MLP fitting examples:
# simplest use:
M=fit(Species~.,iris,model="mlpe")
print(M@mpar)
# same thing, with explicit use of mparheuristic:
M=fit(Species~.,iris,model="mlpe",search=list(search=mparheuristic("mlpe")))
print(M@mpar) # hidden nodes and number of ensemble mlps
# setting some nnet parameters:
M=fit(Species~.,iris,model="mlpe",size=3,decay=0.1,maxit=100,rang=0.9)
print(M@mpar) # mlpe hyperparameters
# MLP, 5 grid searches (fdebug is only used to put some verbose in the console):
s=list(search=mparheuristic("mlpe",n=5)) # 5 searches for size
print(s) # show search
M=fit(Species~.,iris,model="mlpe",search=s,fdebug=TRUE)
print(M@mpar)
# previous searches used a random holdout (seed=NULL), now a fixed seed (123) is used:
s=list(smethod="grid",search=mparheuristic("mlpe",n=5),convex=0,metric="AUC",
       method=c("holdout",2/3,123))
print(s)
M=fit(Species~.,iris,model="mlpe",search=s,fdebug=TRUE)
print(M@mpar)
# faster and greedy grid search:
s$convex=1; s$search=list(size=0:9)
print(s)
M=fit(Species~.,iris,model="mlpe",search=s,fdebug=TRUE)
print(M@mpar)
# 2 level grid with total of 5 searches
# note of caution: some "2L" ranges may lead to non integer (e.g. 1.3) values at
# the 2nd level search, and some R functions crash if non integer values are used for
# integer parameters.
s$smethod="2L"; s$convex=0; s$search=list(size=c(4,8,12))
print(s)
M=fit(Species~.,iris,model="mlpe",search=s,fdebug=TRUE)
print(M@mpar)
## End(Not run)

### example of an error (warning) generated using fit:
## Not run:
data(iris)
# size needs to be a positive integer, thus 0.1 leads to an error:
M=fit(Species~.,iris,model="mlp",size=0.1)
print(M@object)
## End(Not run)

### exploration of some rminer regression models:
## Not run:
data(sa_ssin)
H=holdout(sa_ssin$y,ratio=2/3,seed=12345)
models=c("lm","mr","ctree","mars","cubist","cv.glmnet","xgboost","rvm")
for(m in models)
{
  cat("model:",m,"\n")
  M=fit(y~.,sa_ssin[H$tr,],model=m)
  P=predict(M,sa_ssin[H$ts,])
  print(mmetric(sa_ssin$y[H$ts],P,"MAE"))
}
## End(Not run)

### regression example with hyperparameter selection:
## Not run:
data(sa_ssin)
# some SVM experiments:
# default SVM:
M=fit(y~.,data=sa_ssin,model="svm")
print(M@mpar)
# SVM with (Cherkassky and Ma, 2004) heuristics to set C and epsilon:
M=fit(y~.,data=sa_ssin,model="svm",C=NA,epsilon=NA)
print(M@mpar)
# SVM with Uniform Design set sigma, C and epsilon:
M=fit(y~.,data=sa_ssin,model="ksvm",search="UD",fdebug=TRUE)
print(M@mpar)

# sensitivity analysis feature selection:
M=fit(y~.,data=sa_ssin,model="ksvm",search=list(search=mparheuristic("ksvm",n=5)),feature="sabs")
print(M@mpar)
print(M@attributes) # selected attributes (1, 2 and 3 are the relevant inputs)

# example that shows how transform works:
M=fit(y~.,data=sa_ssin,model="mr") # linear regression
P=predict(M,data.frame(x1=-1000,x2=0,x3=0,x4=0,y=NA)) # P should be negative
print(P)
M=fit(y~.,data=sa_ssin,model="mr",transform="positive")
P=predict(M,data.frame(x1=-1000,x2=0,x3=0,x4=0,y=NA)) # P is not negative
print(P)
## End(Not run)

### pure classification example with a generic R model ###
## Not run:
### nnet is adopted here but virtually ANY fitting function/package could be used:
# since the default nnet prediction is to provide probabilities, there is
# a need to create this "wrapping" function:
predictprob=function(object,newdata)
{ predict(object,newdata,type="class") }
# list with a fit and predict function:
# nnet::nnet (package::function)
model=list(fit=nnet::nnet,predict=predictprob,name="nnet")
data(iris)
# note that size is not a fit parameter and it is sent directly to nnet:
M=fit(Species~.,iris,model=model,size=3,task="class")
P=predict(M,iris)
print(P)
## End(Not run)

After reading this article, you should be able to train all kinds of models with rminer.
