jiebaR - 中文分词

http://qinwenfeng.com/jiebaR/

# Minimal jiebaR example: build a segmentation worker with its default
# settings and split one Chinese sentence into words.
library(jiebaR)

wkr <- worker()
segment("今天天气好晴朗", wkr)

# Load the input table, segment every BAK_TXT entry into words with jiebaR,
# and tabulate how often each word occurs.
#
# sqldf is attached as in the original notes, although nothing in the visible
# script calls it.
library(jiebaR)
library(sqldf)

# NOTE(review): the file presumably contains Chinese text; if it is not in the
# session's native encoding, read.csv() may need `fileEncoding` -- confirm.
TA <- read.csv("R/table-A.csv", header = TRUE, sep = ",")

# Coerce the text column to character *before* taking a copy of it, so the
# copy is character even when read.csv() returns a factor (R < 4.0).
TA$BAK_TXT <- as.character(TA$BAK_TXT)
txtdf <- TA$BAK_TXT

wkr <- worker()

# Segment each text on its own and flatten the per-row results into a single
# character vector. This replaces growing `words` with c() inside a loop.
words <- unlist(lapply(txtdf, function(txt) segment(txt, wkr)))

# jiebaR's own frequency helper. The result is not used further down in this
# script; the table() below is what feeds the later steps.
freqrs <- freq(words)

# Count occurrences per distinct word and convert to a data frame with the
# columns `words` and `Freq`.
rs <- table(words)
rsdf <- as.data.frame(rs)
# Keep only the words that contain no ASCII letter and no digit.
# as.data.frame() on a table yields a factor column, so convert it first.
rsdf$words <- as.character(rsdf$words)

# check encoding
# Encoding(rsdf$words)

# Use a logical mask rather than negative row numbers: with an empty index,
# rsdf[-integer(0), ] selects *zero* rows, so the original dropped every word
# whenever no word contained a letter or digit.
has_alnum <- grepl("[a-zA-Z0-9]", rsdf$words)
chrs <- rsdf[!has_alnum, ]

# check the length of string
# nchar(chrs$words)
# For every remaining word, add one column to TA named after the word, holding
# "Y" where the row's BAK_TXT contains that word and "N" otherwise, then write
# the widened table to rs-words.csv.

# grepl() is vectorised over the text column, so no per-row loop is needed.
# fixed = TRUE matches the word literally; without it a word containing regex
# metacharacters would be interpreted as a pattern.
flag_cols <- lapply(chrs$words, function(word) {
  ifelse(grepl(word, TA$BAK_TXT, fixed = TRUE), "Y", "N")
})
names(flag_cols) <- chrs$words

# Bind all indicator columns in one step instead of once per word. The guard
# covers the case of no words at all, which the original `1:nwords` loop
# handled incorrectly by iterating over c(1, 0).
if (length(flag_cols) > 0) {
  TA <- cbind(
    TA,
    # check.names = FALSE keeps the words unchanged as column names.
    as.data.frame(flag_cols, check.names = FALSE, stringsAsFactors = FALSE)
  )
}

write.csv(TA, file = "rs-words.csv")

https://www.r-bloggers.com/r-function-of-the-day-table/

    推荐阅读