Rhadoop
RHADOOP - TF-IDF (17/11/28 Lecture Note)
딥스탯
2017. 11. 28. 23:30
Files and reference materials: http://stat.knu.ac.kr/
http://stat.knu.ac.kr/pg/bbs/board.php?bo_table=ja02&part=1
11_28_Lecture_Note
library(tm)
## Loading required package: NLP
library(KoNLP)
## Checking user defined dictionary!
library(proxy)
##
## Attaching package: 'proxy'
## The following objects are masked from 'package:stats':
##
## as.dist, dist
## The following object is masked from 'package:base':
##
## as.matrix
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(rhdfs)
## Loading required package: rJava
##
## HADOOP_CMD=/home/stat/hadoop/hadoop-2.7.4/bin/hadoop
##
## Be sure to run hdfs.init()
hdfs.init()
library(rmr2)
## Warning: S3 methods 'gorder.default', 'gorder.factor', 'gorder.data.frame',
## 'gorder.matrix', 'gorder.raw' were declared in NAMESPACE but not found
## Please review your hadoop settings. See help(hadoop.settings)
“R8_1.R”
doc1 <- "The fox chases the rabbit"
doc2 <- "The rabbit ate the cabbage"
doc3 <- "The fox caught the rabbit"
doc.list <- list(doc1, doc2, doc3)
n.docs <- length(doc.list)
names(doc.list) <- paste("doc", c(1:n.docs), sep="")
my.corpus <- Corpus(VectorSource(doc.list))
my.corpus <- tm_map(my.corpus, tolower)
TDM <- TermDocumentMatrix(my.corpus, control=list( weighting=weightTf))
(m <- as.matrix(TDM))
## Docs
## Terms 1 2 3
## chases 1 0 0
## fox 1 0 1
## rabbit 1 1 1
## the 2 2 2
## ate 0 1 0
## cabbage 0 1 0
## caught 0 0 1
TDM <- TermDocumentMatrix(my.corpus, control=list( weighting=weightTfIdf))
(m <- as.matrix(TDM))
## Docs
## Terms 1 2 3
## chases 0.3169925 0.0000000 0.0000000
## fox 0.1169925 0.0000000 0.1169925
## rabbit 0.0000000 0.0000000 0.0000000
## the 0.0000000 0.0000000 0.0000000
## ate 0.0000000 0.3169925 0.0000000
## cabbage 0.0000000 0.3169925 0.0000000
## caught 0.0000000 0.0000000 0.3169925
“R8_1.R” ver_HADOOP
IDEA
Run a word count on each document, then combine the per-document counts and compute TF-IDF for each word.
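The weighting is the same scheme that produced the weightTfIdf matrix above: the term frequency normalised by document length, times log2(number of documents / number of documents containing the word). A minimal sketch checking two values from that matrix (the helper name tfidf is illustrative only):
# tfidf(w, d) = (count of w in d / tokens in d) * log2(n.docs / docs containing w)
tfidf <- function(count, doc_len, n_docs, doc_freq){
    (count / doc_len) * log2(n_docs / doc_freq)
}
tfidf(1, 5, 3, 2)   # "fox" in doc1: 1 of 5 tokens, present in 2 of 3 docs -> 0.1169925
tfidf(1, 5, 3, 1)   # "chases" in doc1: present only in doc1               -> 0.3169925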
#library(tm)
#library(rhdfs)
#hdfs.init()
#library(rmr2)
doc1 <- "The fox chases the rabbit"
doc2 <- "The rabbit ate the cabbage"
doc3 <- "The fox caught the rabbit"
doc.vec <- c(doc1, doc2, doc3)
WORDCOUNT MAPPER & REDUCER
WDMap <- function(k, v){
    # emit one key per input document; the value is the document split into tokens
    keyval(seq_along(v), strsplit(v, split = " "))
}
WDReduce <- function(k, v){
    # count the lower-cased tokens of document k and return them as a
    # two-column data frame: word and its frequency in doc<k>
    df_doc <- unlist(v) %>%
        tolower() %>%
        table() %>%
        as.data.frame()
    names(df_doc) <- c("word", paste("doc", k, sep = ""))
    keyval(k, list(df_doc))
}
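As a quick sanity check, the body of WDReduce can be run on a single document without Hadoop (a sketch; it only needs dplyr, which is already loaded):
unlist(strsplit(doc1, split = " ")) %>%   # the same split WDMap performs
    tolower() %>%
    table() %>%
    as.data.frame()                       # chases 1, fox 1, rabbit 1, the 2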
TF MAPPER
TfMap <- function(k, v){
    # merge the per-document count tables into one data frame
    # (full outer join on "word"); words absent from a document get count 0
    temp_df <- NULL
    for(i in 1:length(v)){
        if(is.null(temp_df)){
            temp_df <- v[[i]]
        }else{
            temp_df <- merge(temp_df, v[[i]], by = "word", all = TRUE)
        }
    }
    temp_df[is.na(temp_df)] <- 0
    keyval(temp_df$word, as.matrix(temp_df[,-1]))
}
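The merge step is plain base R; a minimal sketch on two toy count tables (the names a and b are illustrative):
a <- data.frame(word = c("fox", "the"),    doc1 = c(1, 2))
b <- data.frame(word = c("rabbit", "the"), doc2 = c(1, 2))
m <- merge(a, b, by = "word", all = TRUE)  # full outer join on "word"
m[is.na(m)] <- 0                           # a word absent from a document gets 0
m                                          # fox 1 0 / rabbit 0 1 / the 2 2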
TF-IDF MAPPER & REDUCER
TfIdfMap <- function(k, v){
    # like TfMap, but first normalise each document's counts to term
    # frequencies (count / total number of tokens in that document)
    temp_df <- NULL
    for(i in 1:length(v)){
        if(is.null(temp_df)){
            temp_df0 <- v[[i]]
            temp_df0[,2] <- temp_df0[,2]/sum(temp_df0[,2])
            temp_df <- temp_df0
        }else{
            temp_df0 <- v[[i]]
            temp_df0[,2] <- temp_df0[,2]/sum(temp_df0[,2])
            temp_df <- merge(temp_df, temp_df0, by = "word", all = TRUE)
        }
    }
    temp_df[is.na(temp_df)] <- 0
    keyval(temp_df$word, as.matrix(temp_df[,-1]))
}
TfIdfReduce <- function(k, v){
    # multiply word k's tf row by its IDF:
    # log2(number of documents / number of documents containing k)
    keyval(k, v * log2(length(v)/sum(v != 0)))
}
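The reducer can also be called by hand to see the IDF factor at work (a sketch; keyval() comes from rmr2, which is already loaded):
# "fox" has normalised tf 1/5 in doc1 and doc3 and 0 in doc2,
# so its row is multiplied by log2(3/2)
TfIdfReduce("fox", matrix(c(0.2, 0, 0.2), nrow = 1,
                          dimnames = list(NULL, c("doc1", "doc2", "doc3"))))
# expected values: 0.1169925, 0, 0.1169925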
MAPREDUCE (TF)
hadoop.doc.vec <- to.dfs(doc.vec)
result <- mapreduce(
    input  = hadoop.doc.vec,
    map    = WDMap,
    reduce = WDReduce) %>%
    mapreduce(
        map = TfMap
    )
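If no cluster is at hand, the same pipeline can also be tried with rmr2's local backend (a sketch; the backend values are "local" and "hadoop"):
# rmr.options(backend = "local")    # run the mapreduce() calls above in local mode
# rmr.options(backend = "hadoop")   # switch back to the Hadoop backend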
RESULT (TF)
hadoop.result <- from.dfs(result)
hadoop.result
## $key
## [1] chases fox rabbit the ate cabbage caught
## Levels: ate cabbage caught chases fox rabbit the
##
## $val
## doc1 doc2 doc3
## [1,] 1 0 0
## [2,] 1 0 1
## [3,] 1 1 1
## [4,] 2 2 2
## [5,] 0 1 0
## [6,] 0 1 0
## [7,] 0 0 1
data.frame(word = hadoop.result$key, hadoop.result$val)
| word | doc1 | doc2 | doc3 |
|---|---|---|---|
| chases | 1 | 0 | 0 |
| fox | 1 | 0 | 1 |
| rabbit | 1 | 1 | 1 |
| the | 2 | 2 | 2 |
| ate | 0 | 1 | 0 |
| cabbage | 0 | 1 | 0 |
| caught | 0 | 0 | 1 |
MAPREDUCE & RESULT (TF-IDF)
hadoop.doc.vec <- to.dfs(doc.vec)
result2 <- mapreduce(
    input  = hadoop.doc.vec,
    map    = WDMap,
    reduce = WDReduce) %>%
    mapreduce(
        map    = TfIdfMap,
        reduce = TfIdfReduce
    )
(hadoop.result2 <- from.dfs(result2))
## $key
## [1] ate fox the caught chases rabbit cabbage
## Levels: ate cabbage caught chases fox rabbit the
##
## $val
## doc1 doc2 doc3
## [1,] 0.0000000 0.3169925 0.0000000
## [2,] 0.1169925 0.0000000 0.1169925
## [3,] 0.0000000 0.0000000 0.0000000
## [4,] 0.0000000 0.0000000 0.3169925
## [5,] 0.3169925 0.0000000 0.0000000
## [6,] 0.0000000 0.0000000 0.0000000
## [7,] 0.0000000 0.3169925 0.0000000
data.frame(word=hadoop.result2$key,hadoop.result2$val)
| word | doc1 | doc2 | doc3 |
|---|---|---|---|
| ate | 0.0000000 | 0.3169925 | 0.0000000 |
| fox | 0.1169925 | 0.0000000 | 0.1169925 |
| the | 0.0000000 | 0.0000000 | 0.0000000 |
| caught | 0.0000000 | 0.0000000 | 0.3169925 |
| chases | 0.3169925 | 0.0000000 | 0.0000000 |
| rabbit | 0.0000000 | 0.0000000 | 0.0000000 |
| cabbage | 0.0000000 | 0.3169925 | 0.0000000 |
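Hadoop returns the keys in no particular order, so the rows differ from the weightTfIdf matrix only in ordering. To compare directly, sort by word (a sketch):
res2 <- data.frame(word = hadoop.result2$key, hadoop.result2$val)
res2[order(res2$word), ]   # same values as the tm result, only the row order differs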
EXAMPLE
doc1 <- "Stray cats are running all over the place. I see 10 a day!"
doc2 <- "Cats are killers. They kill billions of animals a year."
doc3 <- "The best food in Columbus, OH is the North Market."
doc4 <- "Brand A is the best tasting cat food around. Your cat will love it."
doc5 <- "Buy Brand C cat food for your cat. Brand C makes healthy and happy cats."
doc6 <- "The Arnold Classic came to town this weekend. It reminds us to be healthy."
doc7 <- "I have nothing to say. In summary, I have told you nothing."
hadoop.doc.vec <- to.dfs(c(doc1,doc2,doc3,doc4,doc5,doc6,doc7))
result.example <- mapreduce(
    input  = hadoop.doc.vec,
    map    = WDMap,
    reduce = WDReduce) %>%
    mapreduce(
        map    = TfIdfMap,
        reduce = TfIdfReduce
    )
hadoop.result.example <- from.dfs(result.example)
data.frame(word = hadoop.result.example$key, hadoop.result.example$val)
| word | doc1 | doc2 | doc3 | doc4 | doc5 | doc6 | doc7 |
|---|---|---|---|---|---|---|---|
|  | 0.00000000 | 0.0000000 | 0.4678925 | 0.00000000 | 0.00000000 | 0.00000000 | 0.0000000 |
| a | 0.09403019 | 0.1222392 | 0.0000000 | 0.08731374 | 0.00000000 | 0.00000000 | 0.0000000 |
| c | 0.00000000 | 0.0000000 | 0.0000000 | 0.00000000 | 0.37431399 | 0.00000000 | 0.0000000 |
| i | 0.13902730 | 0.0000000 | 0.0000000 | 0.00000000 | 0.00000000 | 0.00000000 | 0.3012258 |
| 10 | 0.21595038 | 0.0000000 | 0.0000000 | 0.00000000 | 0.00000000 | 0.00000000 | 0.0000000 |
| be | 0.00000000 | 0.0000000 | 0.0000000 | 0.00000000 | 0.00000000 | 0.20052535 | 0.0000000 |
| in | 0.00000000 | 0.0000000 | 0.1506129 | 0.00000000 | 0.00000000 | 0.00000000 | 0.1506129 |
| is | 0.00000000 | 0.0000000 | 0.1506129 | 0.12909678 | 0.00000000 | 0.00000000 | 0.0000000 |
| it | 0.00000000 | 0.0000000 | 0.0000000 | 0.00000000 | 0.00000000 | 0.20052535 | 0.0000000 |
| of | 0.00000000 | 0.2807355 | 0.0000000 | 0.00000000 | 0.00000000 | 0.00000000 | 0.0000000 |