Rhadoop

RHADOOP - TF-IDF (17/11/28 Lecture Note)

딥스탯 2017. 11. 28. 23:30
11_28_Lecture_Note
library(tm)
## Loading required package: NLP
library(KoNLP)
## Checking user defined dictionary!
library(proxy)
## 
## Attaching package: 'proxy'
## The following objects are masked from 'package:stats':
## 
##     as.dist, dist
## The following object is masked from 'package:base':
## 
##     as.matrix
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(rhdfs)
## Loading required package: rJava
## 
## HADOOP_CMD=/home/stat/hadoop/hadoop-2.7.4/bin/hadoop
## 
## Be sure to run hdfs.init()
hdfs.init()
library(rmr2)
## Warning: S3 methods 'gorder.default', 'gorder.factor', 'gorder.data.frame',
## 'gorder.matrix', 'gorder.raw' were declared in NAMESPACE but not found
## Please review your hadoop settings. See help(hadoop.settings)

“R8_1.R”

# Three toy documents used throughout this note.
doc1 <- "The fox chases the rabbit"
doc2 <- "The rabbit ate the cabbage"
doc3 <- "The fox caught the rabbit"

# Collect the documents in a named list (doc1, doc2, ...) and build a corpus.
doc.list <- list(doc1, doc2, doc3)
n.docs <- length(doc.list)
names(doc.list) <- paste0("doc", seq_len(n.docs))
my.corpus <- Corpus(VectorSource(doc.list))

# Lower-case the text. Wrapping tolower() in content_transformer() keeps the
# call valid for every corpus class, not only for a SimpleCorpus.
my.corpus <- tm_map(my.corpus, content_transformer(tolower))

# Term-document matrix weighted by raw term frequency, printed as a matrix.
TDM <- TermDocumentMatrix(my.corpus, control = list(weighting = weightTf))
(m <- as.matrix(TDM))
##          Docs
## Terms     1 2 3
##   chases  1 0 0
##   fox     1 0 1
##   rabbit  1 1 1
##   the     2 2 2
##   ate     0 1 0
##   cabbage 0 1 0
##   caught  0 0 1
# Rebuild the term-document matrix from the same corpus, this time with
# TF-IDF weighting, and print it as a plain matrix.
TDM <- TermDocumentMatrix(
  my.corpus,
  control = list(weighting = weightTfIdf)
)
m <- as.matrix(TDM)
m
##          Docs
## Terms             1         2         3
##   chases  0.3169925 0.0000000 0.0000000
##   fox     0.1169925 0.0000000 0.1169925
##   rabbit  0.0000000 0.0000000 0.0000000
##   the     0.0000000 0.0000000 0.0000000
##   ate     0.0000000 0.3169925 0.0000000
##   cabbage 0.0000000 0.3169925 0.0000000
##   caught  0.0000000 0.0000000 0.3169925

“R8_1.R” ver_HADOOP

IDEA

문장별로 wordcount를 한 다음 합쳐서 단어별로 TF-IDF를 계산한다.

#library(tm)

#library(rhdfs)
#hdfs.init()
#library(rmr2)

# The same three toy documents, this time kept as a plain character vector
# (one element per document) so they can be written to HDFS.
doc1 <- "The fox chases the rabbit"
doc2 <- "The rabbit ate the cabbage"
doc3 <- "The fox caught the rabbit"

# One document per element; the element position serves as the document id.
doc.vec <- c(doc1, doc2, doc3)

WORDCOUNT MAPPER & REDUCER

# Word-count map step: tokenise every document in the chunk.
#
# k: keys handed over by rmr2 (not used).
# v: character vector with one document per element.
# Returns keyval(document index, list of token vectors). The index is the
# position of the document inside `v`.
WDMap <- function(k, v) {
  # Trim the ends and split on runs of whitespace, so that repeated or
  # leading/trailing blanks do not produce empty-string "words".
  tokens <- strsplit(trimws(v), "[[:space:]]+")
  keyval(seq_along(v), tokens)
}

# Word-count reduce step: count the lower-cased tokens of one document.
#
# k: document index (the key emitted by WDMap).
# v: list of token vectors belonging to that document.
# Returns keyval(k, list(data frame)) where the data frame has a factor column
# `word` and an integer count column named "doc<k>".
WDReduce <- function(k, v) {
  counts <- table(tolower(unlist(v)))
  # Build the frame explicitly so it always has exactly two columns, even when
  # the document contributed no tokens.
  df_doc <- data.frame(
    word = factor(names(counts), levels = names(counts)),
    n = as.vector(counts)
  )
  names(df_doc) <- c("word", paste0("doc", k))
  keyval(k, list(df_doc))
}

TF MAPPER

# Map step of the second job: combine the per-document word-count tables into
# a single term-by-document count matrix.
#
# k: keys handed over by rmr2 (not used).
# v: list of data frames, each with a `word` column followed by one count
#    column (the shape produced by WDReduce).
# Returns keyval(words, count matrix) with one row per word and one column per
# document, or NULL when `v` is empty.
TfMap <- function(k, v) {
  # Nothing to merge: a map function may return NULL to emit no output.
  if (length(v) == 0L) {
    return(NULL)
  }
  # Full outer join on `word`, folding the tables in from left to right.
  merged <- Reduce(
    function(acc, doc_df) merge(acc, doc_df, by = "word", all = TRUE),
    v
  )
  # A word that is absent from a document has count 0, not NA.
  merged[is.na(merged)] <- 0
  # drop = FALSE keeps the document column name when there is one document.
  keyval(merged$word, as.matrix(merged[, -1, drop = FALSE]))
}

TF-IDF MAPPER & REDUCER

# Map step of the TF-IDF job: turn each document's word counts into relative
# term frequencies and combine them into one term-by-document matrix.
#
# k: keys handed over by rmr2 (not used).
# v: list of data frames, each with a `word` column followed by one count
#    column (the shape produced by WDReduce).
# Returns keyval(words, TF matrix) with one row per word and one column per
# document, or NULL when `v` is empty.
TfIdfMap <- function(k, v) {
  # Nothing to merge: a map function may return NULL to emit no output.
  if (length(v) == 0L) {
    return(NULL)
  }
  # Term frequency = count / total number of tokens in that document.
  tf_tables <- lapply(v, function(doc_df) {
    doc_df[[2]] <- doc_df[[2]] / sum(doc_df[[2]])
    doc_df
  })
  # Full outer join on `word`, folding the tables in from left to right.
  merged <- Reduce(
    function(acc, doc_df) merge(acc, doc_df, by = "word", all = TRUE),
    tf_tables
  )
  # A word that is absent from a document has frequency 0, not NA.
  merged[is.na(merged)] <- 0
  # drop = FALSE keeps the document column name when there is one document.
  keyval(merged$word, as.matrix(merged[, -1, drop = FALSE]))
}

# Reduce step of the TF-IDF job: scale a word's term frequencies by its
# inverse document frequency.
#
# k: the word.
# v: the term-frequency values collected for that word.
# Returns keyval(k, v * idf) with idf = log2(length(v) / number of non-zero
# entries of v). length(v) counts every cell of `v`, so it equals the number
# of documents when `v` holds a single row for the word.
TfIdfReduce <- function(k, v) {
  n_cells <- length(v)
  n_nonzero <- sum(v != 0)
  idf <- log2(n_cells / n_nonzero)
  keyval(k, v * idf)
}

MAPREDUCE (TF)

# Write the documents to HDFS.
hadoop.doc.vec <- to.dfs(doc.vec)

# Two chained jobs: the inner one counts words per document (WDMap/WDReduce),
# the outer one merges those counts into a term-frequency matrix (TfMap).
result <- mapreduce(
  input = mapreduce(
    input  = hadoop.doc.vec,
    map    = WDMap,
    reduce = WDReduce
  ),
  map = TfMap
)

RESULT (TF)

# Read the job output back from HDFS and print it ($key = words, $val = counts).
(hadoop.result <- from.dfs(result))
## $key
## [1] chases  fox     rabbit  the     ate     cabbage caught 
## Levels: ate cabbage caught chases fox rabbit the
## 
## $val
##      doc1 doc2 doc3
## [1,]    1    0    0
## [2,]    1    0    1
## [3,]    1    1    1
## [4,]    2    2    2
## [5,]    0    1    0
## [6,]    0    1    0
## [7,]    0    0    1
# Show the result as one table: the words next to their per-document counts.
data.frame(word = hadoop.result$key, hadoop.result$val)
## (columns: word <fctr>; doc1, doc2, doc3 <dbl>)
##      word doc1 doc2 doc3
## 1  chases    1    0    0
## 2     fox    1    0    1
## 3  rabbit    1    1    1
## 4     the    2    2    2
## 5     ate    0    1    0
## 6 cabbage    0    1    0
## 7  caught    0    0    1

RESULT (TF-IDF)

# Write the documents to HDFS again for the TF-IDF run.
hadoop.doc.vec <- to.dfs(doc.vec)

# Two chained jobs: the inner one counts words per document (WDMap/WDReduce),
# the outer one builds term frequencies (TfIdfMap) and weights each word by
# its inverse document frequency (TfIdfReduce).
result2 <- mapreduce(
  input = mapreduce(
    input  = hadoop.doc.vec,
    map    = WDMap,
    reduce = WDReduce
  ),
  map    = TfIdfMap,
  reduce = TfIdfReduce
)

# Read the job output back from HDFS and print it.
hadoop.result2 <- from.dfs(result2)
hadoop.result2
## $key
## [1] ate     fox     the     caught  chases  rabbit  cabbage
## Levels: ate cabbage caught chases fox rabbit the
## 
## $val
##           doc1      doc2      doc3
## [1,] 0.0000000 0.3169925 0.0000000
## [2,] 0.1169925 0.0000000 0.1169925
## [3,] 0.0000000 0.0000000 0.0000000
## [4,] 0.0000000 0.0000000 0.3169925
## [5,] 0.3169925 0.0000000 0.0000000
## [6,] 0.0000000 0.0000000 0.0000000
## [7,] 0.0000000 0.3169925 0.0000000
# Show the result as one table: the words next to their per-document TF-IDF.
data.frame(word=hadoop.result2$key,hadoop.result2$val)
## (columns: word <fctr>; doc1, doc2, doc3 <dbl>)
##      word      doc1      doc2      doc3
## 1     ate 0.0000000 0.3169925 0.0000000
## 2     fox 0.1169925 0.0000000 0.1169925
## 3     the 0.0000000 0.0000000 0.0000000
## 4  caught 0.0000000 0.0000000 0.3169925
## 5  chases 0.3169925 0.0000000 0.0000000
## 6  rabbit 0.0000000 0.0000000 0.0000000
## 7 cabbage 0.0000000 0.3169925 0.0000000

EXAMPLE

# Seven short documents for a larger TF-IDF example.
doc1 <- "Stray cats are running all over the place. I see 10 a day!"
doc2 <- "Cats are killers. They kill billions of animals a year."
doc3 <- "The best food in Columbus, OH is   the North Market."
doc4 <- "Brand A is the best tasting cat food around. Your cat will love it."
doc5 <- "Buy Brand C cat food for your cat. Brand C makes healthy and happy cats."
doc6 <- "The Arnold Classic came to town this weekend. It reminds us to be healthy."
doc7 <- "I have nothing to say. In summary, I have told you nothing."

# Write the documents to HDFS, one document per element.
hadoop.doc.vec <- to.dfs(c(doc1, doc2, doc3, doc4, doc5, doc6, doc7))

# Same two chained jobs as before: word count per document, then TF-IDF.
result.example <- mapreduce(
  input = mapreduce(
    input  = hadoop.doc.vec,
    map    = WDMap,
    reduce = WDReduce
  ),
  map    = TfIdfMap,
  reduce = TfIdfReduce
)

# Read the output back and show the words next to their per-document TF-IDF.
hadoop.result.example <- from.dfs(result.example)
data.frame(word = hadoop.result.example$key, hadoop.result.example$val)
## (columns: word <fctr>; doc1 ... doc7 <dbl>)
## (row 1 has an empty word: splitting doc3 on single spaces turns the run of
##  spaces in "is   the" into empty-string tokens)
##    word       doc1      doc2      doc3       doc4       doc5       doc6      doc7
## 1       0.00000000 0.0000000 0.4678925 0.00000000 0.00000000 0.00000000 0.0000000
## 2     a 0.09403019 0.1222392 0.0000000 0.08731374 0.00000000 0.00000000 0.0000000
## 3     c 0.00000000 0.0000000 0.0000000 0.00000000 0.37431399 0.00000000 0.0000000
## 4     i 0.13902730 0.0000000 0.0000000 0.00000000 0.00000000 0.00000000 0.3012258
## 5    10 0.21595038 0.0000000 0.0000000 0.00000000 0.00000000 0.00000000 0.0000000
## 6    be 0.00000000 0.0000000 0.0000000 0.00000000 0.00000000 0.20052535 0.0000000
## 7    in 0.00000000 0.0000000 0.1506129 0.00000000 0.00000000 0.00000000 0.1506129
## 8    is 0.00000000 0.0000000 0.1506129 0.12909678 0.00000000 0.00000000 0.0000000
## 9    it 0.00000000 0.0000000 0.0000000 0.00000000 0.00000000 0.20052535 0.0000000
## 10   of 0.00000000 0.2807355 0.0000000 0.00000000 0.00000000 0.00000000 0.0000000