Rhadoop

RHADOOP - TF-IDF (17/11/28 Lecture Note)

딥스탯 2017. 11. 28. 23:30

11_28_Lecture_Note

library(tm)

## Loading required package: NLP

library(KoNLP)

## Checking user defined dictionary!

library(proxy)

## 
## Attaching package: 'proxy'

## The following objects are masked from 'package:stats':
## 
##     as.dist, dist

## The following object is masked from 'package:base':
## 
##     as.matrix

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(rhdfs)

## Loading required package: rJava

## 
## HADOOP_CMD=/home/stat/hadoop/hadoop-2.7.4/bin/hadoop

## 
## Be sure to run hdfs.init()

hdfs.init()
library(rmr2)

## Warning: S3 methods 'gorder.default', 'gorder.factor', 'gorder.data.frame',
## 'gorder.matrix', 'gorder.raw' were declared in NAMESPACE but not found

## Please review your hadoop settings. See help(hadoop.settings)

“R8_1.R”

# Three toy documents used for the TF / TF-IDF walkthrough.
doc1 <- "The fox chases the rabbit"
doc2 <- "The rabbit ate the cabbage"
doc3 <- "The fox caught the rabbit"

# Gather the documents into a list named doc1, doc2, ... and wrap it in a
# tm corpus.
doc.list <- list(doc1, doc2, doc3)
n.docs <- length(doc.list)
doc.list <- setNames(doc.list, paste0("doc", seq_len(n.docs)))
my.corpus <- Corpus(VectorSource(doc.list))

# Lower-case the corpus, then build the term-document matrix weighted by
# raw term frequency and print it as a plain matrix.
my.corpus <- tm_map(my.corpus, tolower)
TDM <- TermDocumentMatrix(my.corpus, control = list(weighting = weightTf))
m <- as.matrix(TDM)
print(m)

##          Docs
## Terms     1 2 3
##   chases  1 0 0
##   fox     1 0 1
##   rabbit  1 1 1
##   the     2 2 2
##   ate     0 1 0
##   cabbage 0 1 0
##   caught  0 0 1

# Rebuild the term-document matrix from the same corpus, this time with tm's
# TF-IDF weighting, and print it as a plain matrix.
TDM <- TermDocumentMatrix(my.corpus, control = list(weighting = weightTfIdf))
m <- as.matrix(TDM)
print(m)

##          Docs
## Terms             1         2         3
##   chases  0.3169925 0.0000000 0.0000000
##   fox     0.1169925 0.0000000 0.1169925
##   rabbit  0.0000000 0.0000000 0.0000000
##   the     0.0000000 0.0000000 0.0000000
##   ate     0.0000000 0.3169925 0.0000000
##   cabbage 0.0000000 0.3169925 0.0000000
##   caught  0.0000000 0.0000000 0.3169925

“R8_1.R” ver_HADOOP

IDEA

문장별로 wordcount를 한 다음 합쳐서 단어별로 TF-IDF를 계산한다.

#library(tm)

#library(rhdfs)
#hdfs.init()
#library(rmr2)

# The same three toy documents as in the tm example, redefined here for the
# Hadoop version.
doc1 <- "The fox chases the rabbit"
doc2 <- "The rabbit ate the cabbage"
doc3 <- "The fox caught the rabbit"

# Character vector with one element per document; this is what gets written
# to the DFS as the job input.
doc.vec <- c(doc1, doc2, doc3)

WORDCOUNT MAPPER & REDUCER

# Word-count mapper: tokenise each document and key it by its position.
#
# k: input keys supplied by rmr2 (not used).
# v: character vector of documents, one element per document.
#
# Emits one key/value pair per document: the key is the document's index
# within `v`, the value is the character vector of its tokens.
#
# NOTE(review): the index comes from seq_along(v), so it is relative to the
# chunk of input this map call receives; presumably the whole input arrives
# in a single chunk here -- confirm before running on larger inputs.
WDMap <- function(k, v) {
  # Trim the ends and split on runs of whitespace so that repeated, leading
  # or trailing blanks do not produce empty-string tokens.
  tokens <- strsplit(trimws(v), split = "[[:space:]]+")
  keyval(seq_along(v), tokens)
}

# Word-count reducer: build the word-frequency table of one document.
#
# k: document key (as emitted by the mapper); used to name the count column.
# v: list holding the token vector(s) collected for that document.
#
# Emits the key together with a one-element list containing a data frame
# with two columns: `word` (the lower-cased token) and `doc<k>` (how many
# times that token occurs).
WDReduce <- function(k, v) {
  tokens <- tolower(unlist(v))
  # Drop empty-string tokens (they appear when a document has consecutive or
  # leading separators) so that "" is never counted as a word.
  tokens <- tokens[nzchar(tokens)]
  df_doc <- as.data.frame(table(tokens))
  names(df_doc) <- c("word", paste0("doc", k))
  # Wrap the data frame in a list so it travels as a single value.
  keyval(k, list(df_doc))
}

TF MAPPER

# TF mapper: combine the per-document word-count tables into one
# term-by-document count matrix.
#
# k: keys of the incoming pairs (not used).
# v: list of data frames, each with a `word` column followed by one count
#    column for a single document.
#
# Emits the words as keys and, as values, a matrix with one row per word and
# one column per document; a word absent from a document gets a count of 0.
# Returns NULL (nothing emitted) when `v` is empty.
TfMap <- function(k, v) {
  if (length(v) == 0) {
    return(NULL)
  }
  # Full outer join on `word`, folding the tables together in input order.
  merged <- Reduce(
    function(acc, doc) merge(acc, doc, by = "word", all = TRUE),
    v
  )
  # drop = FALSE keeps a one-document result as a named single-column table
  # instead of collapsing it to a bare vector.
  counts <- merged[, -1, drop = FALSE]
  # Words missing from a document come out of the join as NA; they mean 0.
  counts[is.na(counts)] <- 0
  keyval(merged[["word"]], as.matrix(counts))
}

TF-IDF MAPPER & REDUCER

# TF-IDF mapper: turn each document's word counts into relative term
# frequencies and combine them into one term-by-document matrix.
#
# k: keys of the incoming pairs (not used).
# v: list of data frames, each with a `word` column followed by one count
#    column for a single document.
#
# Emits the words as keys and, as values, a matrix with one row per word and
# one column per document holding count / (total count of that document);
# a word absent from a document gets 0.
# Returns NULL (nothing emitted) when `v` is empty.
TfIdfMap <- function(k, v) {
  if (length(v) == 0) {
    return(NULL)
  }
  # Normalise every table's count column so it sums to 1 within the document.
  tf_tables <- lapply(v, function(doc) {
    doc[[2]] <- doc[[2]] / sum(doc[[2]])
    doc
  })
  # Full outer join on `word`, folding the tables together in input order.
  merged <- Reduce(
    function(acc, doc) merge(acc, doc, by = "word", all = TRUE),
    tf_tables
  )
  # drop = FALSE keeps a one-document result as a named single-column table
  # instead of collapsing it to a bare vector.
  tf <- merged[, -1, drop = FALSE]
  # Words missing from a document come out of the join as NA; they mean 0.
  tf[is.na(tf)] <- 0
  keyval(merged[["word"]], as.matrix(tf))
}

# TF-IDF reducer: scale the term frequencies of one word by its inverse
# document frequency.
#
# k: the word.
# v: the term-frequency values collected for that word.
#
# The weight is log2(number of entries in v / number of non-zero entries).
# NOTE(review): presumably v is a single row with one entry per document, so
# the ratio is (documents / documents containing the word) -- confirm.
TfIdfReduce <- function(k, v) {
  n_entries <- length(v)
  n_nonzero <- sum(v != 0)
  idf <- log2(n_entries / n_nonzero)
  keyval(k, v * idf)
}

MAPREDUCE (TF)

# Write the document vector to the DFS so it can serve as job input.
hadoop.doc.vec <- to.dfs(doc.vec)

# Two chained jobs: the first (WDMap + WDReduce) produces one word-count
# table per document; its output is piped in as the input of the second,
# a map-only job in which TfMap merges those tables into a count matrix.
result <- mapreduce(
    input  = hadoop.doc.vec,
    map    = WDMap,
    reduce = WDReduce) %>%
  mapreduce(
    map    = TfMap
  )

RESULT (TF)

# Read the job output back into the R session and print it; it has a `key`
# component (the words) and a `val` component (the count matrix).
hadoop.result <- from.dfs(result)
hadoop.result

## $key
## [1] chases  fox     rabbit  the     ate     cabbage caught 
## Levels: ate cabbage caught chases fox rabbit the
## 
## $val
##      doc1 doc2 doc3
## [1,]    1    0    0
## [2,]    1    0    1
## [3,]    1    1    1
## [4,]    2    2    2
## [5,]    0    1    0
## [6,]    0    1    0
## [7,]    0    0    1

# Put the words next to their per-document counts in a single data frame.
data.frame(word = hadoop.result$key, hadoop.result$val)

ABCDEFGHIJ0123456789

word <fctr>	doc1 <dbl>	doc2 <dbl>	doc3 <dbl>
chases	1	0	0
fox	1	0	1
rabbit	1	1	1
the	2	2	2
ate	0	1	0
cabbage	0	1	0
caught	0	0	1

RESULT (TF-IDF)

# Write the document vector to the DFS so it can serve as job input.
hadoop.doc.vec <- to.dfs(doc.vec)

# Two chained jobs: the first (WDMap + WDReduce) produces one word-count
# table per document; its output is piped in as the input of the second,
# where TfIdfMap builds the term-frequency matrix and TfIdfReduce applies
# the log2 inverse-document-frequency weight to each word.
result2 <- mapreduce(
    input  = hadoop.doc.vec,
    map    = WDMap,
    reduce = WDReduce) %>%
  mapreduce(
    map    = TfIdfMap,
    reduce = TfIdfReduce
  )

# Read the result back into the R session; the outer parentheses also print it.
(hadoop.result2 <- from.dfs(result2))

## $key
## [1] ate     fox     the     caught  chases  rabbit  cabbage
## Levels: ate cabbage caught chases fox rabbit the
## 
## $val
##           doc1      doc2      doc3
## [1,] 0.0000000 0.3169925 0.0000000
## [2,] 0.1169925 0.0000000 0.1169925
## [3,] 0.0000000 0.0000000 0.0000000
## [4,] 0.0000000 0.0000000 0.3169925
## [5,] 0.3169925 0.0000000 0.0000000
## [6,] 0.0000000 0.0000000 0.0000000
## [7,] 0.0000000 0.3169925 0.0000000

# Put the words next to their per-document TF-IDF weights in a single data frame.
data.frame(word=hadoop.result2$key,hadoop.result2$val)

ABCDEFGHIJ0123456789

word <fctr>	doc1 <dbl>	doc2 <dbl>	doc3 <dbl>
ate	0.0000000	0.3169925	0.0000000
fox	0.1169925	0.0000000	0.1169925
the	0.0000000	0.0000000	0.0000000
caught	0.0000000	0.0000000	0.3169925
chases	0.3169925	0.0000000	0.0000000
rabbit	0.0000000	0.0000000	0.0000000
cabbage	0.0000000	0.3169925	0.0000000

EXAMPLE

# A larger example: seven short documents. Note that the text contains
# punctuation attached to words, and doc3 contains a run of several spaces.
doc1 <- "Stray cats are running all over the place. I see 10 a day!"
doc2 <- "Cats are killers. They kill billions of animals a year." 
doc3 <- "The best food in Columbus, OH is   the North Market." 
doc4 <- "Brand A is the best tasting cat food around. Your cat will love it." 
doc5 <- "Buy Brand C cat food for your cat. Brand C makes healthy and happy cats." 
doc6 <- "The Arnold Classic came to town this weekend. It reminds us to be healthy." 
doc7 <- "I have nothing to say. In summary, I have told you nothing." 

# Write the seven documents to the DFS as the job input.
hadoop.doc.vec <- to.dfs(c(doc1,doc2,doc3,doc4,doc5,doc6,doc7))

# Same two-job TF-IDF pipeline as before: per-document word counts first,
# then TfIdfMap / TfIdfReduce on the output of the first job.
result.example <- mapreduce(
    input  = hadoop.doc.vec,
    map    = WDMap,
    reduce = WDReduce) %>%
  mapreduce(
    map    = TfIdfMap,
    reduce = TfIdfReduce
  )

# Read the result back into the R session.
hadoop.result.example <- from.dfs(result.example)

# Put the words next to their per-document TF-IDF weights in a single data frame.
data.frame(word = hadoop.result.example$key, hadoop.result.example$val)

ABCDEFGHIJ0123456789

word <fctr>	doc1 <dbl>	doc2 <dbl>	doc3 <dbl>	doc4 <dbl>	doc5 <dbl>	doc6 <dbl>	doc7 <dbl>
	0.00000000	0.0000000	0.4678925	0.00000000	0.00000000	0.00000000	0.0000000
a	0.09403019	0.1222392	0.0000000	0.08731374	0.00000000	0.00000000	0.0000000
c	0.00000000	0.0000000	0.0000000	0.00000000	0.37431399	0.00000000	0.0000000
i	0.13902730	0.0000000	0.0000000	0.00000000	0.00000000	0.00000000	0.3012258
10	0.21595038	0.0000000	0.0000000	0.00000000	0.00000000	0.00000000	0.0000000
be	0.00000000	0.0000000	0.0000000	0.00000000	0.00000000	0.20052535	0.0000000
in	0.00000000	0.0000000	0.1506129	0.00000000	0.00000000	0.00000000	0.1506129
is	0.00000000	0.0000000	0.1506129	0.12909678	0.00000000	0.00000000	0.0000000
it	0.00000000	0.0000000	0.0000000	0.00000000	0.00000000	0.20052535	0.0000000
of	0.00000000	0.2807355	0.0000000	0.00000000	0.00000000	0.00000000	0.0000000

저작자표시 비영리 변경금지 (새창열림)

RHADOOP - TF-IDF (17/11/28 Lecture Note)

파일 및 참고자료 : http://stat.knu.ac.kr/

http://stat.knu.ac.kr/pg/bbs/board.php?bo_table=ja02&part=1

11_28_Lecture_Note

“R8_1.R”

“R8_1.R” ver_HADOOP

IDEA

WORDCOUNT MAPPER & REDUCER

TF MAPPER

TF-IDF MAPPER & REDUCER

MAPREDUCE (TF)

RESULT (TF)

RESULT (TF-IDF)

EXAMPLE