Rhadoop

RHADOOP - WORD COUNT & WORD CLOUD - 2 (17/11/14 Lecture Note)

딥스탯 2017. 11. 28. 23:20

“R2_3.R”

setwd("/home/stat/다운로드")

#install.packages("tm")
#install.packages("wordcloud")

library(tm)
## Loading required package: NLP
library(wordcloud)
## Loading required package: RColorBrewer
textMining = readLines("wikipedia.txt") 
myCorpus <- Corpus(VectorSource(textMining))

myCorpus <- tm_map(myCorpus, stripWhitespace)
myCorpus <- tm_map(myCorpus, tolower)
myCorpus <- tm_map(myCorpus, removePunctuation)
myCorpus <- tm_map(myCorpus, removeNumbers)
myCorpus <- tm_map(myCorpus, removeWords, stopwords("english"))

#myCorpus <- tm_map(myCorpus, PlainTextDocument)
tdm <- TermDocumentMatrix(myCorpus)
m <- as.matrix(tdm)

# calculate the frequency of each word and sort in descending order
wordFreq <- sort(rowSums(m), decreasing=TRUE)
pal=brewer.pal(8,"Dark2")

set.seed(1234) # to make it reproducible
wordcloud(words=names(wordFreq), freq=wordFreq, min.freq=2,colors=pal, random.order=F)
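
Depending on the tm version, passing tolower directly to tm_map() can strip the document class and make TermDocumentMatrix() fail; a minimal sketch of the usual workaround, wrapping base functions in content_transformer() (an alternative, not part of the lecture script):

# with newer tm versions, wrap plain functions so the corpus keeps its document class
myCorpus <- tm_map(myCorpus, content_transformer(tolower))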

“R2_3.R” ver.HADOOP

#install.packages("dplyr")

library(tm) 
library(wordcloud) 
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(rhdfs)
## Loading required package: rJava
## 
## HADOOP_CMD=/home/stat/hadoop/hadoop-2.7.4/bin/hadoop
## 
## Be sure to run hdfs.init()
hdfs.init()
library(rmr2)
## Warning: S3 methods 'gorder.default', 'gorder.factor', 'gorder.data.frame',
## 'gorder.matrix', 'gorder.raw' were declared in NAMESPACE but not found
## Please review your hadoop settings. See help(hadoop.settings)
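
If a Hadoop cluster is not available, or just for debugging the mapper/reducer logic, rmr2 can run the same jobs inside the current R session; a small optional setting (not part of the lecture script):

# run mapreduce() jobs in-process instead of on Hadoop; handy for testing
rmr.options(backend = "local")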

IDEA

For parallel processing, the text is split into lines and each line is processed separately.
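
The plain-R sketch below (no Hadoop; the object names are illustrative only) shows the per-line processing that job 3.1 performs: one word-count vector per input line.

# illustrative only: count words separately within each line
lines <- c("text mining example", "another text line")
lapply(lines, function(x) table(strsplit(x, " ")[[1]]))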

setwd("/home/stat/다운로드")
textMining = readLines("wikipedia.txt") 

textMining
## [1] "Text mining, also referred to as text data mining, roughly equivalent to text analytics, refers to the process of deriving high-quality information from text. High-quality information is typically derived through the devising of patterns and trends through means such as statistical pattern learning. Text mining usually involves the process of structuring the input text (usually parsing, along with the addition of some derived linguistic features and the removal of others, and subsequent insertion into a database), deriving patterns within the structured data, and finally evaluation and interpretation of the output. 'High quality' in text mining usually refers to some combination of relevance, novelty, and interestingness. Typical text mining tasks include text categorization, text clustering, concept/entity extraction, production of granular taxonomies, sentiment analysis, document summarization, and entity relation modeling (i.e., learning relations between named entities)."
## [2] " "                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [3] "Text analysis involves information retrieval, lexical analysis to study word frequency distributions, pattern recognition, tagging/annotation, information extraction, data mining techniques including link and association analysis, visualization, and predictive analytics. The overarching goal is, essentially, to turn text into data for analysis, via application of natural language processing (NLP) and analytical methods."                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      
## [4] " "                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
## [5] "A typical application is to scan a set of documents written in a natural language and either model the document set for predictive classification purposes or populate a database or search index with the information extracted."

MAPPER 3.1

mapper_3.1 <- function(k,v){
  # key each input line by its position so the reducer handles one line per key
  return(keyval(seq_along(v), v))
}

REDUCER 3.1

reducer_3.1 <- function(k,v){
  # clean the line, build a term-document matrix, and count each word's occurrences
  wordFreq <- Corpus(VectorSource(v)) %>%
    tm_map(stripWhitespace) %>%
    tm_map(tolower) %>%
    tm_map(removePunctuation) %>%
    tm_map(removeNumbers) %>%
    tm_map(removeWords, stopwords("english")) %>%
    TermDocumentMatrix() %>%
    as.matrix() %>%
    rowSums()
  # wrap in a list so each key keeps its whole named count vector as one value
  return(keyval(k, list(wordFreq)))
}

MAPREDUCE 3.1

hadoop_text <- to.dfs(textMining)

mr_3.1 <- mapreduce(hadoop_text,
          map = mapper_3.1,
          reduce = reducer_3.1)

result_mr_3.1 <- from.dfs(mr_3.1)
result_mr_3.1
## $key
## [1] 1 2 3 4 5
## 
## $val
## $val[[1]]
##        addition           along            also        analysis 
##               1               1               1               1 
##       analytics  categorization      clustering     combination 
##               1               1               1               1 
##   conceptentity            data        database         derived 
##               1               2               1               2 
##        deriving        devising        document        entities 
##               2               1               1               1 
##          entity      equivalent      evaluation      extraction 
##               1               1               1               1 
##        features         finally        granular            high 
##               1               1               1               1 
##     highquality         include     information           input 
##               2               1               2               1 
##       insertion interestingness  interpretation        involves 
##               1               1               1               1 
##        learning      linguistic           means          mining 
##               2               1               1               5 
##        modeling           named         novelty          others 
##               1               1               1               1 
##          output         parsing         pattern        patterns 
##               1               1               1               2 
##         process      production         quality        referred 
##               2               1               1               1 
##          refers        relation       relations       relevance 
##               2               1               1               1 
##         removal         roughly       sentiment     statistical 
##               1               1               1               1 
##      structured     structuring      subsequent   summarization 
##               1               1               1               1 
##           tasks      taxonomies            text          trends 
##               1               1              10               1 
##         typical       typically         usually          within 
##               1               1               3               1 
## 
## $val[[2]]
## numeric(0)
## 
## $val[[3]]
##          analysis        analytical         analytics       application 
##                 4                 1                 1                 1 
##       association              data     distributions       essentially 
##                 1                 2                 1                 1 
##        extraction         frequency              goal         including 
##                 1                 1                 1                 1 
##       information          involves          language           lexical 
##                 2                 1                 1                 1 
##              link           methods            mining           natural 
##                 1                 1                 1                 1 
##               nlp       overarching           pattern        predictive 
##                 1                 1                 1                 1 
##        processing       recognition         retrieval             study 
##                 1                 1                 1                 1 
## taggingannotation        techniques              text              turn 
##                 1                 1                 2                 1 
##               via     visualization              word 
##                 1                 1                 1 
## 
## $val[[4]]
## numeric(0)
## 
## $val[[5]]
##    application classification       database       document      documents 
##              1              1              1              1              1 
##         either      extracted          index    information       language 
##              1              1              1              1              1 
##          model        natural       populate     predictive       purposes 
##              1              1              1              1              1 
##           scan         search            set        typical        written 
##              1              1              2              1              1
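
Each $val entry above is the word-count vector for one line; job 3.2 below merges them into one total per word. A quick local preview of that merge (assuming result_mr_3.1 is already in the workspace):

# sum the per-line counts by word name, without Hadoop
flat <- unlist(result_mr_3.1$val)
head(sort(tapply(flat, names(flat), sum), decreasing = TRUE))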

MAPPER 3.2

mapper_3.2 <- function(k,v){
  # flatten the per-line count vectors: each word becomes a key, its count the value
  keyval(names(unlist(v)), unlist(v))
}

REDUCER 3.2

reducer_3.2 <- function(k,v){
  # sum the counts of the same word coming from different lines
  keyval(k, sum(v))
}

MAPREDUCE 3.2

mr_3.2 <- mapreduce(mr_3.1,
                  map = mapper_3.2,
                  reduce = reducer_3.2)

result_mr_3.2 <- from.dfs(mr_3.2)
result_mr_3.2
## $key
##   [1] "nlp"               "set"               "via"              
##   [4] "also"              "data"              "goal"             
##   [7] "high"              "link"              "scan"             
##  [10] "text"              "turn"              "word"             
##  [13] "along"             "index"             "input"            
##  [16] "means"             "model"             "named"            
##  [19] "study"             "tasks"             "either"           
##  [22] "entity"            "mining"            "others"           
##  [25] "output"            "refers"            "search"           
##  [28] "trends"            "within"            "derived"          
##  [31] "finally"           "include"           "lexical"          
##  [34] "methods"           "natural"           "novelty"          
##  [37] "parsing"           "pattern"           "process"          
##  [40] "quality"           "removal"           "roughly"          
##  [43] "typical"           "usually"           "written"          
##  [46] "addition"          "analysis"          "database"         
##  [49] "deriving"          "devising"          "document"         
##  [52] "entities"          "features"          "granular"         
##  [55] "involves"          "language"          "learning"         
##  [58] "modeling"          "patterns"          "populate"         
##  [61] "purposes"          "referred"          "relation"         
##  [64] "analytics"         "documents"         "extracted"        
##  [67] "frequency"         "including"         "insertion"        
##  [70] "relations"         "relevance"         "retrieval"        
##  [73] "sentiment"         "typically"         "analytical"       
##  [76] "clustering"        "equivalent"        "evaluation"       
##  [79] "extraction"        "linguistic"        "predictive"       
##  [82] "processing"        "production"        "structured"       
##  [85] "subsequent"        "taxonomies"        "techniques"       
##  [88] "application"       "association"       "combination"      
##  [91] "essentially"       "highquality"       "information"      
##  [94] "overarching"       "recognition"       "statistical"      
##  [97] "structuring"       "conceptentity"     "distributions"    
## [100] "summarization"     "visualization"     "categorization"   
## [103] "classification"    "interpretation"    "interestingness"  
## [106] "taggingannotation"
## 
## $val
##   [1]  1  2  1  1  4  1  1  1  1 12  1  1  1  1  1  1  1  1  1  1  1  1  6
##  [24]  1  1  2  1  1  1  2  1  1  1  1  2  1  1  2  2  1  1  1  2  3  1  1
##  [47]  5  2  2  1  2  1  1  1  2  2  2  1  2  1  1  1  1  2  1  1  1  1  1
##  [70]  1  1  1  1  1  1  1  1  1  2  1  2  1  1  1  1  1  1  2  1  1  1  2
##  [93]  5  1  1  1  1  1  1  1  1  1  1  1  1  1
pal=brewer.pal(8,"Dark2")

set.seed(1234) # to make it reproducible
wordcloud(words=result_mr_3.2$key, freq=result_mr_3.2$val, min.freq=2,colors=pal, random.order=F)
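
As a sanity check, the Hadoop totals should match the wordFreq vector from the non-Hadoop script, assuming it is still in the workspace (sketch only):

# per-word counts from the two versions should agree
hadoop_freq <- setNames(result_mr_3.2$val, result_mr_3.2$key)
all.equal(as.numeric(hadoop_freq[names(wordFreq)]), as.numeric(wordFreq))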

“R2_4.R”

#install.packages("tm")
#install.packages("wordcloud")
library(tm)
library(wordcloud)

setwd("/home/stat/다운로드")
shakespeare = readLines("shakespeare.txt")
length(shakespeare)
## [1] 124386
head(shakespeare)
## [1] "The Project Gutenberg EBook of The Complete Works of William Shakespeare, by"
## [2] "William Shakespeare"                                                         
## [3] ""                                                                            
## [4] "This eBook is for the use of anyone anywhere at no cost and with"            
## [5] "almost no restrictions whatsoever.  You may copy it, give it away or"        
## [6] "re-use it under the terms of the Project Gutenberg License included"
tail(shakespeare)
## [1] ""                                                                               
## [2] ""                                                                               
## [3] "End of the Project Gutenberg EBook of The Complete Works of William"            
## [4] "Shakespeare, by William Shakespeare"                                            
## [5] ""                                                                               
## [6] "*** END OF THIS PROJECT GUTENBERG EBOOK COMPLETE WORKS--WILLIAM SHAKESPEARE ***"
shakespeare = shakespeare[-(124369:length(shakespeare))]
shakespeare = shakespeare[-(1:174)]
length(shakespeare)
## [1] 124194
shakespeare = paste(shakespeare, collapse = " ")
length(shakespeare)
## [1] 1
shakespeare = strsplit(shakespeare, "<<[^>]*>>")[[1]]
length(shakespeare)
## [1] 218
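
The "<<[^>]*>>" pattern splits the collapsed text at the <<...>> separators that the Gutenberg file places between works; a toy illustration:

# toy example of the split pattern: everything between << and >> acts as a separator
strsplit("play one <<THIS ELECTRONIC VERSION...>> play two", "<<[^>]*>>")[[1]]
# gives "play one " and " play two"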
(dramatis.personae <- grep("Dramatis Personae", shakespeare, ignore.case = TRUE)) 
##  [1]   2   8  11  17  23  28  33  43  49  55  62  68  74  81  87  93  99
## [18] 105 111 117 122 126 134 140 146 152 158 164 170 176 182 188 194 200
## [35] 206 212
length(dramatis.personae)
## [1] 36
shakespeare = shakespeare[-dramatis.personae]
length(shakespeare)
## [1] 182
myCorpus <- Corpus(VectorSource(shakespeare))
myCorpus <- tm_map(myCorpus, tolower)
myCorpus <- tm_map(myCorpus, removePunctuation)
myCorpus <- tm_map(myCorpus, removeNumbers)
#myCorpus <- tm_map(myCorpus, removeWords, stopwords("english"))
myStopwords <- c(stopwords('english'), "thou", "let", "shall", "thee", "thy",
                 "will", "now", "sir", "well", "upon", "one", "tis",
                 "may", "yet", "must", "enter")
# remove stopwords from corpus
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
#myCorpus <- tm_map(myCorpus, PlainTextDocument)

tdm <- TermDocumentMatrix(myCorpus)
m <- as.matrix(tdm)

wordFreq <- sort(rowSums(m), decreasing=TRUE)

pal=brewer.pal(8,"Dark2")

set.seed(1234) # to make it reproducible
wordcloud(words=names(wordFreq), freq=wordFreq, min.freq=500, colors=pal, random.order=F)

“R2_4.R” ver.HADOOP

library(tm)
library(wordcloud)
library(dplyr)

library(rhdfs)
hdfs.init()
library(rmr2)

setwd("/home/stat/다운로드")
shakespeare = readLines("shakespeare.txt")

MAPPER 4.1

mapper_4.1 <- function(k,v){
  # drop the Project Gutenberg footer and header (line numbers taken from the inspection above)
  temp_vec <- v[-(124369:length(v))]
  temp_vec <- temp_vec[-(1:174)]
  # collapse to one string, then split the complete works into pieces at the <<...>> markers
  temp_vec <- paste(temp_vec, collapse = " ")
  temp_vec <- strsplit(temp_vec, "<<[^>]*>>")[[1]]
  
  # drop the "Dramatis Personae" pieces (cast lists), as in the non-Hadoop version
  temp_index <- grep("Dramatis Personae", temp_vec, ignore.case = TRUE)
  temp_vec <- temp_vec[-temp_index]
  keyval(seq_along(temp_vec), temp_vec)
}

REDUCER 4.1

reducer_4.1 <- function(k,v){
  # English stopwords plus words that are frequent in Shakespeare but uninformative
  myStopwords <- c(stopwords('english'), "thou", "let", "shall", "thee", "thy",
                   "will", "now", "sir", "well", "upon", "one", "tis",
                   "may", "yet", "must", "enter")
  
  # clean each piece, build a term-document matrix, and count word occurrences
  wordFreq <- Corpus(VectorSource(v)) %>%
    tm_map(tolower) %>%
    tm_map(removePunctuation) %>%
    tm_map(removeNumbers) %>%
    tm_map(removeWords, myStopwords) %>%
    TermDocumentMatrix() %>%
    as.matrix() %>%
    rowSums()
    
  return(keyval(k, list(wordFreq)))
}

MAPPER 4.2

mapper_4.2 <- function(k,v){
  # same as mapper_3.2: one key per word, with its per-piece count as the value
  keyval(names(unlist(v)), unlist(v))
}

REDUCER 4.2

reducer_4.2 <- function(k,v){
  # same as reducer_3.2: total count per word
  keyval(k, sum(v))
}

MAPREDUCE 4

hadoop_shak <- to.dfs(shakespeare)

mr_4 <- mapreduce(
  hadoop_shak,
  map = mapper_4.1,
  reduce = reducer_4.1
  ) %>%
  mapreduce(map = mapper_4.2,
            reduce = reducer_4.2)
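
The pipe feeds the first job's output directly into the second; written out without %>% (mr_4_step1 is just an illustrative intermediate name), the same chain is:

# same two jobs without the pipe: the first job's output handle feeds the second job
mr_4_step1 <- mapreduce(hadoop_shak, map = mapper_4.1, reduce = reducer_4.1)
mr_4       <- mapreduce(mr_4_step1, map = mapper_4.2, reduce = reducer_4.2)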

result_mr_4 <- from.dfs(mr_4)
as.data.frame(result_mr_4)
##    key val
## 1  abr   5
## 2  aby   2
## 3  ace   3
## 4  act 346
## 5  add  39
## 6  ado  19
## 7  aer   2
## 8  agd   3
## 9  age 197
## 10 ago  30
## (first 10 rows of the key/val data frame; key <fctr>, val <dbl>)
pal=brewer.pal(8,"Dark2")

set.seed(1234)
wordcloud(words=result_mr_4$key, freq=result_mr_4$val, min.freq=500, colors=pal, random.order=F)
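
To keep the cloud as an image file rather than only on the screen device, the call can be wrapped in a standard graphics device; a minimal sketch (the file name is illustrative):

# write the word cloud to a PNG file (file name is illustrative)
png("shakespeare_wordcloud.png", width = 800, height = 800)
set.seed(1234)
wordcloud(words=result_mr_4$key, freq=result_mr_4$val, min.freq=500, colors=pal, random.order=F)
dev.off()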