RHADOOP - WORD COUNT & WORD CLOUD -2 (17/11/14 Lecture Note)
딥스탯
2017. 11. 28. 23:20
Files and reference materials: http://stat.knu.ac.kr/
http://stat.knu.ac.kr/pg/bbs/board.php?bo_table=ja02&part=1
11_14_Lecture_Note
“R2_3.R”
setwd("/home/stat/다운로드")
#install.packages("tm")
#install.packages("wordcloud")
library(tm)
## Loading required package: NLP
library(wordcloud)
## Loading required package: RColorBrewer
textMining = readLines("wikipedia.txt")
# build a corpus with one document per input line
myCorpus <- Corpus(VectorSource(textMining))
myCorpus <- tm_map(myCorpus, stripWhitespace)
myCorpus <- tm_map(myCorpus, tolower)
myCorpus <- tm_map(myCorpus, removePunctuation)
myCorpus <- tm_map(myCorpus, removeNumbers)
myCorpus <- tm_map(myCorpus, removeWords, stopwords("english"))
#myCorpus <- tm_map(myCorpus, PlainTextDocument)
# term-document matrix: rows are terms, columns are documents (lines)
tdm <- TermDocumentMatrix(myCorpus)
m <- as.matrix(tdm)
# calculate the frequency of words and sort it descendingly by frequency
wordFreq <- sort(rowSums(m), decreasing=TRUE)
pal=brewer.pal(8,"Dark2")
set.seed(1234) # to make it reproducible
wordcloud(words=names(wordFreq), freq=wordFreq, min.freq=2, colors=pal, random.order=F)
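A side note: with newer releases of tm (0.6 and later), plain functions such as tolower are safest wrapped in content_transformer() so the corpus class is preserved. The line below is an equivalent, version-proof form of the tolower step above, not an extra step to run:

# equivalent form of the tolower step for newer tm versions
myCorpus <- tm_map(myCorpus, content_transformer(tolower))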
“R2_3.R” ver.HADOOP
#install.packages("dplyr")
library(tm)
library(wordcloud)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(rhdfs)
## Loading required package: rJava
##
## HADOOP_CMD=/home/stat/hadoop/hadoop-2.7.4/bin/hadoop
##
## Be sure to run hdfs.init()
hdfs.init()
library(rmr2)
## Warning: S3 methods 'gorder.default', 'gorder.factor', 'gorder.data.frame',
## 'gorder.matrix', 'gorder.raw' were declared in NAMESPACE but not found
## Please review your hadoop settings. See help(hadoop.settings)
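If that warning reflects an incomplete Hadoop setup, rmr2 can also run its jobs in-process, which is convenient for testing the mappers and reducers below; a minimal sketch using rmr2's backend option:

rmr.options(backend = "local")    # run MapReduce jobs in-process, no cluster needed
# rmr.options(backend = "hadoop") # switch back for real runs on the cluster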
IDEA
For parallel processing, split the text into lines, then process each line independently.
setwd("/home/stat/다운로드")
textMining = readLines("wikipedia.txt")
textMining
## [1] "Text mining, also referred to as text data mining, roughly equivalent to text analytics, refers to the process of deriving high-quality information from text. High-quality information is typically derived through the devising of patterns and trends through means such as statistical pattern learning. Text mining usually involves the process of structuring the input text (usually parsing, along with the addition of some derived linguistic features and the removal of others, and subsequent insertion into a database), deriving patterns within the structured data, and finally evaluation and interpretation of the output. 'High quality' in text mining usually refers to some combination of relevance, novelty, and interestingness. Typical text mining tasks include text categorization, text clustering, concept/entity extraction, production of granular taxonomies, sentiment analysis, document summarization, and entity relation modeling (i.e., learning relations between named entities)."
## [2] " "
## [3] "Text analysis involves information retrieval, lexical analysis to study word frequency distributions, pattern recognition, tagging/annotation, information extraction, data mining techniques including link and association analysis, visualization, and predictive analytics. The overarching goal is, essentially, to turn text into data for analysis, via application of natural language processing (NLP) and analytical methods."
## [4] " "
## [5] "A typical application is to scan a set of documents written in a natural language and either model the document set for predictive classification purposes or populate a database or search index with the information extracted."
MAPPER 3.1
mapper_3.1 <- function(k, v){
  # key each line by its position; the value is the line itself
  return(keyval(seq_along(v), v))
}
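What the mapper emits can be inspected without Hadoop; keys() and values() are rmr2's accessors for a key-value object. A quick sketch, using textMining from above:

kv <- mapper_3.1(NULL, textMining)
keys(kv)    # 1 2 3 4 5 -- one key per line
values(kv)  # the original five lines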
REDUCER 3.1
reducer_3.1 <- function(k, v){
  # per-line cleaning and counting, the same chain as the serial version
  myCorpus <- Corpus(VectorSource(v)) %>%
    tm_map(stripWhitespace) %>%
    tm_map(tolower) %>%
    tm_map(removePunctuation) %>%
    tm_map(removeNumbers) %>%
    tm_map(removeWords, stopwords("english")) %>%
    TermDocumentMatrix() %>%
    as.matrix() %>%
    rowSums()
  # wrap in list() so the whole named count vector is one value for key k
  return(keyval(k, list(myCorpus)))
}
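The list() wrapper in the last line matters: keyval() recycles keys to match the length of the value, so passing the named count vector directly would emit one pair per word, while list() keeps the whole vector as a single value for key k. A small sketch of the difference:

keyval(1, list(c(text = 10, mining = 5)))  # one key, one value (the whole vector)
keyval(1, c(text = 10, mining = 5))        # without list(): two pairs, key recycled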
MAPREDUCE 3.1
hadoop_text <- to.dfs(textMining)
mr_3.1 <- mapreduce(hadoop_text,
                    map = mapper_3.1,
                    reduce = reducer_3.1)
result_mr_3.1 <- from.dfs(mr_3.1)
result_mr_3.1
## $key
## [1] 1 2 3 4 5
##
## $val
## $val[[1]]
## addition along also analysis
## 1 1 1 1
## analytics categorization clustering combination
## 1 1 1 1
## conceptentity data database derived
## 1 2 1 2
## deriving devising document entities
## 2 1 1 1
## entity equivalent evaluation extraction
## 1 1 1 1
## features finally granular high
## 1 1 1 1
## highquality include information input
## 2 1 2 1
## insertion interestingness interpretation involves
## 1 1 1 1
## learning linguistic means mining
## 2 1 1 5
## modeling named novelty others
## 1 1 1 1
## output parsing pattern patterns
## 1 1 1 2
## process production quality referred
## 2 1 1 1
## refers relation relations relevance
## 2 1 1 1
## removal roughly sentiment statistical
## 1 1 1 1
## structured structuring subsequent summarization
## 1 1 1 1
## tasks taxonomies text trends
## 1 1 10 1
## typical typically usually within
## 1 1 3 1
##
## $val[[2]]
## numeric(0)
##
## $val[[3]]
## analysis analytical analytics application
## 4 1 1 1
## association data distributions essentially
## 1 2 1 1
## extraction frequency goal including
## 1 1 1 1
## information involves language lexical
## 2 1 1 1
## link methods mining natural
## 1 1 1 1
## nlp overarching pattern predictive
## 1 1 1 1
## processing recognition retrieval study
## 1 1 1 1
## taggingannotation techniques text turn
## 1 1 2 1
## via visualization word
## 1 1 1
##
## $val[[4]]
## numeric(0)
##
## $val[[5]]
## application classification database document documents
## 1 1 1 1 1
## either extracted index information language
## 1 1 1 1 1
## model natural populate predictive purposes
## 1 1 1 1 1
## scan search set typical written
## 1 1 2 1 1
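Each element of $val is a separate per-line count vector (the blank lines give numeric(0)), so a second MapReduce pass is needed to sum counts belonging to the same word. What that pass computes can be previewed locally; a sketch using the result above:

flat <- unlist(result_mr_3.1$val)  # concatenate per-line counts; names are the words
head(sort(tapply(flat, names(flat), sum), decreasing = TRUE))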
MAPPER 3.2
mapper_3.2 <- function(k, v){
  # re-key by word: each per-line count becomes a (word, count) pair
  keyval(names(unlist(v)), unlist(v))
}
REDUCER 3.2
reducer_3.2 <- function(k, v){
  # sum the counts of the same word across lines
  keyval(k, sum(v))
}
MAPREDUCE 3.2
mr_3.2 <- mapreduce(mr_3.1,
                    map = mapper_3.2,
                    reduce = reducer_3.2)
result_mr_3.2 <- from.dfs(mr_3.2)
result_mr_3.2
## $key
## [1] "nlp" "set" "via"
## [4] "also" "data" "goal"
## [7] "high" "link" "scan"
## [10] "text" "turn" "word"
## [13] "along" "index" "input"
## [16] "means" "model" "named"
## [19] "study" "tasks" "either"
## [22] "entity" "mining" "others"
## [25] "output" "refers" "search"
## [28] "trends" "within" "derived"
## [31] "finally" "include" "lexical"
## [34] "methods" "natural" "novelty"
## [37] "parsing" "pattern" "process"
## [40] "quality" "removal" "roughly"
## [43] "typical" "usually" "written"
## [46] "addition" "analysis" "database"
## [49] "deriving" "devising" "document"
## [52] "entities" "features" "granular"
## [55] "involves" "language" "learning"
## [58] "modeling" "patterns" "populate"
## [61] "purposes" "referred" "relation"
## [64] "analytics" "documents" "extracted"
## [67] "frequency" "including" "insertion"
## [70] "relations" "relevance" "retrieval"
## [73] "sentiment" "typically" "analytical"
## [76] "clustering" "equivalent" "evaluation"
## [79] "extraction" "linguistic" "predictive"
## [82] "processing" "production" "structured"
## [85] "subsequent" "taxonomies" "techniques"
## [88] "application" "association" "combination"
## [91] "essentially" "highquality" "information"
## [94] "overarching" "recognition" "statistical"
## [97] "structuring" "conceptentity" "distributions"
## [100] "summarization" "visualization" "categorization"
## [103] "classification" "interpretation" "interestingness"
## [106] "taggingannotation"
##
## $val
## [1] 1 2 1 1 4 1 1 1 1 12 1 1 1 1 1 1 1 1 1 1 1 1 6
## [24] 1 1 2 1 1 1 2 1 1 1 1 2 1 1 2 2 1 1 1 2 3 1 1
## [47] 5 2 2 1 2 1 1 1 2 2 2 1 2 1 1 1 1 2 1 1 1 1 1
## [70] 1 1 1 1 1 1 1 1 1 2 1 2 1 1 1 1 1 1 2 1 1 1 2
## [93] 5 1 1 1 1 1 1 1 1 1 1 1 1 1
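As a sanity check, the merged MapReduce counts should agree with the serial wordFreq from “R2_3.R”; a sketch, assuming both objects are still in the session:

mr_counts <- setNames(result_mr_3.2$val, result_mr_3.2$key)
all.equal(mr_counts[names(wordFreq)], wordFreq)  # TRUE if the two runs match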
pal=brewer.pal(8,"Dark2")
set.seed(1234) # to make it reproducible
wordcloud(words=result_mr_3.2$key, freq=result_mr_3.2$val, min.freq=2, colors=pal, random.order=F)
“R2_4.R”
#install.packages("tm")
#install.packages("wordcloud")
library(tm)
library(wordcloud)
setwd("/home/stat/다운로드")
shakespeare = readLines("shakespeare.txt")
length(shakespeare)
## [1] 124386
head(shakespeare)
## [1] "The Project Gutenberg EBook of The Complete Works of William Shakespeare, by"
## [2] "William Shakespeare"
## [3] ""
## [4] "This eBook is for the use of anyone anywhere at no cost and with"
## [5] "almost no restrictions whatsoever. You may copy it, give it away or"
## [6] "re-use it under the terms of the Project Gutenberg License included"
tail(shakespeare)
## [1] ""
## [2] ""
## [3] "End of the Project Gutenberg EBook of The Complete Works of William"
## [4] "Shakespeare, by William Shakespeare"
## [5] ""
## [6] "*** END OF THIS PROJECT GUTENBERG EBOOK COMPLETE WORKS--WILLIAM SHAKESPEARE ***"
shakespeare = shakespeare[-(124369:length(shakespeare))]  # drop the Project Gutenberg footer
shakespeare = shakespeare[-(1:174)]                       # drop the Project Gutenberg header
length(shakespeare)
## [1] 124194
shakespeare = paste(shakespeare, collapse = " ")
length(shakespeare)
## [1] 1
shakespeare = strsplit(shakespeare, "<<[^>]*>>")[[1]]  # split on the <<...>> markers between works
length(shakespeare)
## [1] 218
(dramatis.personae <- grep("Dramatis Personae", shakespeare, ignore.case = TRUE))
## [1] 2 8 11 17 23 28 33 43 49 55 62 68 74 81 87 93 99
## [18] 105 111 117 122 126 134 140 146 152 158 164 170 176 182 188 194 200
## [35] 206 212
length(dramatis.personae)
## [1] 36
shakespeare = shakespeare[-dramatis.personae]
length(shakespeare)
## [1] 182
myCorpus <- Corpus(VectorSource(shakespeare))
myCorpus <- tm_map(myCorpus, tolower)
myCorpus <- tm_map(myCorpus, removePunctuation)
myCorpus <- tm_map(myCorpus, removeNumbers)
#myCorpus <- tm_map(myCorpus, removeWords, stopwords("english"))
myStopwords <- c(stopwords('english'), "thou", "let", "shall", "thee", "thy",
                 "will", "now", "sir", "well", "upon", "one", "tis", "may",
                 "yet", "must", "enter")
# remove stopwords from corpus
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
#myCorpus <- tm_map(myCorpus, PlainTextDocument)
tdm <- TermDocumentMatrix(myCorpus)
m <- as.matrix(tdm)
wordFreq <- sort(rowSums(m), decreasing=TRUE)
pal=brewer.pal(8,"Dark2")
set.seed(1234) # to make it reproducible
wordcloud(words=names(wordFreq), freq=wordFreq, min.freq=500, colors=pal, random.order=F)
“R2_4.R” ver.HADOOP
library(tm)
library(wordcloud)
library(dplyr)
library(rhdfs)
hdfs.init()
library(rmr2)
setwd("/home/stat/다운로드")
shakespeare = readLines("shakespeare.txt")
MAPPER 4.1
mapper_4.1 <- function(k, v){
  temp_vec <- v[-(124369:length(v))]                # drop the Gutenberg footer
  temp_vec <- temp_vec[-(1:174)]                    # drop the Gutenberg header
  temp_vec <- paste(temp_vec, collapse = " ")
  temp_vec <- strsplit(temp_vec, "<<[^>]*>>")[[1]]  # split into individual works
  # grep over the split chunks (not the raw lines) to find the cast lists
  temp_index <- grep("Dramatis Personae", temp_vec, ignore.case = TRUE)
  temp_vec <- temp_vec[-temp_index]
  keyval(seq_along(temp_vec), temp_vec)
}
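Note that mapper_4.1 assumes the whole file arrives in a single map call, in file order; with a local or single-split run that holds, and the mapper can be checked directly against the serial result (182 works) before submitting the job:

kv <- mapper_4.1(NULL, shakespeare)
length(values(kv))  # should be 182, as in the serial version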
REDUCER 4.1
reducer_4.1 <- function(k,v){
  myStopwords <- c(stopwords('english'), "thou", "let", "shall", "thee", "thy",
                   "will", "now", "sir", "well", "upon", "one", "tis",
                   "may", "yet", "must", "enter")
  # per-work cleaning and counting, the same chain as the serial version
  wordFreq <- Corpus(VectorSource(v)) %>%
    tm_map(tolower) %>%
    tm_map(removePunctuation) %>%
    tm_map(removeNumbers) %>%
    tm_map(removeWords, myStopwords) %>%
    TermDocumentMatrix() %>%
    as.matrix() %>%
    rowSums()
  return(keyval(k, list(wordFreq)))
}
MAPPER 4.2
mapper_4.2 <- function(k, v){
  # re-key by word, as in mapper_3.2
  keyval(names(unlist(v)), unlist(v))
}
REDUCER 4.2
reducer_4.2 <- function(k, v){
  # sum the counts of the same word across works
  keyval(k, sum(v))
}
MAPREDUCE 4
hadoop_shak <- to.dfs(shakespeare)
mr_4 <- mapreduce(hadoop_shak,
                  map = mapper_4.1,
                  reduce = reducer_4.1) %>%
  mapreduce(map = mapper_4.2,
            reduce = reducer_4.2)
result_mr_4 <- from.dfs(mr_4)
as.data.frame(result_mr_4)
##   key val
##   abr   5
##   aby   2
##   ace   3
##   act 346
##   add  39
##   ado  19
##   aer   2
##   agd   3
##   age 197
##   ago  30
(first ten rows of the key/val data frame)
pal=brewer.pal(8,"Dark2")
set.seed(1234)
wordcloud(words=result_mr_4$key, freq=result_mr_4$val, min.freq=500, colors=pal, random.order=F)
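Since dplyr is already loaded, the merged counts can also be ranked before plotting; a short sketch listing the words that dominate the cloud:

as.data.frame(result_mr_4) %>%
  arrange(desc(val)) %>%
  head(10)   # the ten most frequent words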