Rhadoop
RHADOOP - SENTIMENT ANALYSIS (17/11/23 Lecture Note)
딥스탯
2017. 11. 28. 23:25
파일 및 참고자료 : http://stat.knu.ac.kr/
http://stat.knu.ac.kr/pg/bbs/board.php?bo_table=ja02&part=1
11_23_Lecture_Note
“R4_1.R”
# Session setup for the R4_1 example: clear the workspace, move to the data
# folder and attach the packages used below.
# NOTE(review): rm(list = ls()) also wipes the caller's objects when this file
# is source()d from an interactive session.
rm(list = ls())
# One-time installation; uncomment any line whose package is missing.
#install.packages("twitteR")
#install.packages("plyr")
#install.packages("stringr")
#install.packages("ggplot2")
# The lexicon files read later with scan() are resolved relative to this folder.
# NOTE(review): the last path component looks mis-encoded; confirm it matches
# the directory name on disk before running on another machine.
setwd("/home/stat/다운로드/┴ж4└х")
library(twitteR)
# Attached after twitteR, so plyr's `id` masks twitteR's (see the echoed
# console output below).
library(plyr)
##
## Attaching package: 'plyr'
## The following object is masked from 'package:twitteR':
##
## id
library(stringr)
library(ggplot2)
# Score sentences by counting hits against positive / negative word lists.
#
# Parameters
#   sentences: vector of text to score
#   pos.words: vector of words of positive sentiment
#   neg.words: vector of words of negative sentiment
#
# Returns a data frame with one row per element of `sentences`:
#   text  - the sentence exactly as it was passed in
#   score - (number of words found in pos.words) -
#           (number of words found in neg.words)
#
# Punctuation, control characters and digits are deleted from the sentence
# before matching, so a lexicon entry that itself contains such characters
# cannot be matched by this procedure.
score.sentiment <- function(sentences, pos.words, neg.words) {
  # Lower-case one string, yielding NA instead of aborting the whole run when
  # tolower() signals an error. tolower() is evaluated only once per string.
  safe_tolower <- function(x) {
    tryCatch(tolower(x), error = function(e) NA_character_)
  }

  # Score a single sentence.
  score_one <- function(sentence, pos.words, neg.words) {
    # remove punctuation, control characters and digits (in that order)
    cleaned <- gsub("[[:punct:]]", "", sentence)
    cleaned <- gsub("[[:cntrl:]]", "", cleaned)
    cleaned <- gsub("\\d+", "", cleaned)

    # vapply() guarantees an unnamed character vector, unlike sapply()
    lowered <- vapply(cleaned, safe_tolower, character(1), USE.NAMES = FALSE)

    # split into words on runs of whitespace (stringr package)
    words <- unlist(str_split(lowered, "\\s+"))

    # each word occurrence counts once per list it appears in
    sum(words %in% pos.words) - sum(words %in% neg.words)
  }

  # simple array of scores, one per sentence (plyr package)
  scores <- laply(sentences, score_one, pos.words, neg.words)

  # data frame with scores for each sentence
  data.frame(text = sentences, score = scores)
}
# Read the two word lists as character vectors; text following ';' in the
# files is skipped as a comment.
pos.words <- scan("positive-words.txt", what = "character", comment.char = ";")
neg.words <- scan("negative-words.txt", what = "character", comment.char = ";")
#pos.words = c(hu.liu.pos)
#neg.words = c(hu.liu.neg)

# Five hand-written test sentences.
sample <- c(
  "You're awesome and I love you",
  "I hate and hate and hate. So angry. Die!",
  "Impressed and amazed: you are peerless in your achievement of unparalleled mediocrity",
  "Oh how I love being ignored",
  "Absolutely adore it when my bus is late."
)

# Score every sentence and show the resulting data frame.
result <- score.sentiment(sample, pos.words, neg.words)
result
text <fctr> | score <int> |
---|---|
You're awesome and I love you | 2 |
I hate and hate and hate. So angry. Die! | -5 |
Impressed and amazed: you are peerless in your achievement of unparalleled mediocrity | 4 |
Oh how I love being ignored | 1 |
Absolutely adore it when my bus is late. | 1 |
# Distribution of the sentence scores, first with base graphics ...
hist(result$score)
# ... then with ggplot2: 10 bins, black-and-white theme.
qplot(result$score, bins = 10) + theme_bw()
“R4_2.R”
# Session setup for the R4_2 example: clear the workspace, move to the data
# folder and attach the packages used below.
# NOTE(review): rm(list = ls()) also wipes the caller's objects when this file
# is source()d from an interactive session.
rm(list = ls())
# The lexicon files read later with scan() are resolved relative to this folder.
# NOTE(review): the last path component looks mis-encoded; confirm it matches
# the directory name on disk before running on another machine.
setwd("/home/stat/다운로드/┴ж4└х")
library(twitteR)
library(plyr)
library(stringr)
library(ggplot2)
# Score one sentence against the sentiment word lists.
#
# Parameters
#   sentence:  text to score
#   pos.words: vector of words of positive sentiment
#   neg.words: vector of words of negative sentiment
#
# Returns a single integer:
#   (number of words found in pos.words) - (number of words found in neg.words)
#
# Punctuation, control characters and digits are deleted from the sentence
# before matching, so a lexicon entry that itself contains such characters
# cannot be matched by this procedure.
FUN <- function(sentence, pos.words, neg.words) {
  # remove punctuation, control characters and digits (in that order)
  cleaned <- gsub("[[:punct:]]", "", sentence)
  cleaned <- gsub("[[:cntrl:]]", "", cleaned)
  cleaned <- gsub("\\d+", "", cleaned)

  # Lower-case each string once; a string on which tolower() signals an error
  # becomes NA instead of aborting the run. vapply() guarantees an unnamed
  # character vector, unlike sapply().
  lowered <- vapply(
    cleaned,
    function(x) tryCatch(tolower(x), error = function(e) NA_character_),
    character(1),
    USE.NAMES = FALSE
  )

  # split into words on runs of whitespace (stringr package)
  words <- unlist(str_split(lowered, "\\s+"))

  # each word occurrence counts once per list it appears in
  sum(words %in% pos.words) - sum(words %in% neg.words)
}
# Apply FUN to every sentence and tabulate the results.
#
# Parameters
#   sentences: vector of text to score
#   pos.words: vector of words of positive sentiment
#   neg.words: vector of words of negative sentiment
#
# Returns a data frame with columns `text` (the input sentences) and `score`
# (the value FUN returned for each sentence).
score.sentiment <- function(sentences, pos.words, neg.words) {
  data.frame(
    text = sentences,
    # laply (plyr) calls FUN once per sentence and collects the scores
    score = laply(sentences, FUN, pos.words, neg.words)
  )
}
# Read the two word lists as character vectors; text following ';' in the
# files is skipped as a comment.
pos.words <- scan("positive-words.txt", what = "character", comment.char = ";")
neg.words <- scan("negative-words.txt", what = "character", comment.char = ";")
#pos.words = c(hu.liu.pos)
#neg.words = c(hu.liu.neg)

# Five hand-written test sentences.
sample <- c(
  "You're awesome and I love you",
  "I hate and hate and hate. So angry. Die!",
  "Impressed and amazed: you are peerless in your achievement of unparalleled mediocrity",
  "Oh how I love being ignored",
  "Absolutely adore it when my bus is late."
)

# Score every sentence and show the resulting data frame.
result <- score.sentiment(sample, pos.words, neg.words)
result
text <fctr> | score <int> |
---|---|
You're awesome and I love you | 2 |
I hate and hate and hate. So angry. Die! | -5 |
Impressed and amazed: you are peerless in your achievement of unparalleled mediocrity | 4 |
Oh how I love being ignored | 1 |
Absolutely adore it when my bus is late. | 1 |
# Distribution of the sentence scores, first with base graphics ...
hist(result$score)
# ... then with ggplot2: 10 bins, black-and-white theme.
qplot(result$score, bins = 10) + theme_bw()
ver.hadoop
# Reset the workspace and switch to the folder that holds the word lists.
rm(list = ls())
setwd("/home/stat/다운로드/┴ж4└х")

# Read the two word lists as character vectors; text following ';' in the
# files is skipped as a comment.
pos.words <- scan("positive-words.txt", what = "character", comment.char = ";")
neg.words <- scan("negative-words.txt", what = "character", comment.char = ";")

# Peek at the first entries of each list (echoed console output kept below).
head(pos.words)
## [1] "a+" "abound" "abounds" "abundance" "abundant"
## [6] "accessable"
head(neg.words)
## [1] "2-faced" "2-faces" "abnormal" "abolish" "abominable"
## [6] "abominably"

# Five hand-written test sentences.
sample <- c(
  "You're awesome and I love you",
  "I hate and hate and hate. So angry. Die!",
  "Impressed and amazed: you are peerless in your achievement of unparalleled mediocrity",
  "Oh how I love being ignored",
  "Absolutely adore it when my bus is late."
)
# Attach the packages used by the Hadoop version of the example.
library(twitteR)
library(plyr)
library(stringr)
library(ggplot2)
# Loading rhdfs prints a reminder to call hdfs.init() (echoed below); the call
# follows immediately.
library(rhdfs)
## Loading required package: rJava
##
## HADOOP_CMD=/home/stat/hadoop/hadoop-2.7.4/bin/hadoop
##
## Be sure to run hdfs.init()
hdfs.init()
# Presumably the source of keyval(), to.dfs(), mapreduce() and from.dfs() used
# further down; the warnings echoed below were emitted while loading it.
library(rmr2)
## Warning: S3 methods 'gorder.default', 'gorder.factor', 'gorder.data.frame',
## 'gorder.matrix', 'gorder.raw' were declared in NAMESPACE but not found
## Please review your hadoop settings. See help(hadoop.settings)
IDEA
병렬처리를 위해서 줄별로 나눈 후 각 줄별로 따로 처리한다.
Defining FUN
# Score one sentence against the sentiment word lists.
#
# Parameters
#   sentence:  text to score
#   pos.words: vector of words of positive sentiment
#   neg.words: vector of words of negative sentiment
#
# Returns a single integer:
#   (number of words found in pos.words) - (number of words found in neg.words)
#
# Punctuation, control characters and digits are deleted from the sentence
# before matching, so a lexicon entry that itself contains such characters
# cannot be matched by this procedure.
FUN <- function(sentence, pos.words, neg.words) {
  # remove punctuation, control characters and digits (in that order)
  cleaned <- gsub("[[:punct:]]", "", sentence)
  cleaned <- gsub("[[:cntrl:]]", "", cleaned)
  cleaned <- gsub("\\d+", "", cleaned)

  # Lower-case each string once; a string on which tolower() signals an error
  # becomes NA instead of aborting the run. vapply() guarantees an unnamed
  # character vector, unlike sapply().
  lowered <- vapply(
    cleaned,
    function(x) tryCatch(tolower(x), error = function(e) NA_character_),
    character(1),
    USE.NAMES = FALSE
  )

  # split into words on runs of whitespace (stringr package)
  words <- unlist(str_split(lowered, "\\s+"))

  # each word occurrence counts once per list it appears in
  sum(words %in% pos.words) - sum(words %in% neg.words)
}
MAPPER
# Map step: the incoming key `k` is ignored; every input value (a sentence)
# is emitted as an output key paired with an empty-string placeholder value.
mapper <- function(k, v) {
  keyval(v, "")
}
REDUCER
# Reduce step: score the sentence that arrives as key `k`.
#
# k: the sentence text, emitted as the key by the map step
# v: the values grouped under that key; only their count is used
#
# The sentence is the grouping key, so identical input sentences reach this
# function together in a single call. Emitting one pair per grouped value
# keeps one output row per input sentence, as the non-Hadoop score.sentiment
# does; otherwise repeated sentences would be counted once in the score
# distribution.
# pos.words / neg.words are taken from the enclosing (global) environment.
reducer <- function(k, v) {
  sen_score <- FUN(k, pos.words = pos.words, neg.words = neg.words)
  n_occurrences <- length(v)
  keyval(rep(k, n_occurrences), rep(sen_score, n_occurrences))
}
MAPREDUCE
# Push the sample sentences to the distributed file system.
hadoop_sample <- to.dfs(sample)

# Run the job with the mapper / reducer defined above.
hadoop_result <- mapreduce(
  input = hadoop_sample,
  map = mapper,
  reduce = reducer
)

# Pull the key/value output back into the R session and show it as a table.
# `result` itself is left as returned by from.dfs(); only the printed copy is
# converted to a data frame.
result <- from.dfs(hadoop_result)
as.data.frame(result)
key <fctr> | val <int> |
---|---|
Oh how I love being ignored | 1 |
You're awesome and I love you | 2 |
Absolutely adore it when my bus is late. | 1 |
I hate and hate and hate. So angry. Die! | -5 |
Impressed and amazed: you are peerless in your achievement of unparalleled mediocrity | 4 |
# Distribution of the scores returned by the job, first with base graphics ...
hist(result$val)
# ... then with ggplot2: 10 bins, black-and-white theme.
qplot(result$val, bins = 10) + theme_bw()