Rhadoop

RHADOOP - SENTIMENT ANALYSIS (17/11/23 Lecture Note)

딥스탯 2017. 11. 28. 23:25
11_23_Lecture_Note

“R4_1.R”

rm(list = ls())              

#install.packages("twitteR")
#install.packages("plyr")
#install.packages("stringr")
#install.packages("ggplot2")

setwd("/home/stat/다운로드/┴ж4└х")
library(twitteR)
library(plyr)
## 
## Attaching package: 'plyr'
## The following object is masked from 'package:twitteR':
## 
##     id
library(stringr)
library(ggplot2)

# Score each sentence by counting hits against two word lists.
#
# Parameters
#   sentences: vector of text to score
#   pos.words: vector of words of positive sentiment
#   neg.words: vector of words of negative sentiment
#
# Returns
#   data.frame with one row per element of `sentences`:
#     text  - the sentences as passed in
#     score - integer: (# tokens found in pos.words) - (# tokens found in neg.words)
score.sentiment <- function(sentences, pos.words, neg.words) {
  # Lower-case one string; if tolower() signals an error the text becomes NA,
  # which then yields no lexicon hits (unless a lexicon itself contains NA).
  try_tolower <- function(x) {
    tryCatch(tolower(x), error = function(e) NA_character_)
  }

  # Score a single sentence.
  score_one <- function(sentence) {
    # remove punctuation (so "You're" becomes "Youre")
    cleaned <- gsub("[[:punct:]]", "", sentence)
    # control characters include tab and newline, which separate words:
    # replace them with a space instead of deleting them, otherwise
    # "love\nthis" would be fused into the single token "lovethis"
    cleaned <- gsub("[[:cntrl:]]", " ", cleaned)
    # remove digits
    cleaned <- gsub("\\d+", "", cleaned)

    cleaned <- vapply(cleaned, try_tolower, character(1), USE.NAMES = FALSE)

    # split into words on runs of whitespace
    words <- unlist(strsplit(cleaned, "\\s+"))

    # count tokens present in each dictionary; the difference is the score
    sum(words %in% pos.words) - sum(words %in% neg.words)
  }

  # one integer score per sentence (type-stable, also for empty input)
  scores <- vapply(sentences, score_one, integer(1), USE.NAMES = FALSE)

  # data frame with scores for each sentence
  data.frame(text = sentences, score = scores)
}

# Read the positive and negative word lists from files in the working
# directory; scan() ignores everything after a ';' on a line.
# (presumably the Hu & Liu opinion lexicon, given the commented-out
# hu.liu.* lines below — confirm)
pos.words = scan('positive-words.txt', what='character', comment.char=';')
neg.words = scan('negative-words.txt', what='character', comment.char=';')
#pos.words = c(hu.liu.pos)
#neg.words = c(hu.liu.neg)

# Five hand-written test sentences.
# NOTE(review): the variable name `sample` is also the name of base::sample().
sample = c("You're awesome and I love you",
"I hate and hate and hate. So angry. Die!",
"Impressed and amazed: you are peerless in your achievement of unparalleled mediocrity",
"Oh how I love being ignored",
"Absolutely adore it when my bus is late.")

# Score every sentence; the result is a data frame with columns text and score.
result = score.sentiment(sample, pos.words, neg.words)

result
## text <fctr> | score <int>
## You're awesome and I love you | 2
## I hate and hate and hate. So angry. Die! | -5
## Impressed and amazed: you are peerless in your achievement of unparalleled mediocrity | 4
## Oh how I love being ignored | 1
## Absolutely adore it when my bus is late. | 1
# Base-graphics histogram of the sentence scores.
hist(result$score)

# The same scores drawn with qplot() using 10 bins and the black-and-white theme.
qplot(result$score,bins=10)+theme_bw()

“R4_2.R”

rm(list = ls())              

setwd("/home/stat/다운로드/┴ж4└х")
library(twitteR)
library(plyr)
library(stringr)
library(ggplot2)

# Score one piece of text against a positive and a negative word list.
#
# Parameters
#   sentence:  text to score (normally a single string)
#   pos.words: vector of words of positive sentiment
#   neg.words: vector of words of negative sentiment
#
# Returns
#   a single integer: (# tokens found in pos.words) - (# tokens found in neg.words)
FUN <- function(sentence, pos.words, neg.words) {
  # remove punctuation (so "You're" becomes "Youre")
  sentence <- gsub("[[:punct:]]", "", sentence)
  # control characters include tab and newline, which separate words:
  # replace them with a space instead of deleting them, otherwise
  # "love\nthis" would be fused into the single token "lovethis"
  sentence <- gsub("[[:cntrl:]]", " ", sentence)
  # remove digits
  sentence <- gsub("\\d+", "", sentence)

  # Lower-case with error handling: if tolower() signals an error the text
  # becomes NA, which then yields no lexicon hits (unless a lexicon itself
  # contains NA).
  tryTolower <- function(x) {
    tryCatch(tolower(x), error = function(e) NA_character_)
  }
  sentence <- vapply(sentence, tryTolower, character(1), USE.NAMES = FALSE)

  # split into words on runs of whitespace
  words <- unlist(strsplit(sentence, "\\s+"))

  # count tokens present in each dictionary; the difference is the score
  sum(words %in% pos.words) - sum(words %in% neg.words)
}

# Apply FUN to every sentence and collect the results in a data frame.
#
# Parameters
#   sentences: vector of text to score
#   pos.words: vector of words of positive sentiment
#   neg.words: vector of words of negative sentiment
#
# Returns a data frame with columns `text` (the input) and `score`.
score.sentiment <- function(sentences, pos.words, neg.words) {
  # laply() calls FUN once per sentence, forwarding both word lists
  sentence_scores <- laply(sentences, FUN, pos.words, neg.words)

  data.frame(text = sentences, score = sentence_scores)
}
# Read the positive and negative word lists from files in the working
# directory; scan() ignores everything after a ';' on a line.
pos.words = scan('positive-words.txt', what='character', comment.char=';')
neg.words = scan('negative-words.txt', what='character', comment.char=';')
#pos.words = c(hu.liu.pos)
#neg.words = c(hu.liu.neg)

# Five hand-written test sentences.
# NOTE(review): the variable name `sample` is also the name of base::sample().
sample = c("You're awesome and I love you",
"I hate and hate and hate. So angry. Die!",
"Impressed and amazed: you are peerless in your achievement of unparalleled mediocrity",
"Oh how I love being ignored",
"Absolutely adore it when my bus is late.")

# Score every sentence; the result is a data frame with columns text and score.
result = score.sentiment(sample, pos.words, neg.words)

result
## text <fctr> | score <int>
## You're awesome and I love you | 2
## I hate and hate and hate. So angry. Die! | -5
## Impressed and amazed: you are peerless in your achievement of unparalleled mediocrity | 4
## Oh how I love being ignored | 1
## Absolutely adore it when my bus is late. | 1
# Base-graphics histogram of the sentence scores.
hist(result$score)

# The same scores drawn with qplot() using 10 bins and the black-and-white theme.
qplot(result$score,bins=10)+theme_bw()

ver.hadoop

rm(list = ls())              

setwd("/home/stat/다운로드/┴ж4└х")

# Read the positive and negative word lists from files in the working
# directory; scan() ignores everything after a ';' on a line.
pos.words = scan('positive-words.txt', what='character', comment.char=';')
neg.words = scan('negative-words.txt', what='character', comment.char=';')

# Peek at the first entries of each list (console output shown below each call).
head(pos.words)
## [1] "a+"         "abound"     "abounds"    "abundance"  "abundant"  
## [6] "accessable"
head(neg.words)
## [1] "2-faced"    "2-faces"    "abnormal"   "abolish"    "abominable"
## [6] "abominably"
# Five hand-written test sentences, later written to the DFS with to.dfs().
# NOTE(review): the variable name `sample` is also the name of base::sample().
sample = c("You're awesome and I love you",
"I hate and hate and hate. So angry. Die!",
"Impressed and amazed: you are peerless in your achievement of unparalleled mediocrity",
"Oh how I love being ignored",
"Absolutely adore it when my bus is late.")

library(twitteR)
library(plyr)
library(stringr)
library(ggplot2)

library(rhdfs)
## Loading required package: rJava
## 
## HADOOP_CMD=/home/stat/hadoop/hadoop-2.7.4/bin/hadoop
## 
## Be sure to run hdfs.init()
hdfs.init()
library(rmr2)
## Warning: S3 methods 'gorder.default', 'gorder.factor', 'gorder.data.frame',
## 'gorder.matrix', 'gorder.raw' were declared in NAMESPACE but not found
## Please review your hadoop settings. See help(hadoop.settings)

IDEA

병렬처리를 위해서 줄별로 나눈 후 각 줄별로 따로 처리한다.

Defining FUN

# Score one piece of text against a positive and a negative word list.
# Called from the reducer, once per sentence key.
#
# Parameters
#   sentence:  text to score (normally a single string)
#   pos.words: vector of words of positive sentiment
#   neg.words: vector of words of negative sentiment
#
# Returns
#   a single integer: (# tokens found in pos.words) - (# tokens found in neg.words)
#
# Only base R functions are used here, so the function itself does not depend
# on stringr being attached where it runs.
FUN <- function(sentence, pos.words, neg.words) {
  # remove punctuation (so "You're" becomes "Youre")
  sentence <- gsub("[[:punct:]]", "", sentence)
  # control characters include tab and newline, which separate words:
  # replace them with a space instead of deleting them, otherwise
  # "love\nthis" would be fused into the single token "lovethis"
  sentence <- gsub("[[:cntrl:]]", " ", sentence)
  # remove digits
  sentence <- gsub("\\d+", "", sentence)

  # Lower-case with error handling: if tolower() signals an error the text
  # becomes NA, which then yields no lexicon hits (unless a lexicon itself
  # contains NA).
  tryTolower <- function(x) {
    tryCatch(tolower(x), error = function(e) NA_character_)
  }
  sentence <- vapply(sentence, tryTolower, character(1), USE.NAMES = FALSE)

  # split into words on runs of whitespace
  words <- unlist(strsplit(sentence, "\\s+"))

  # count tokens present in each dictionary; the difference is the score
  sum(words %in% pos.words) - sum(words %in% neg.words)
}

MAPPER

# Map step: re-emit the incoming values as keys, each paired with an empty
# string as its value. The incoming key `k` is not used.
# (presumably `v` holds the sentences written by to.dfs() — confirm)
mapper <- function(k, v) {
  keyval(v, "")
}

REDUCER

# Reduce step: score the key with FUN, using the pos.words / neg.words
# objects found in the enclosing environment, and emit (key, score).
# The grouped values `v` are not used.
# NOTE(review): identical sentences share a key, so they are presumably
# scored and emitted only once — confirm this is intended.
reducer <- function(k, v) {
  keyval(k, FUN(k, pos.words = pos.words, neg.words = neg.words))
}

MAPREDUCE

# Write the sample sentences to the DFS and keep the returned handle.
hadoop_sample <- to.dfs(sample)

# Run the job with the mapper and reducer defined above.
hadoop_result <- mapreduce(hadoop_sample,
                           map = mapper,
                           reduce = reducer)
# Read the job output back into R; `result` is used below via result$val.
result <- from.dfs(hadoop_result)
# Show the key/value pairs as a data frame (sentence, score).
as.data.frame(result)
## key <fctr> | val <int>
## Oh how I love being ignored | 1
## You're awesome and I love you | 2
## Absolutely adore it when my bus is late. | 1
## I hate and hate and hate. So angry. Die! | -5
## Impressed and amazed: you are peerless in your achievement of unparalleled mediocrity | 4
# Base-graphics histogram of the scores returned by the mapreduce job.
hist(result$val)

# The same scores drawn with qplot() using 10 bins and the black-and-white theme.
qplot(result$val,bins=10)+theme_bw()