Rhadoop

RHADOOP - SENTIMENT ANALYSIS (17/11/23 Lecture Note)

딥스탯 2017. 11. 28. 23:25

11_23_Lecture_Note

“R4_1.R”

# Start from a clean workspace: removes every object in the global environment.
rm(list = ls())              

# One-time package installation; uncomment on a machine that lacks them.
#install.packages("twitteR")
#install.packages("plyr")
#install.packages("stringr")
#install.packages("ggplot2")

# Directory holding positive-words.txt / negative-words.txt, which are read
# by relative path further down.
# NOTE(review): the last path component looks mis-encoded; confirm it matches
# the directory name as it actually exists on disk.
setwd("/home/stat/다운로드/┴ж4└х")
library(twitteR)
library(plyr)

## 
## Attaching package: 'plyr'

## The following object is masked from 'package:twitteR':
## 
##     id

library(stringr)
library(ggplot2)

score.sentiment <- function(sentences, pos.words, neg.words) {
  # Score each sentence by counting lexicon hits.
  #
  # Parameters
  #   sentences: vector of text to score (one element per sentence)
  #   pos.words: character vector of words of positive sentiment
  #   neg.words: character vector of words of negative sentiment
  #
  # Returns
  #   data.frame with one row per input sentence and two columns:
  #     text  - the sentence as supplied
  #     score - integer, (number of tokens found in pos.words) minus
  #             (number of tokens found in neg.words); repeated words count
  #             once per occurrence.

  # Lower-case a string, yielding NA instead of aborting the whole run if
  # tolower() raises an error for that input.
  try_tolower <- function(x) {
    tryCatch(tolower(x), error = function(e) NA_character_)
  }

  score_one <- function(sentence) {
    # remove punctuation
    cleaned <- gsub("[[:punct:]]", "", sentence)
    # Control characters include tab and newline. Deleting them outright
    # would glue neighbouring words together ("good\tgreat" -> "goodgreat"),
    # so they are turned into a space and stay word separators.
    cleaned <- gsub("[[:cntrl:]]", " ", cleaned)
    # remove digits
    cleaned <- gsub("\\d+", "", cleaned)

    lowered <- vapply(cleaned, try_tolower, character(1), USE.NAMES = FALSE)

    # split on runs of whitespace; empty tokens are harmless because they
    # never appear in either lexicon
    words <- unlist(strsplit(lowered, "\\s+"))

    # %in% yields TRUE/FALSE per token (never NA), so sum() counts the hits
    sum(words %in% pos.words) - sum(words %in% neg.words)
  }

  # USE.NAMES = FALSE keeps the sentences from becoming row names of the
  # resulting data frame.
  scores <- vapply(sentences, score_one, integer(1), USE.NAMES = FALSE)

  # data frame with scores for each sentence
  data.frame(text = sentences, score = scores)
}

# Load the opinion lexicons as character vectors of whitespace-separated
# words; anything after a ';' on a line is treated as a comment and ignored.
pos.words <- scan("positive-words.txt", what = "character", comment.char = ";")
neg.words <- scan("negative-words.txt", what = "character", comment.char = ";")
# Alternative lexicon source, left disabled:
#pos.words = c(hu.liu.pos)
#neg.words = c(hu.liu.neg)

# Five hand-written test sentences. Note that this name shadows
# base::sample() in the workspace.
sample <- c(
  "You're awesome and I love you",
  "I hate and hate and hate. So angry. Die!",
  "Impressed and amazed: you are peerless in your achievement of unparalleled mediocrity",
  "Oh how I love being ignored",
  "Absolutely adore it when my bus is late."
)

# Score every sentence against the two lexicons.
result <- score.sentiment(sample, pos.words, neg.words)

# Show the sentence/score table.
result

## text <fctr>	score <int>
## You're awesome and I love you	2
## I hate and hate and hate. So angry. Die!	-5
## Impressed and amazed: you are peerless in your achievement of unparalleled mediocrity	4
## Oh how I love being ignored	1
## Absolutely adore it when my bus is late.	1

# Base-graphics histogram of the sentence scores. The argument expression is
# left exactly as written because hist() builds its default title from it.
hist(result$score)

# Same distribution drawn with ggplot2's qplot: 10 bins, black-and-white theme.
qplot(result$score,bins=10)+theme_bw()

“R4_2.R”

# Start from a clean workspace: removes every object in the global environment.
rm(list = ls())              

# Directory holding the lexicon files that are read by relative path below.
# NOTE(review): the last path component looks mis-encoded; confirm it matches
# the directory name as it actually exists on disk.
setwd("/home/stat/다운로드/┴ж4└х")
library(twitteR)
library(plyr)
library(stringr)
library(ggplot2)

FUN <- function(sentence, pos.words, neg.words) {
  # Sentiment score of one piece of text.
  #
  # Parameters
  #   sentence:  text to score (if several strings are given, their words
  #              are pooled into one score)
  #   pos.words: character vector of words of positive sentiment
  #   neg.words: character vector of words of negative sentiment
  #
  # Returns
  #   a single integer: (number of tokens found in pos.words) minus
  #   (number of tokens found in neg.words); repeated words count once per
  #   occurrence.

  # remove punctuation
  cleaned <- gsub("[[:punct:]]", "", sentence)
  # Control characters include tab and newline. Deleting them outright would
  # glue neighbouring words together ("good\tgreat" -> "goodgreat"), so they
  # are turned into a space and stay word separators.
  cleaned <- gsub("[[:cntrl:]]", " ", cleaned)
  # remove digits
  cleaned <- gsub("\\d+", "", cleaned)

  # Lower-case a string, yielding NA instead of aborting if tolower() raises
  # an error for that input; tolower() is evaluated only once.
  try_tolower <- function(x) {
    tryCatch(tolower(x), error = function(e) NA_character_)
  }
  lowered <- vapply(cleaned, try_tolower, character(1), USE.NAMES = FALSE)

  # split on runs of whitespace; empty tokens are harmless because they never
  # appear in either lexicon
  words <- unlist(strsplit(lowered, "\\s+"))

  # %in% yields TRUE/FALSE per token (never NA), so sum() counts the hits
  sum(words %in% pos.words) - sum(words %in% neg.words)
}

score.sentiment <- function(sentences, pos.words, neg.words) {
  # Score a batch of sentences with the stand-alone scorer FUN.
  #
  # Parameters
  #   sentences: vector of text to score
  #   pos.words: vector of words of positive sentiment
  #   neg.words: vector of words of negative sentiment
  #
  # Returns
  #   data.frame with columns `text` (the input sentences) and `score`
  #   (the value FUN produced for each sentence).

  # laply calls FUN once per element of `sentences`, forwarding both
  # lexicons, and collects the results into a simple array.
  sentence_scores <- laply(sentences, FUN, pos.words, neg.words)

  data.frame(text = sentences, score = sentence_scores)
}
# Load the opinion lexicons as character vectors of whitespace-separated
# words; anything after a ';' on a line is treated as a comment and ignored.
pos.words <- scan("positive-words.txt", what = "character", comment.char = ";")
neg.words <- scan("negative-words.txt", what = "character", comment.char = ";")
# Alternative lexicon source, left disabled:
#pos.words = c(hu.liu.pos)
#neg.words = c(hu.liu.neg)

# Five hand-written test sentences. Note that this name shadows
# base::sample() in the workspace.
sample <- c(
  "You're awesome and I love you",
  "I hate and hate and hate. So angry. Die!",
  "Impressed and amazed: you are peerless in your achievement of unparalleled mediocrity",
  "Oh how I love being ignored",
  "Absolutely adore it when my bus is late."
)

# Score every sentence against the two lexicons.
result <- score.sentiment(sample, pos.words, neg.words)

# Show the sentence/score table.
result

## text <fctr>	score <int>
## You're awesome and I love you	2
## I hate and hate and hate. So angry. Die!	-5
## Impressed and amazed: you are peerless in your achievement of unparalleled mediocrity	4
## Oh how I love being ignored	1
## Absolutely adore it when my bus is late.	1

# Base-graphics histogram of the sentence scores. The argument expression is
# left exactly as written because hist() builds its default title from it.
hist(result$score)

# Same distribution drawn with ggplot2's qplot: 10 bins, black-and-white theme.
qplot(result$score,bins=10)+theme_bw()

ver.hadoop

# Start from a clean workspace: removes every object in the global environment.
rm(list = ls())

# Directory holding the lexicon files that are read by relative path below.
setwd("/home/stat/다운로드/┴ж4└х")

# Load the opinion lexicons as character vectors of whitespace-separated
# words; anything after a ';' on a line is treated as a comment and ignored.
pos.words <- scan("positive-words.txt", what = "character", comment.char = ";")
neg.words <- scan("negative-words.txt", what = "character", comment.char = ";")

# Peek at the first entries of the positive lexicon.
head(pos.words)

## [1] "a+"         "abound"     "abounds"    "abundance"  "abundant"  
## [6] "accessable"

# Peek at the first entries of the negative lexicon.
head(neg.words)

## [1] "2-faced"    "2-faces"    "abnormal"   "abolish"    "abominable"
## [6] "abominably"

# Five hand-written test sentences. Note that this name shadows
# base::sample() in the workspace.
sample <- c(
  "You're awesome and I love you",
  "I hate and hate and hate. So angry. Die!",
  "Impressed and amazed: you are peerless in your achievement of unparalleled mediocrity",
  "Oh how I love being ignored",
  "Absolutely adore it when my bus is late."
)

library(twitteR)
library(plyr)
library(stringr)
library(ggplot2)

library(rhdfs)

## Loading required package: rJava

## 
## HADOOP_CMD=/home/stat/hadoop/hadoop-2.7.4/bin/hadoop

## 
## Be sure to run hdfs.init()

# Required after loading rhdfs; its startup message above asks for this call.
hdfs.init()
# Loaded last; presumably the source of keyval / to.dfs / mapreduce / from.dfs
# used below (rhdfs is the only other Hadoop package attached) - verify.
library(rmr2)

## Warning: S3 methods 'gorder.default', 'gorder.factor', 'gorder.data.frame',
## 'gorder.matrix', 'gorder.raw' were declared in NAMESPACE but not found

## Please review your hadoop settings. See help(hadoop.settings)

IDEA

병렬처리를 위해서 줄별로 나눈 후 각 줄별로 따로 처리한다.

Defining FUN

FUN <- function(sentence, pos.words, neg.words) {
  # Sentiment score of one piece of text.
  #
  # Parameters
  #   sentence:  text to score (if several strings are given, their words
  #              are pooled into one score)
  #   pos.words: character vector of words of positive sentiment
  #   neg.words: character vector of words of negative sentiment
  #
  # Returns
  #   a single integer: (number of tokens found in pos.words) minus
  #   (number of tokens found in neg.words); repeated words count once per
  #   occurrence.
  #
  # Only base R functions are used, so the function carries no package
  # dependency of its own.

  # remove punctuation
  cleaned <- gsub("[[:punct:]]", "", sentence)
  # Control characters include tab and newline. Deleting them outright would
  # glue neighbouring words together ("good\tgreat" -> "goodgreat"), so they
  # are turned into a space and stay word separators.
  cleaned <- gsub("[[:cntrl:]]", " ", cleaned)
  # remove digits
  cleaned <- gsub("\\d+", "", cleaned)

  # Lower-case a string, yielding NA instead of aborting if tolower() raises
  # an error for that input; tolower() is evaluated only once.
  try_tolower <- function(x) {
    tryCatch(tolower(x), error = function(e) NA_character_)
  }
  lowered <- vapply(cleaned, try_tolower, character(1), USE.NAMES = FALSE)

  # split on runs of whitespace; empty tokens are harmless because they never
  # appear in either lexicon
  words <- unlist(strsplit(lowered, "\\s+"))

  # %in% yields TRUE/FALSE per token (never NA), so sum() counts the hits
  sum(words %in% pos.words) - sum(words %in% neg.words)
}

MAPPER

mapper <- function(k, v) {
  # Map step: the incoming key `k` is ignored. Each input value (a sentence)
  # is re-emitted as the output key, paired with an empty-string placeholder.
  keyval(v, "")
}

REDUCER

reducer <- function(k, v) {
  # Reduce step: the key `k` is the sentence text; the placeholder values in
  # `v` are not used. The sentence is scored with FUN and emitted as a
  # (sentence, score) pair.
  #
  # pos.words / neg.words are not parameters; they are looked up in the
  # environment this function was defined in.
  #
  # NOTE(review): presumably identical sentences arrive under a single key,
  # so duplicated input lines would collapse into one output row - confirm
  # whether that is intended.
  keyval(k, FUN(k, pos.words = pos.words, neg.words = neg.words))
}

MAPREDUCE

# Hand the sample sentences to to.dfs() so they can serve as job input.
hadoop_sample <- to.dfs(sample)

# Run the job: mapper turns each sentence into a key, reducer scores it.
hadoop_result <- mapreduce(input = hadoop_sample, map = mapper, reduce = reducer)

# Fetch the job output back into the R session.
result <- from.dfs(hadoop_result)

# Display the key/value pairs as a table.
as.data.frame(result)

## key <fctr>	val <int>
## Oh how I love being ignored	1
## You're awesome and I love you	2
## Absolutely adore it when my bus is late.	1
## I hate and hate and hate. So angry. Die!	-5
## Impressed and amazed: you are peerless in your achievement of unparalleled mediocrity	4

# Base-graphics histogram of the scores returned by the job. The argument
# expression is left exactly as written because hist() builds its default
# title from it.
hist(result$val)

# Same distribution drawn with ggplot2's qplot: 10 bins, black-and-white theme.
qplot(result$val,bins=10)+theme_bw()

저작자표시 비영리 변경금지

티스토리

RHADOOP - SENTIMENT ANALYSIS (17/11/23 Lecture Note)

RHADOOP - SENTIMENT ANALYSIS (17/11/23 Lecture Note)

파일 및 참고자료 : http://stat.knu.ac.kr/

http://stat.knu.ac.kr/pg/bbs/board.php?bo_table=ja02&part=1

11_23_Lecture_Note

“R4_1.R”

“R4_2.R”

ver.hadoop

IDEA

Defining FUN

MAPPER

REDUCER

MAPREDUCE