티스토리 뷰

2017/09/28 LECTURE NOTE

Objective: Understanding the “REDUCE” step

# Load the RHadoop packages: rhdfs (HDFS access via rJava) and rmr2 (mapreduce).
# NOTE(review): library() is normally preferred over require() for hard
# dependencies, but the captured console output below matches require().
require(rhdfs)
## Loading required package: rhdfs
## Loading required package: rJava
## 
## HADOOP_CMD=/home/stat/hadoop/hadoop-2.7.4/bin/hadoop
## 
## Be sure to run hdfs.init()
# Initialize the HDFS connection; must be called after loading rhdfs.
hdfs.init()
require(rmr2)
## Loading required package: rmr2
## Please review your hadoop settings. See help(hadoop.settings)

Hadoop Example 3

Generate 100 random samples from N(0,1) and count how many are greater than 0 and how many are less than 0.

# Write 100 N(0,1) draws to HDFS as the mapreduce input.
small.ints<-to.dfs(rnorm(100))

# Map each draw to the key "less_than_0" or "greater_than_0" with count 1;
# the reducer sums the counts per key. A draw of exactly 0 (probability 0
# for a continuous distribution) would be labeled "greater_than_0".
Freq<-mapreduce(
  input=small.ints,
  map=function(k,v) keyval(ifelse(v<0,"less_than_0","greater_than_0"),1),
  reduce=function(k,v) keyval(k,sum(v))
)

# Pull the key/value result back from HDFS into local memory.
out<-from.dfs(Freq)
out
## $key
## [1] "less_than_0"    "greater_than_0"
## 
## $val
## [1] 51 49
# Build a named count vector (names = keys) for plotting.
x<-out$val;names(x)<-out$key
barplot(x)

Exercises

  1. Generate 1000 random samples from B(5,0.3) and count the elements
  2. Generate 100 random samples from Geom(0.8) and count the elements
  3. Generate 1000 random samples from N(0,1), classify them into the positives and negatives, and calculate the mean of each group.

Example 4

Generate 10 random samples from B(5,0.9) and find the minimum value. Repeat this 1000 times and tabulate how often each minimum value occurs.

# Write the replication indices 1..1000 to HDFS; each index drives one
# run of the simulation in map1.
s<-to.dfs(1:1000)
# map1: for each input index, draw 10 samples from Binomial(5, 0.9) and
# emit (index, minimum of the 10 draws).
map1<-function(k,v){
  # vapply instead of sapply: guarantees an integer vector even if a
  # mapper receives an empty chunk (sapply would return list()).
  keyval(v,vapply(v,function(r) min(rbinom(10,5,0.9)),integer(1)))
}
# map2: re-key each simulated minimum so identical minima group together.
map2<-function(k,v){
  keyval(v,1)
}
# reduce2: count how many times each minimum value occurred.
reduce2<-function(k,v){
  keyval(k,sum(v))
}
# Two chained jobs: the inner job simulates the 1000 minima, the outer
# job re-keys them and tallies the frequency of each minimum.
a<-mapreduce(input=mapreduce(input=s,
                             map=map1),
             map=map2,
             reduce=reduce2)
from.dfs(a)
## $key
## [1] 1 2 3 4 5
## 
## $val
## [1]   4  85 493 411   7

or

# Same computation as above, but with the two mapreduce jobs assigned to
# intermediate variables instead of being nested in one expression.
s<-to.dfs(1:1000)
# Job 1: simulate one "minimum of 10 Binomial(5, 0.9) draws" per index.
a<-mapreduce(input=s,
             map=function(k,v){
               keyval(v,sapply(v,function(r) min(rbinom(10,5,0.9))))
             })
# Job 2: re-key by the simulated minimum and count occurrences.
b<-mapreduce(input=a,
             map=function(k,v){
               keyval(v,1)
             },
             reduce=function(k,v) keyval(k,sum(v)))
from.dfs(b)
## $key
## [1] 1 2 3 4 5
## 
## $val
## [1]   4  85 483 422   6

Exercises

  1. Generate 10 random samples from Geom(0.5) and find the maximum value. Repeat this 1000 times and tabulate how often each maximum value occurs.

  2. Generate 10 random samples from Cauchy(0,1) and find the mean. Repeat it 1000 times, classify them into the positives and negatives, and calculate the mean of each group.

2017/09/28 SOLUTIONS
require(rhdfs)
## Loading required package: rhdfs
## Loading required package: rJava
## 
## HADOOP_CMD=/home/stat/hadoop/hadoop-2.7.4/bin/hadoop
## 
## Be sure to run hdfs.init()
hdfs.init()
require(rmr2)
## Loading required package: rmr2
## Please review your hadoop settings. See help(hadoop.settings)

Exercise 3.1

# Exercise 3.1: frequency table of 1000 draws from Binomial(5, 0.3).
small.ints<-to.dfs(rbinom(1000,5,.3))

# Each observed value is its own key; the reducer counts occurrences.
Freq<-mapreduce(
  input=small.ints,
  map=function(k,v) keyval(v,1),
  reduce=function(k,v) keyval(k,sum(v))
)

out<-from.dfs(Freq)
out
## $key
## [1] 0 1 2 3 4 5
## 
## $val
## [1] 198 323 311 140  26   2
# Named count vector (names = observed values) for plotting.
x<-out$val;names(x)<-out$key
barplot(x)

Exercise 3.2

# Exercise 3.2: frequency table of 100 draws from Geometric(0.8).
# Note rgeom counts failures before the first success, so 0 is included.
small.ints<-to.dfs(rgeom(100,.8))

# Each observed value is its own key; the reducer counts occurrences.
Freq<-mapreduce(
  input=small.ints,
  map=function(k,v) keyval(v,1),
  reduce=function(k,v) keyval(k,sum(v))
)

out<-from.dfs(Freq)
out
## $key
## [1] 0 1 2 3
## 
## $val
## [1] 73 23  3  1
# Named count vector (names = observed values) for plotting.
x<-out$val;names(x)<-out$key
barplot(x)

Exercise 3.3

# Exercise 3.3: split 1000 N(0,1) draws by sign and average each group.
small.ints<-to.dfs(rnorm(1000))

# Unlike the counting examples, the value here is the draw itself (not 1),
# so the reducer can take a per-group mean instead of a sum of counts.
Freq<-mapreduce(
  input=small.ints,
  map=function(k,v) keyval(ifelse(v<0,"less_than_0","greater_than_0"),v),
  reduce=function(k,v) keyval(k,mean(v))
)

out<-from.dfs(Freq)
out
## $key
## [1] "less_than_0"    "greater_than_0"
## 
## $val
## [1] -0.7531157  0.7885698
# Named vector of group means for plotting.
x<-out$val;names(x)<-out$key
barplot(x)

Exercise 4.1

# Exercise 4.1: repeat "maximum of 10 Geometric(0.5) draws" 1000 times
# and tabulate the frequency of each maximum.
s<-to.dfs(1:1000)
# map1: one simulated maximum per input index.
map1<-function(k,v){
  # vapply instead of sapply: guarantees an integer vector even if a
  # mapper receives an empty chunk (sapply would return list()).
  keyval(v,vapply(v,function(r) max(rgeom(10,.5)),integer(1)))
}
# map2: re-key each simulated maximum so identical maxima group together.
map2<-function(k,v){
  keyval(v,1)
}
# reduce2: count how many times each maximum value occurred.
reduce2<-function(k,v){
  keyval(k,sum(v))
}
# Inner job simulates the 1000 maxima; outer job tallies their frequencies.
a<-mapreduce(input=mapreduce(input=s,
                             map=map1),
             map=map2,
             reduce=reduce2)
from.dfs(a)
## $key
##  [1]  0  1  2  3  4  5  6  7  8  9 10 11 12 13
## 
## $val
##  [1]   2  51 211 274 197 123  68  31  18  11   9   2   1   2

or

# Same computation, written as two separate jobs instead of nested calls.
s<-to.dfs(1:1000)
# Job 1: simulate one "maximum of 10 Geometric(0.5) draws" per index.
a<-mapreduce(input=s,
             map=function(k,v){
              keyval(v,sapply(v,function(r) max(rgeom(10,.5))))
             })
# Job 2: re-key by the simulated maximum and count occurrences.
b<-mapreduce(input=a,
             map=function(k,v){
               keyval(v,1)
             },
             reduce=function(k,v) keyval(k,sum(v)))
from.dfs(b)
## $key
##  [1]  0  1  2  3  4  5  6  7  8  9 10 11 12 13
## 
## $val
##  [1]   1  60 212 264 202 118  69  31  18  11   9   2   1   2

Exercise 4.2

# Exercise 4.2: repeat "mean of 10 Cauchy(0,1) draws" 1000 times, split
# the resulting means by sign, and average each group.
s<-to.dfs(1:1000)
# map1: one simulated sample mean per input index.
map1<-function(k,v){
  # vapply instead of sapply: guarantees a double vector even if a
  # mapper receives an empty chunk (sapply would return list()).
  keyval(v,vapply(v,function(r) mean(rcauchy(10)),numeric(1)))
}
# map2: label each mean by its sign; keep the mean itself as the value.
map2<-function(k,v){
  keyval(ifelse(v<0,"less_than_0","greater_than_0"),v)
}
# reduce2: average the simulated means within each sign group.
reduce2<-function(k,v){
  keyval(k,mean(v))
}
# Inner job simulates the 1000 means; outer job groups by sign and averages.
# The Cauchy distribution has no finite mean, which is why the group means
# below are far from 0 even with 1000 replications.
a<-mapreduce(input=mapreduce(input=s,
                             map=map1),
             map=map2,
             reduce=reduce2)
from.dfs(a)
## $key
## [1] "less_than_0"    "greater_than_0"
## 
## $val
## [1] -2.998590  6.106552

or

# Same computation, written as two separate jobs instead of nested calls.
s<-to.dfs(1:1000)
# Job 1: simulate one "mean of 10 Cauchy(0,1) draws" per index.
a<-mapreduce(input=s,
             map=function(k,v){
               keyval(v,sapply(v,function(r) mean(rcauchy(10))))
               })
# Job 2: label each mean by its sign, then average within each group.
b<-mapreduce(input=a,
             map=function(k,v){
               keyval(ifelse(v<0,"less_than_0","greater_than_0"),v)
             },
             reduce=function(k,v) keyval(k,mean(v)))
from.dfs(b)
## $key
## [1] "less_than_0"    "greater_than_0"
## 
## $val
## [1] -2.935838  6.244269


공지사항
최근에 올라온 글
최근에 달린 댓글
Total
Today
Yesterday
링크
TAG
more
«   2025/05   »
1 2 3
4 5 6 7 8 9 10
11 12 13 14 15 16 17
18 19 20 21 22 23 24
25 26 27 28 29 30 31
글 보관함