Rhadoop
RHADOOP - Linear Regression (17/10/26 Lecture Note)
딥스탯
2017. 11. 28. 23:06
10 26 Lecture Note
Choi Youngtae
선형 회귀 Linear Regression
Simple Linear Regression
Multiple Linear Regression
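그럼 이 때의 $\hat{\beta}$은? $\hat{\beta} = (X^\top X)^{-1} X^\top y$ (단, $X$는 $n \times p$ 설계행렬, $y$는 길이 $n$의 반응 벡터)
일반적인 상황에서는 그냥 계산하면 된다.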
그럼 빅데이터에서는 대체 무슨 문제가?
Time Complexity
example
Algorithm
=====
– Input : matrices $A$ ($n \times m$) and $B$ ($m \times p$)
– Let $C$ be a new matrix of the appropriate size ($n \times p$)
– For i from 1 to n:
– For j from 1 to p:
– Let sum = 0
– For k from 1 to m:
– Set sum <- sum + $A_{ik} \times B_{kj}$
– Set $C_{ij}$ <- sum
– Return $C$
=====
Time Complexity : $O(nmp)$
그럼 위의 계산 $X^\top X$는?
$X^\top$는 $p \times n$ 행렬, $X$는 $n \times p$ 행렬이므로 Time Complexity는 $O(np^2)$
빅데이터에서는 $n$도 어마어마하게 크고 $p$도 어마어마하게 크므로 계산 시간이 $np^2$에 비례해 급격히 늘어난다.
해결 방법은? 병렬처리.
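어떻게 이 계산을 병렬처리 할 수 있을까? $X^\top X = \sum_{i=1}^{n} x_i x_i^\top$ ($x_i$는 $X$의 $i$번째 행을 열벡터로 쓴 것)이므로, 행마다 $x_i x_i^\top$를 따로 계산한 뒤 모두 더하면 된다.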
Mapreduce Code
example
# Attach the RHadoop packages. library() stops with an error when a package is
# missing, whereas require() only returns FALSE and lets the script fail later
# with a less obvious message.
library(rhdfs)
## Loading required package: rJava
##
## HADOOP_CMD=/home/stat/hadoop/hadoop-2.7.4/bin/hadoop
##
## Be sure to run hdfs.init()
# Initialise rhdfs, as its start-up message above asks for.
hdfs.init()
library(rmr2)
## Please review your hadoop settings. See help(hadoop.settings)
# Example 1, job 1: emit one cross-product matrix per row of X.
X <- rnorm(10)
dim(X) <- c(5, 2)
X_dfs <- to.dfs(X)
X_mapred <- mapreduce(
  input = X_dfs,
  # Key each row by its own row index. The previous keys, 1:10, were twice as
  # long as the 5-row value, so keyval() recycled the rows and every outer
  # product was emitted twice (keys 6-10 repeated keys 1-5), doubling the sum.
  map = function(k, v) keyval(seq_len(nrow(v)), v),
  reduce = function(k, v) {
    # v holds the row(s) that share key k as a matrix; crossprod(v) is
    # t(v) %*% v, i.e. the sum of x_i x_i^T over those rows.
    keyval(k, list(crossprod(v)))
  }
)
temp <- from.dfs(X_mapred)
# NOTE: the console transcript that follows this chunk in the original note was
# produced with the old 1:10 keys, so it lists 10 entries (each matrix twice).
# With the corrected keys there are 5 entries.
temp
## $key
## [1] 1 2 3 4 5 6 7 8 9 10
##
## $val
## $val[[1]]
## [,1] [,2]
## [1,] 3.563711 -2.138390
## [2,] -2.138390 1.283132
##
## $val[[2]]
## [,1] [,2]
## [1,] 0.05293757 -0.3605653
## [2,] -0.36056530 2.4558616
##
## $val[[3]]
## [,1] [,2]
## [1,] 0.4357421 0.9972138
## [2,] 0.9972138 2.2821651
##
## $val[[4]]
## [,1] [,2]
## [1,] 1.2495918 0.34072940
## [2,] 0.3407294 0.09290756
##
## $val[[5]]
## [,1] [,2]
## [1,] 4.961552 -5.078212
## [2,] -5.078212 5.197616
##
## $val[[6]]
## [,1] [,2]
## [1,] 3.563711 -2.138390
## [2,] -2.138390 1.283132
##
## $val[[7]]
## [,1] [,2]
## [1,] 0.05293757 -0.3605653
## [2,] -0.36056530 2.4558616
##
## $val[[8]]
## [,1] [,2]
## [1,] 0.4357421 0.9972138
## [2,] 0.9972138 2.2821651
##
## $val[[9]]
## [,1] [,2]
## [1,] 1.2495918 0.34072940
## [2,] 0.3407294 0.09290756
##
## $val[[10]]
## [,1] [,2]
## [1,] 4.961552 -5.078212
## [2,] -5.078212 5.197616
# Example 1, job 2: route every per-key matrix from the previous job to the
# single key 1, then add them element-wise into one matrix.
X_mapred2 <- mapreduce(
  input = X_mapred,
  map = function(k, v) {
    keyval(1, v)
  },
  reduce = function(k, v) {
    # v is the list of matrices collected under key k.
    total <- Reduce(`+`, v)
    keyval(k, list(total))
  }
)
# Pull the single key/value pair back into the R session and print it.
XT_X <- from.dfs(X_mapred2)
XT_X
## $key
## [1] 1
##
## $val
## $val[[1]]
## [,1] [,2]
## [1,] 20.52707 -12.47845
## [2,] -12.47845 22.62337
example2
# Example 2: 2000 observations of 10 predictors; y is the row sum of X plus
# standard normal noise.
X <- matrix(rnorm(20000), nrow = 2000, ncol = 10)
y <- rowSums(X) + rnorm(2000)
X_dfs <- to.dfs(X)
X_mapred <- mapreduce(
  input = X_dfs,
  # Derive the keys from the rows actually handed to this map call instead of
  # hard-coding 1:2000: if the input arrives in smaller pieces, a fixed
  # 2000-long key vector would recycle rows and inflate the result.
  map = function(k, v) keyval(seq_len(nrow(v)), v),
  reduce = function(k, v) {
    # crossprod(v) is t(v) %*% v: the sum of x_i x_i^T over the row(s) that
    # share key k, so it stays correct when several rows land on one key.
    keyval(k, list(crossprod(v)))
  }
)
# Example 2, job 2: gather all per-key matrices under key 1 and add them up.
X_mapred2 <- mapreduce(
  input = X_mapred,
  map = function(k, v) {
    keyval(1, v)
  },
  reduce = function(k, v) {
    summed <- Reduce(`+`, v)
    keyval(k, list(summed))
  }
)
XT_X <- from.dfs(X_mapred2)
# Solve (X^T X) beta = X^T y; the left-hand matrix comes from the Hadoop jobs,
# the right-hand side is computed locally.
gram_matrix <- XT_X$val[[1]]
rhs <- t(X) %*% y
hat_beta_hadoop <- solve(gram_matrix, rhs)
hat_beta_hadoop
## [,1]
## [1,] 0.9955351
## [2,] 0.9855769
## [3,] 0.9775013
## [4,] 0.9678488
## [5,] 1.0311705
## [6,] 1.0079072
## [7,] 1.0013685
## [8,] 0.9880594
## [9,] 1.0260438
## [10,] 0.9853899
# Reference fit: ordinary least squares without an intercept via lm().
data_df <- data.frame(y, X)
fit_no_intercept <- lm(y ~ . - 1, data = data_df)
hat_beta_lm <- coef(fit_no_intercept)
hat_beta_lm
## X1 X2 X3 X4 X5 X6 X7
## 0.9955351 0.9855769 0.9775013 0.9678488 1.0311705 1.0079072 1.0013685
## X8 X9 X10
## 0.9880594 1.0260438 0.9853899
# Side-by-side table: the value 1 (recycled to every row), the Hadoop-based
# estimate, and the lm() estimate.
data.frame(
  real_beta = 1,
  hadoop = as.vector(hat_beta_hadoop),
  lm = hat_beta_lm
)
## real_beta hadoop lm
## X1 1 0.9955351 0.9955351
## X2 1 0.9855769 0.9855769
## X3 1 0.9775013 0.9775013
## X4 1 0.9678488 0.9678488
## X5 1 1.0311705 1.0311705
## X6 1 1.0079072 1.0079072
## X7 1 1.0013685 1.0013685
## X8 1 0.9880594 0.9880594
## X9 1 1.0260438 1.0260438
## X10 1 0.9853899 0.9853899
Exercise
마찬가지로 $X^\top y = \sum_{i=1}^{n} x_i y_i$ 또한 병렬처리 할 수 있다.
이를 병렬처리하는 코드를 작성 할 수 있는가?
11 02 Solution
Choi Youngtae
# Attach the RHadoop packages. library() stops with an error when a package is
# missing, whereas require() only returns FALSE and lets the script fail later
# with a less obvious message.
library(rhdfs)
## Loading required package: rJava
##
## HADOOP_CMD=/home/stat/hadoop/hadoop-2.7.4/bin/hadoop
##
## Be sure to run hdfs.init()
# Initialise rhdfs, as its start-up message above asks for.
hdfs.init()
library(rmr2)
## Warning: S3 methods 'gorder.default', 'gorder.factor', 'gorder.data.frame',
## 'gorder.matrix', 'gorder.raw' were declared in NAMESPACE but not found
## Please review your hadoop settings. See help(hadoop.settings)
# Simulated data: 2000 observations of 10 predictors; y is the row sum of X
# plus standard normal noise.
X <- matrix(rnorm(20000), nrow = 2000, ncol = 10)
y <- rowSums(X) + rnorm(2000)
Exercise
마찬가지로 $X^\top y = \sum_{i=1}^{n} x_i y_i$ 또한 병렬처리 할 수 있다.
이를 병렬처리하는 코드를 작성 할 수 있는가?
# Solution, job 1: for each observation emit y_i * x_i (a vector with one
# entry per predictor). y is bound in as the first column so that each row
# carries its own response value.
yX_dfs <- to.dfs(cbind(y, X))
yX_mapred <- mapreduce(
  input = yX_dfs,
  # Derive the keys from the rows actually handed to this map call instead of
  # hard-coding 1:2000: if the input arrives in smaller pieces, a fixed
  # 2000-long key vector would recycle rows and inflate the result.
  map = function(k, v) keyval(seq_len(nrow(v)), v),
  reduce = function(k, v) {
    # Column 1 is y, the remaining columns are the predictors. Index by column
    # (not by linear position, as v[1] * v[-1] did) so the result is also
    # correct when more than one row shares key k: crossprod() then returns
    # the sum of y_i * x_i over those rows.
    predictors <- v[, -1, drop = FALSE]
    response <- v[, 1]
    keyval(k, list(as.vector(crossprod(predictors, response))))
  }
)
# Solution, job 2: gather every per-key vector under key 1 and add them
# element-wise.
yX_mapred2 <- mapreduce(
  input = yX_mapred,
  map = function(k, v) {
    keyval(1, v)
  },
  reduce = function(k, v) {
    # v is the list of vectors collected under key k.
    column_totals <- Reduce(`+`, v)
    keyval(k, column_totals)
  }
)
XT_y <- from.dfs(yX_mapred2)
# Compare against the same product computed directly in the R session.
direct <- t(X) %*% y
data.frame(original = direct, hadoop = XT_y$val)
## original hadoop
## 1 2057.901 2057.901
## 2 2076.298 2076.298
## 3 1990.872 1990.872
## 4 2016.655 2016.655
## 5 1865.059 1865.059
## 6 2069.732 2069.732
## 7 2014.935 2014.935
## 8 1929.780 1929.780
## 9 1841.150 1841.150
## 10 2163.508 2163.508