티스토리 뷰
Logistic Regression in RHadoop
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(rhdfs)
## Loading required package: rJava
##
## HADOOP_CMD=/home/stat/hadoop/hadoop-2.7.4/bin/hadoop
##
## Be sure to run hdfs.init()
hdfs.init()
library(rmr2)
## Warning: S3 methods 'gorder.default', 'gorder.factor', 'gorder.data.frame',
## 'gorder.matrix', 'gorder.raw' were declared in NAMESPACE but not found
## Please review your hadoop settings. See help(hadoop.settings)
Logistic Regression in R
#install.packages("catdata")
library(catdata)
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
data(foodstamp)
foodstamp
y <dbl> | TEN <dbl> | SUP <dbl> | INC <dbl> | |
---|---|---|---|---|
0 | 1 | 0 | 271 | |
0 | 1 | 0 | 287 | |
0 | 1 | 1 | 714 | |
0 | 1 | 0 | 521 | |
0 | 0 | 0 | 0 | |
0 | 1 | 0 | 518 | |
0 | 1 | 0 | 458 | |
0 | 1 | 0 | 1266 | |
0 | 0 | 0 | 350 | |
0 | 1 | 0 | 168 |
summary(foodstamp)
## y TEN SUP INC
## Min. :0.00 Min. :0.00 Min. :0.0000 Min. : 0.0
## 1st Qu.:0.00 1st Qu.:0.00 1st Qu.:0.0000 1st Qu.: 264.5
## Median :0.00 Median :1.00 Median :0.0000 Median : 461.0
## Mean :0.16 Mean :0.66 Mean :0.2733 Mean : 701.4
## 3rd Qu.:0.00 3rd Qu.:1.00 3rd Qu.:1.0000 3rd Qu.: 781.5
## Max. :1.00 Max. :1.00 Max. :1.0000 Max. :4533.0
# Work on a copy of the data and standardise the fourth column (INC) so the
# income values are on a scale comparable to the 0/1 predictors.
temp_foodstamp <- foodstamp
temp_foodstamp[, 4] <- scale(foodstamp[, 4])

# Reference fit with base R's glm(); its estimates serve as the target for the
# MapReduce implementation.
summary(
  glm(
    y ~ .,
    data = temp_foodstamp,
    family = binomial(link = "logit")
  )
)
##
## Call:
## glm(formula = y ~ ., family = binomial(link = "logit"), data = temp_foodstamp)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2376 -0.5564 -0.3464 -0.1545 2.7955
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.3844 0.4676 -2.960 0.00307 **
## TEN -1.7603 0.5292 -3.326 0.00088 ***
## SUP 0.7752 0.5066 1.530 0.12591
## INC -1.0715 0.6745 -1.588 0.11218
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 131.90 on 149 degrees of freedom
## Residual deviance: 104.33 on 146 degrees of freedom
## AIC: 112.33
##
## Number of Fisher Scoring iterations: 6
Logistic Regression in RHadoop
Optimizer: gradient descent (the original "glm" optimizer is Fisher scoring)
Defining g, the logistic (sigmoid) function
# Logistic (sigmoid) function: 1 / (1 + e^(-x)), applied element-wise.
# Takes a numeric scalar, vector or matrix and returns values between 0 and 1
# in the same shape.
g <- function(x) {
  decay <- exp(-x)
  1 / (1 + decay)
}
MAPPER_1
# Map step 1: tag every row of the incoming chunk with its row index.
#   k: incoming key (not used)
#   v: chunk of the input matrix, one observation per row
# Returns keyval(<row indices>, v), i.e. row i of `v` is paired with key i.
# NOTE(review): the indices restart at 1 on every call, so keys are only unique
# within one chunk; if the input is ever split across several map calls, rows
# from different chunks would share a key - confirm against the input size.
logistic_map1 <- function(k, v) {
  # seq_len() gives an empty key vector for a zero-row chunk, whereas
  # 1:nrow(v) would give c(1, 0).
  keyval(seq_len(nrow(v)), v)
}
REDUCE_1
# Reduce step 1: gradient contribution of the observation(s) stored under one key.
#   k: key received from the first map step
#   v: value(s) for that key; each row is laid out as (y, x_1, ..., x_p)
# Reads the current coefficients from the free variable `plane` and uses the
# sigmoid `g`. Returns keyval(k, <1 x p matrix>) holding
#   sum over rows of (g(x . plane) - y) * x
logistic_red1 <- function(k, v) {
  coefs <- as.vector(plane)
  # Rebuild a rows-by-(p + 1) matrix so that a bare vector (one observation)
  # and a multi-row matrix (several observations sharing a key) both work.
  obs <- matrix(v, ncol = length(coefs) + 1L)
  y <- obs[, 1L]
  x <- obs[, -1L, drop = FALSE]
  # Flatten the one-column product to a plain vector: arithmetic between a
  # 1 x 1 matrix and a longer vector is deprecated in R.
  prob <- g(as.vector(x %*% coefs))
  # crossprod(x, r) is t(x) %*% r, i.e. the residual-weighted sum of the rows.
  keyval(k, t(crossprod(x, prob - y)))
}
MAPPER_2
# Map step 2: re-key every incoming value with the same constant key so that
# all of them end up in one group.
#   k: incoming key (not used)
#   v: values produced by the first reduce step
logistic_map2 <- function(k, v) {
  shared_key <- 1
  keyval(shared_key, v)
}
REDUCE_2
# Reduce step 2: add up the values received for a key, column by column.
#   k: key received from the second map step
#   v: matrix with one row per value (a single dimensionless vector is
#      accepted and treated as one row)
# Returns keyval(k, <vector of column sums>).
logistic_red2 <- function(k, v) {
  # colSums() needs two dimensions; a lone row may arrive without any.
  if (is.null(dim(v))) {
    v <- matrix(v, nrow = 1L)
  }
  keyval(k, colSums(v))
}
MAPREDUCE
# Upload the design matrix to HDFS. Columns: response y, a constant intercept
# column, then the remaining predictors of temp_foodstamp.
hadoop_foodstamp <- to.dfs(
  as.matrix(
    data.frame(y = temp_foodstamp[, 1], intercept = 1, temp_foodstamp[, -1])
  )
)

# Coefficients start at zero, stored as a 1 x 4 row matrix. logistic_red1 reads
# `plane` as a free variable, so the name must stay as it is.
plane <- t(rep(0, 4))

# Gradient-descent settings: step size and number of passes over the data.
learning_rate <- 0.05
n_iterations <- 10L

for (i in seq_len(n_iterations)) {
  # Job 1 computes one gradient term per observation; job 2 sums them.
  result <- mapreduce(
    input = hadoop_foodstamp,
    map = logistic_map1,
    reduce = logistic_red1
  ) %>%
    mapreduce(
      map = logistic_map2,
      reduce = logistic_red2
    )

  # Fetch the job output once and reuse it instead of reading it twice.
  temp_result <- from.dfs(result)
  gradient <- values(temp_result)

  # Descent step, then show the updated coefficients for this iteration.
  plane <- plane - learning_rate * gradient
  print(plane)
}
## [,1] [,2] [,3] [,4]
## [1,] -2.55 -2.175 -0.425 -0.5520207
## [,1] [,2] [,3] [,4]
## [1,] -1.589195 -1.916039 0.1047398 -1.008167
## [,1] [,2] [,3] [,4]
## [1,] -1.189547 -1.787534 0.3844662 -1.196242
## [,1] [,2] [,3] [,4]
## [1,] -1.256021 -1.810193 0.4330996 -1.162168
## [,1] [,2] [,3] [,4]
## [1,] -1.263854 -1.808322 0.4951543 -1.155337
## [,1] [,2] [,3] [,4]
## [1,] -1.283777 -1.809604 0.5402463 -1.142785
## [,1] [,2] [,3] [,4]
## [1,] -1.297454 -1.807896 0.5791073 -1.133267
## [,1] [,2] [,3] [,4]
## [1,] -1.310007 -1.805556 0.6109193 -1.124462
## [,1] [,2] [,3] [,4]
## [1,] -1.320515 -1.802542 0.6374605 -1.116851
## [,1] [,2] [,3] [,4]
## [1,] -1.329551 -1.799257 0.6595355 -1.110187
plane
## [,1] [,2] [,3] [,4]
## [1,] -1.329551 -1.799257 0.6595355 -1.110187
'Rhadoop' 카테고리의 다른 글
RHADOOP - TF-IDF (17/11/28 Lecture Note) (0) | 2017.11.28 |
---|---|
RHADOOP - SENTIMENTAL ANALYSIS (17/11/23 Lecture Note) (0) | 2017.11.28 |
RHADOOP - WORD COUNT & WORD CLOUD -2 (17/11/14 Lecture Note) (0) | 2017.11.28 |
RHADOOP - WORD COUNT & WORD CLOUD -1 (17/11/09 Lecture Note) (0) | 2017.11.28 |
RHADOOP - K-means Clustering (17/11/02 Lecture Note) (0) | 2017.11.28 |