티스토리 뷰

Rhadoop

RHADOOP - Logistic Regression

딥스탯 2017. 11. 30. 17:39
Logistic Regression in RHadoop
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(rhdfs)
## Loading required package: rJava
## 
## HADOOP_CMD=/home/stat/hadoop/hadoop-2.7.4/bin/hadoop
## 
## Be sure to run hdfs.init()
hdfs.init()
library(rmr2)
## Warning: S3 methods 'gorder.default', 'gorder.factor', 'gorder.data.frame',
## 'gorder.matrix', 'gorder.raw' were declared in NAMESPACE but not found
## Please review your hadoop settings. See help(hadoop.settings)

Logistic Regression in R

#install.packages("catdata")
library(catdata)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
# Load the foodstamp data set (available once catdata is attached) and print it.
data(foodstamp)
foodstamp
## (first 10 rows shown)
##    y TEN SUP  INC
## 1  0   1   0  271
## 2  0   1   0  287
## 3  0   1   1  714
## 4  0   1   0  521
## 5  0   0   0    0
## 6  0   1   0  518
## 7  0   1   0  458
## 8  0   1   0 1266
## 9  0   0   0  350
## 10 0   1   0  168
# Per-column summary statistics for y, TEN, SUP and INC.
summary(foodstamp)
##        y             TEN            SUP              INC        
##  Min.   :0.00   Min.   :0.00   Min.   :0.0000   Min.   :   0.0  
##  1st Qu.:0.00   1st Qu.:0.00   1st Qu.:0.0000   1st Qu.: 264.5  
##  Median :0.00   Median :1.00   Median :0.0000   Median : 461.0  
##  Mean   :0.16   Mean   :0.66   Mean   :0.2733   Mean   : 701.4  
##  3rd Qu.:0.00   3rd Qu.:1.00   3rd Qu.:1.0000   3rd Qu.: 781.5  
##  Max.   :1.00   Max.   :1.00   Max.   :1.0000   Max.   :4533.0
# Work on a copy of the data and standardise its fourth column (INC), so the
# original foodstamp object stays untouched.
temp_foodstamp <- foodstamp
temp_foodstamp[, 4] <- scale(temp_foodstamp[, 4])

# Reference fit with base R's glm(); the MapReduce gradient-descent estimates
# further down are compared against these coefficients.
summary(
  glm(
    y ~ .,
    data = temp_foodstamp,
    family = binomial(link = "logit")
  )
)
## 
## Call:
## glm(formula = y ~ ., family = binomial(link = "logit"), data = temp_foodstamp)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2376  -0.5564  -0.3464  -0.1545   2.7955  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.3844     0.4676  -2.960  0.00307 ** 
## TEN          -1.7603     0.5292  -3.326  0.00088 ***
## SUP           0.7752     0.5066   1.530  0.12591    
## INC          -1.0715     0.6745  -1.588  0.11218    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 131.90  on 149  degrees of freedom
## Residual deviance: 104.33  on 146  degrees of freedom
## AIC: 112.33
## 
## Number of Fisher Scoring iterations: 6

Logistic Regression in RHadoop

Optimizer: gradient descent (the original `glm` optimizer is Fisher scoring)

Defining g (the logistic sigmoid function)

# Logistic (sigmoid) function: maps any real number into the interval (0, 1).
# Works element-wise, so x may be a scalar, vector or matrix; the shape of x
# is carried through to the result.
g <- function(x) {
  decay <- exp(-x)
  1 / (1 + decay)
}

MAPPER_1

# First-stage mapper: emits every row of the incoming chunk under its own key
# (the row number within that chunk), so the first reducer sees one
# observation at a time.
#
# k: incoming key (unused).
# v: matrix chunk of the input; one observation per row.
#
# seq_len() is used instead of 1:n so that an empty chunk produces zero keys
# rather than the two bogus keys c(1, 0).
#
# NOTE(review): keys are row numbers *within the chunk*. If the input is ever
# split across several map calls, rows from different chunks will share a key
# and arrive together at the reducer -- confirm the reducer can handle that.
logistic_map1 <- function(k, v) {
  keyval(seq_len(nrow(v)), v)
}

REDUCE_1

# First-stage reducer: computes each observation's contribution to the
# gradient of the logistic log-loss, (g(x' beta) - y) * x, using the current
# coefficients held in the global `plane`.
#
# k: key of the group (passed through unchanged).
# v: the observation(s) for this key; first column is the response y, the
#    remaining columns are the predictors (including the intercept column).
#    A bare vector is treated as a single row.
#
# Returns keyval(k, M) where M has one row per observation in v and one
# column per coefficient.
#
# Differences from the earlier version:
#   * The linear predictor is reduced to a plain vector before it is
#     multiplied with X. Previously a 1x1 matrix was recycled against a
#     longer vector, which R deprecates with a warning.
#   * v may contain more than one row (e.g. when several rows share a key);
#     each row is handled independently instead of being mis-indexed.
logistic_red1 <- function(k, v) {
  if (is.null(dim(v))) {
    v <- matrix(v, nrow = 1L)
  }
  Y <- as.vector(v[, 1L])
  # unname() keeps the emitted matrix free of dimnames, as before.
  X <- unname(v[, -1L, drop = FALSE])
  theta <- g(drop(X %*% as.vector(plane)))
  # A length-nrow(X) vector times an nrow(X) x p matrix scales row i by
  # element i, giving the per-row gradient contributions.
  keyval(k, (theta - Y) * X)
}

MAPPER_2

# Second-stage mapper: re-keys every incoming value with the constant key 1 so
# that all of them end up in a single reduce group.
#
# k: incoming key (ignored).
# v: incoming values, forwarded unchanged.
logistic_map2 <- function(k, v) {
  shared_key <- 1
  keyval(shared_key, v)
}

REDUCE_2

# Second-stage reducer: adds up the values of the group column by column.
#
# k: key of the group (passed through unchanged).
# v: matrix with one row per contribution and one column per coefficient.
#
# Returns keyval(k, s) where s is the vector of column sums of v.
#
# colSums() is the built-in equivalent of apply(v, 2, sum). Note that
# colSums() always returns doubles, also for integer input.
logistic_red2 <- function(k, v) {
  keyval(k, colSums(v))
}

MAPREDUCE

# Upload the model matrix to the DFS: response y first, then a constant
# intercept column, then the remaining columns of temp_foodstamp.
hadoop_foodstamp <- to.dfs(
  as.matrix(
    data.frame(y = temp_foodstamp[, 1], intercept = 1, temp_foodstamp[, -1])
  )
)

# Current coefficient estimates as a 1 x 4 row vector (one entry per column
# after y), started at zero. The first reducer reads this global.
plane <- t(rep(0, 4))

# Gradient-descent settings.
learning_rate <- 0.05
n_iterations <- 10

for (i in seq_len(n_iterations)) {
  # Job 1 produces the per-observation gradient contributions,
  # job 2 sums them into a single gradient.
  result <- mapreduce(
    input = hadoop_foodstamp,
    map = logistic_map1,
    reduce = logistic_red1
  ) %>%
    mapreduce(
      map = logistic_map2,
      reduce = logistic_red2
    )

  # Read the job output back once and reuse it; it was previously fetched
  # from the DFS twice per iteration.
  temp_result <- from.dfs(result)
  gradient <- values(temp_result)

  # Descent step; print() shows the updated coefficients after each pass.
  print(plane <- plane - learning_rate * gradient)
}
##       [,1]   [,2]   [,3]       [,4]
## [1,] -2.55 -2.175 -0.425 -0.5520207
##           [,1]      [,2]      [,3]      [,4]
## [1,] -1.589195 -1.916039 0.1047398 -1.008167
##           [,1]      [,2]      [,3]      [,4]
## [1,] -1.189547 -1.787534 0.3844662 -1.196242
##           [,1]      [,2]      [,3]      [,4]
## [1,] -1.256021 -1.810193 0.4330996 -1.162168
##           [,1]      [,2]      [,3]      [,4]
## [1,] -1.263854 -1.808322 0.4951543 -1.155337
##           [,1]      [,2]      [,3]      [,4]
## [1,] -1.283777 -1.809604 0.5402463 -1.142785
##           [,1]      [,2]      [,3]      [,4]
## [1,] -1.297454 -1.807896 0.5791073 -1.133267
##           [,1]      [,2]      [,3]      [,4]
## [1,] -1.310007 -1.805556 0.6109193 -1.124462
##           [,1]      [,2]      [,3]      [,4]
## [1,] -1.320515 -1.802542 0.6374605 -1.116851
##           [,1]      [,2]      [,3]      [,4]
## [1,] -1.329551 -1.799257 0.6595355 -1.110187
plane
##           [,1]      [,2]      [,3]      [,4]
## [1,] -1.329551 -1.799257 0.6595355 -1.110187
공지사항
최근에 올라온 글
최근에 달린 댓글
Total
Today
Yesterday
링크
TAG
more
«   2025/06   »
1 2 3 4 5 6 7
8 9 10 11 12 13 14
15 16 17 18 19 20 21
22 23 24 25 26 27 28
29 30
글 보관함