We first load all the required libraries.
library(devtools)
library(gRim)
library(igraph)
install_github("pzwiernik/golazo")
library(golazo)
Load the Body Fat dataset.
data(BodyFat)
head(BodyFat)
Modify the data as it contains errors and make all measurements.
BodyFat <- BodyFat[-c(31,42,48,76,86,96,159,169,175,182,206),]
Transforming measurements to a similar scale and removing variables Density and Age.
#BodyFat$Age <- sqrt(BodyFat$Age)
BodyFat$Weight <- sqrt(BodyFat$Weight)
gRbodyfat <- BodyFat[,-c(1,3)]
n <- nrow(gRbodyfat)
Calculate covariance matrix and correlation matrix.
S.body <- cov.wt(gRbodyfat, method ="ML")$cov
R.body <- cov2cor(S.body)
We first compute the positive glasso estimate for a number of possible penalty parameters. For each \(\rho\) we compute the EBIC score.
d <- nrow(R.body)
L <- matrix(0,d,d)
U <- matrix(1,d,d)
diag(U) <- 0
ebic <- EBICgolazo(R.body,n=n,L=L,U=U,rhomin=0.001,rhomax=1,nrhos=20,gamma=0.5)
plot(ebic$rhos,ebic$ebic,type="l",xlab="rho",ylab="EBIC",main="EBIC for gamma=0.5")
rho <- ebic$opt.rho
Here \(\rho=0.11\) gives the lowest value of EBIC. The algorithm converges in a couple of steps.
res <- positive.golazo(R.body,rho)
## ** The function maximizes the log-likelihood function penalized with the general GOLAZO penalty.
## The algorithm is initiated at the input matrix.
##
## The algorithm will stop when the dual gap is below: 1e-07 .
##
## Iteration | Dual Gap
## 1 | 0.3229601
## 2 | 0.02903326
## 3 | 0.004478887
## 4 | 0.0007829846
## 5 | 0.0001191888
## 6 | 1.759889e-05
## 7 | 2.592005e-06
## 8 | 3.431388e-07
## 9 | 9.724048e-09
Khat <- res$K
print(round(Khat,3))
## BodyFat Weight Height Neck Chest Abdomen Hip Thigh Knee Ankle
## BodyFat 3.186 0.000 0.400 0.000 0.000 -2.926 0.000 0.000 0.000 0.289
## Weight 0.000 16.492 -1.516 -1.317 -2.953 -2.273 -5.595 -0.206 -2.068 -0.711
## Height 0.400 -1.516 1.674 0.000 0.508 0.396 0.000 0.000 -0.292 -0.177
## Neck 0.000 -1.317 0.000 3.803 -0.733 0.000 0.000 0.000 0.000 0.000
## Chest 0.000 -2.953 0.508 -0.733 7.694 -4.007 0.000 0.000 0.000 0.000
## Abdomen -2.926 -2.273 0.396 0.000 -4.007 10.421 -1.728 0.000 0.000 0.000
## Hip 0.000 -5.595 0.000 0.000 0.000 -1.728 10.707 -3.298 -0.043 0.000
## Thigh 0.000 -0.206 0.000 0.000 0.000 0.000 -3.298 5.496 -0.800 -0.238
## Knee 0.000 -2.068 -0.292 0.000 0.000 0.000 -0.043 -0.800 4.269 -0.923
## Ankle 0.289 -0.711 -0.177 0.000 0.000 0.000 0.000 -0.238 -0.923 2.866
## Biceps 0.000 -0.981 0.000 -0.238 0.000 0.000 0.000 -0.761 0.000 0.000
## Forearm 0.006 -0.413 0.000 -0.432 0.000 0.000 0.000 0.000 0.000 0.000
## Wrist 0.069 -0.301 0.000 -0.898 0.000 0.000 0.000 0.000 0.000 -0.745
## Biceps Forearm Wrist
## BodyFat 0.000 0.006 0.069
## Weight -0.981 -0.413 -0.301
## Height 0.000 0.000 0.000
## Neck -0.238 -0.432 -0.898
## Chest 0.000 0.000 0.000
## Abdomen 0.000 0.000 0.000
## Hip 0.000 0.000 0.000
## Thigh -0.761 0.000 0.000
## Knee 0.000 0.000 0.000
## Ankle 0.000 0.000 -0.745
## Biceps 3.755 -1.541 0.000
## Forearm -1.541 3.339 -0.710
## Wrist 0.000 -0.710 2.860
The graph of the underlying partial correlations looks as follows.
G <- qgraph::qgraph(-cov2cor(res$K),layout="spring",palette="colorblind",theme="colorblind")
layout <- G$layout
We run the graphical lasso for the same data. The optimal penalty is chosen again using EBIC. The penalty is chosen the minimal possible.
L <- matrix(-1,d,d)
U <- matrix(1,d,d)
diag(U) <- diag(L) <- 0
gebic <- EBICgolazo(R.body,n=n,L=L,U=U,rhomin=0.001,rhomax=1,nrhos=20,gamma=0.5)
plot(gebic$rhos,gebic$ebic,type="l",xlab="rho",ylab="EBIC",main="EBIC for gamma=0.5")
grho <- gebic$opt.rho
And again rerun GOLAZO for this optimal rho.
L <- matrix(-grho,d,d)
U <- matrix(grho,d,d)
diag(U) <- diag(L) <- 0
res2 <- golazo(R.body,L=L,U=U)
## ** The function maximizes the log-likelihood function penalized with the general GOLAZO penalty.
## The algorithm is initiated at the input matrix.
##
## The algorithm will stop when the dual gap is below: 1e-07 .
##
## Iteration | Dual Gap
## 1 | 0.0002576488
## 2 | 5.224596e-07
## 3 | -4.163336e-15
Kglasso <- res2$K
print(round(Kglasso,3))
## BodyFat Weight Height Neck Chest Abdomen Hip Thigh Knee
## BodyFat 3.689 0.000 0.292 0.605 0.339 -4.584 1.032 -0.215 -0.135
## Weight 0.000 47.011 -8.881 -3.602 -11.105 -8.379 -11.321 -4.187 -0.998
## Height 0.292 -8.881 3.331 0.589 2.448 2.052 1.318 1.285 -0.649
## Neck 0.605 -3.602 0.589 4.236 0.009 -0.769 1.089 -0.342 0.252
## Chest 0.339 -11.105 2.448 0.009 10.358 -3.865 1.960 2.266 0.196
## Abdomen -4.584 -8.379 2.052 -0.769 -3.865 16.609 -3.091 0.963 -0.723
## Hip 1.032 -11.321 1.318 1.089 1.960 -3.091 12.988 -3.587 -0.246
## Thigh -0.215 -4.187 1.285 -0.342 2.266 0.963 -3.587 7.100 -1.077
## Knee -0.135 -0.998 -0.649 0.252 0.196 -0.723 -0.246 -1.077 4.439
## Ankle 0.218 -3.218 0.349 0.621 0.262 1.257 0.296 -0.683 -0.894
## Biceps -0.088 -1.969 0.374 -0.167 -0.162 0.705 0.056 -1.002 0.043
## Forearm -0.427 -2.838 0.269 -0.440 -0.558 2.197 1.336 -0.528 -0.079
## Wrist 0.532 -0.323 0.042 -1.127 0.382 -1.194 0.035 1.284 -0.364
## Ankle Biceps Forearm Wrist
## BodyFat 0.218 -0.088 -0.427 0.532
## Weight -3.218 -1.969 -2.838 -0.323
## Height 0.349 0.374 0.269 0.042
## Neck 0.621 -0.167 -0.440 -1.127
## Chest 0.262 -0.162 -0.558 0.382
## Abdomen 1.257 0.705 2.197 -1.194
## Hip 0.296 0.056 1.336 0.035
## Thigh -0.683 -1.002 -0.528 1.284
## Knee -0.894 0.043 -0.079 -0.364
## Ankle 3.346 0.492 0.121 -1.168
## Biceps 0.492 3.966 -1.541 -0.129
## Forearm 0.121 -1.541 3.996 -0.962
## Wrist -1.168 -0.129 -0.962 3.406
And the partial correlation graph is
qgraph::qgraph(-cov2cor(Kglasso),layout=layout,palette="colorblind",theme="colorblind")
The glasso estimate is dense so the likelihood is higher for that point.
c(log(det(Khat))-sum(diag(R.body%*%Khat)),log(det(Kglasso))-sum(diag(R.body%*%Kglasso)))
## [1] 3.05566 4.37581
However, penalizing for size of the graph, the EBIC criterion prefers the positive glasso estimate.
# number of edges
cat("The positive glasso EBIC: ",ebic$ebic[which(ebic$rhos==rho)],"\n")
## The positive glasso EBIC: -364.8998
cat("The glasso EBIC: ",gebic$ebic[which(gebic$rhos==grho)],"\n")
## The glasso EBIC: -226.6239
Now we take \(\hat K\) computed in the previous step and maximize the dual likelihood subject to edge positivity constraints on the underlying graph with edges \(\hat E\).
resDLE <- la.golazo(Khat)
## ** The function maximizes the log-likelihood function penalized with the general GOLAZO penalty.
## The algorithm is initiated at the input matrix.
##
## The algorithm will stop when the dual gap is below: 1e-07 .
##
## Iteration | Dual Gap
## 1 | 4.80742e-09
The positive graphical lasso procedure already gave an edge positive distribution so the second step does not do anything.