We first load all the required libraries.
library(devtools)
library(gRim)
library(igraph)
install_github("pzwiernik/golazo")
library(golazo)
Download FIR upregulated genes.
download.file("https://functionalgenomics.upf.edu/supplements/FIRinELGANs/TableS6.csv",
"FIRupregulatedgenes.csv")
upgenes <- read.csv("FIRupregulatedgenes.csv", sep="\t", stringsAsFactors=FALSE)
dim(upgenes)
## [1] 592 7
Download genes with a role in the innate immune response.
download.file("https://www.innatedb.com/download/innatedb_curated_genes.xls",
"innatedb_curated_genes.xls")
library(xlsx)
innatedb <- read.xlsx(file="innatedb_curated_genes.xls", sheetIndex=1,
stringsAsFactors=FALSE)
dim(innatedb)
## [1] 3713 5
Subset innatedb for human genes with a role in the innate immune response human is taxon 9606 https://www.ncbi.nlm.nih.gov/taxonomy/9606
iigenes.human <- unique(innatedb[innatedb$Species == 9606, "Gene.Symbol"])
length(iigenes.human)
## [1] 1052
Read the gene expression dataset.
temp <- tempfile()
download.file("https://functionalgenomics.upf.edu/supplements/FIRinELGANs/CostaCastelo2016data.zip",temp)
y <- read.csv(unz(temp, "CostaCastelo2016normalizedGeneExp.csv"), row.names=1)
unlink(temp)
dim(y)
## [1] 12093 43
stopifnot(all(upgenes$Symbol %in% rownames(y))) ## QC
Further subset the innate immune genes to those present in our data
iigenes.human <- iigenes.human[iigenes.human %in% rownames(y)]
length(iigenes.human)
## [1] 704
fetch the intersecting set of 136 upregulated genes with a role in the innate immune response
iiupgenes <- intersect(upgenes$Symbol, iigenes.human)
length(iiupgenes)
## [1] 136
Now we are finally ready to begin our analysis. Compute the sample correlation matrix
S <- cov.wt(t(y[iiupgenes,]), method ="ML")$cov
n <- length(iiupgenes)
R <- cov2cor(S)
d <- nrow(R)
Select the penalty parameter using BIC/EBIC. For illustration, we check only 5 potential rhos.
L <- matrix(0,d,d)
U <- matrix(1,d,d)
diag(U) <- 0
ebic <- EBICgolazo(R,n=n,L=L,U=U,rhomin = 0.01,rhomax=1,nrhos=10)
plot(ebic$rhos,ebic$ebic,type="l")
rho <- ebic$opt.rho
Rerun the analysis for the optimal rho.
res <- positive.golazo(R,rho)
## ** The function maximizes the log-likelihood function penalized with the general GOLAZO penalty.
## The input matrix is not positive definite. Computing the starting point.. DONE
##
## The algorithm will stop when the dual gap is below: 1e-07 .
##
## Iteration | Dual Gap
## 1 | 78.9456
## 2 | 16.39732
## 3 | 7.256931
## 4 | 4.26599
## 5 | 2.858529
## 6 | 2.004468
## 7 | 1.432673
## 8 | 1.040407
## 9 | 0.7629886
## 10 | 0.5644676
## 11 | 0.420677
## 12 | 0.3164019
## 13 | 0.2391608
## 14 | 0.1816402
## 15 | 0.1382717
## 16 | 0.1053131
## 17 | 0.08023786
## 18 | 0.06112197
## 19 | 0.04655535
## 20 | 0.03546659
## 21 | 0.02702336
## 22 | 0.0205944
## 23 | 0.01569752
## 24 | 0.01196517
## 25 | 0.009118848
## 26 | 0.006951403
## 27 | 0.005299603
## 28 | 0.004037011
## 29 | 0.003073707
## 30 | 0.00233886
## 31 | 0.001775927
## 32 | 0.001346277
## 33 | 0.001019749
## 34 | 0.0007664926
## 35 | 0.0005742699
## 36 | 0.0004284788
## 37 | 0.0003170495
## 38 | 0.0002319242
## 39 | 0.0001690118
## 40 | 0.0001237473
## 41 | 8.608429e-05
## 42 | 5.862505e-05
## 43 | 3.894448e-05
## 44 | 2.517995e-05
## 45 | 1.420998e-05
## 46 | 6.744628e-06
## 47 | 2.494403e-06
## 48 | -1.386148e-07
The estimated graph is sparse. We can play around with it using the igraph
package.
G <- qgraph::qgraph(-cov2cor(res$K),layout="spring",palette="colorblind",theme="colorblind",threshold=1e-5)
## Registered S3 methods overwritten by 'huge':
## method from
## plot.sim BDgraph
## print.sim BDgraph
layout <- G$layout
G <- as.igraph(G)
edge_density(G, loops=F)
## [1] 0.06851852
diameter(G, directed=F, weights=NA)
## [1] 5
Or do some clustering.
GG <- G
E(GG)$weights <- 1
ceb <- cluster_edge_betweenness(G,directed=FALSE,weights=NULL)
plot(ceb, G,vertex.label.cex=.2,layout=layout)
We will now compare our results with graphical lasso.
L <- matrix(-1,d,d)
U <- matrix(1,d,d)
diag(U) <- diag(L) <- 0
gebic <- EBICgolazo(R,n=n,L=L,U=U,rhomin = 0.01,rhomax=1,nrhos=10)
grho <- gebic$opt.rho
Recompute for the optimal rho.
L <- matrix(-rho,d,d)
U <- matrix(rho,d,d)
diag(U) <- diag(L) <- 0
resgl <- golazo(R,L=L,U=U)
## ** The function maximizes the log-likelihood function penalized with the general GOLAZO penalty.
## The input matrix is not positive definite. Computing the starting point.. DONE
##
## The algorithm will stop when the dual gap is below: 1e-07 .
##
## Iteration | Dual Gap
## 1 | 22.68689
## 2 | 1.728408
## 3 | 0.3878756
## 4 | 0.1137225
## 5 | 0.03512693
## 6 | 0.01114183
## 7 | 0.00357517
## 8 | 0.001110542
## 9 | 0.0003062499
## 10 | 6.271978e-05
## 11 | 9.176186e-06
## 12 | 2.132126e-06
## 13 | 7.218843e-07
## 14 | 2.459596e-07
## 15 | 8.422813e-08
Check the corresponding EBIC score.
Kgl <- resgl$K
Rgl <- abs(cov2cor(resgl$K))
nedggl = length(which(Rgl[upper.tri(abs(Rgl),diag=FALSE)]> 1e-6))
cat("Edge density: ",nedggl/(d*(d-1)/2),"\n")
## Edge density: 0.2020697
ebic$ebic[which(ebic$rhos==rho)]
## [1] -8758.366
gebic$ebic[which(gebic$rhos==grho)]
## [1] 4610.703
To complete the analysis we run the second step of our procedure.
resDLE <- la.golazo(res$K)
## ** The function maximizes the log-likelihood function penalized with the general GOLAZO penalty.
## The algorithm is initiated at the input matrix.
##
## The algorithm will stop when the dual gap is below: 1e-07 .
##
## Iteration | Dual Gap
## 1 | -3.488211e-08