Â
library(ggm)
# Generate synthetic data for illustration
set.seed(123)
n <- 100 # Sample size
p <- 5 # Number of variables
# Simulate data from a multivariate normal distribution
Sigma <- solve(matrix(c(1, 0.3, 0.4, 0.5, 0,
0.3, 1, 0.2, 0, 0,
0.4, 0.2, 1, 0, 0.4,
0.5, 0, 0, 1, 0.5,
0, 0, 0.4, 0.5, 1), nrow = 5))
data <- MASS::mvrnorm(n = n, mu = rep(0, p), Sigma = Sigma)
# Define two competing graphs (G0: subgraph, G: full graph)
G0 <- matrix(c(0, 1, 1, 1, 0,
1, 0, 1, 0, 0,
1, 1, 0, 0, 1,
1, 0, 0, 0, 1,
0, 0, 1, 1, 0), nrow = 5, byrow = TRUE)
G <- matrix(1, nrow = 5, ncol = 5) # Full graph (fully connected)
diag(G) <- 0
# Assign variable names
var_names <- c("X1", "X2", "X3", "X4", "X5")
colnames(data) <- colnames(G0) <- colnames(G) <- rownames(G) <- rownames(G0) <- var_names
S <- cov(data)
# Fit models under G0 and G
Shat0 <- fitConGraph(G0,S, nrow(data))$Shat
Shat <- fitConGraph(G,S, nrow(data))$Shat
Khat0 <- solve(Shat0)
Khat <- solve(Shat)
# Likelihood Ratio Test
logLik_G0 <- (n/2)*(log(det(Khat0))-sum(Khat0*S))
logLik_G <- (n/2)*(log(det(Khat))-sum(Khat*S))
LRT_stat <- 2 * (logLik_G - logLik_G0)
df <- (sum(G) - sum(G0))/2 # Difference in number of edges
p_value <- pchisq(LRT_stat, df = df, lower.tail = FALSE)
cat("Likelihood Ratio Test Statistic:", LRT_stat, "\n")
## Likelihood Ratio Test Statistic: 2.92624
cat("Degrees of Freedom:", df, "\n")
## Degrees of Freedom: 4
cat("p-value:", p_value, "\n")
## p-value: 0.5702438
Â
Â
library(MASS)
# Set dimensions and define the true concentration matrix (Theta_true) according to G0:
p <- 5
Theta_true <- diag(p)
Theta_true[1,2] <- Theta_true[2,1] <- 0.3
Theta_true[1,3] <- Theta_true[3,1] <- 0.3
Theta_true[1,4] <- Theta_true[4,1] <- 0.3
Theta_true[2,3] <- Theta_true[3,2] <- 0.3
Theta_true[3,5] <- Theta_true[5,3] <- 0.3
Theta_true[4,5] <- Theta_true[5,4] <- 0.3
# Compute corresponding covariance matrix
Sigma_true <- solve(Theta_true)
# free parameters in the constrained model correspond to:
# diagonals: Theta[1,1],...,Theta[5,5]
# off-diagonals in G0: (1,2), (1,3), (1,4), (2,3), (3,5), (4,5)
# Total number of free parameters: 5 + 6 = 11.
# Function to construct a 5x5 precision matrix from the 11 free parameters:
vector_to_Theta <- function(x) {
Theta <- matrix(0, nrow = p, ncol = p)
diag(Theta) <- x[1:p]
Theta[1,2] <- Theta[2,1] <- x[p+1] # Edge (1,2)
Theta[1,3] <- Theta[3,1] <- x[p+2] # Edge (1,3)
Theta[1,4] <- Theta[4,1] <- x[p+3] # Edge (1,4)
Theta[2,3] <- Theta[3,2] <- x[p+4] # Edge (2,3)
Theta[3,5] <- Theta[5,3] <- x[p+5] # Edge (3,5)
Theta[4,5] <- Theta[5,4] <- x[p+6] # Edge (4,5)
return(Theta)
}
# Objective function to minimize (negative log–likelihood, up to constant)
# For n observations and sample covariance S, the log–likelihood is:
# L(Theta) = (n/2)[log(det(Theta)) - trace(S %*% Theta)]
# We maximize L(Theta), so we minimize its negative.
obj_fun <- function(x, S) {
Theta <- vector_to_Theta(x)
# Ensure Theta is positive definite:
eigvals <- eigen(Theta, symmetric = TRUE, only.values = TRUE)$values
if (min(eigvals) <= 0) {
return(1e10) # Large penalty if not positive definite
}
logdet <- determinant(Theta, logarithm = TRUE)$modulus
val <- logdet - sum(S * Theta) # sum(S * Theta) computes trace(S %*% Theta)
return(-val)
}
# Simulation parameters:
n <- 500 # Sample size per simulation
B <- 5000 # Number of simulation replications
lr_stats <- numeric(B)
set.seed(437)
for (b in 1:B) {
# Generate data from N(0, Sigma_true)
X <- mvrnorm(n, mu = rep(0, p), Sigma = Sigma_true)
S <- cov(X)
# Full model log-likelihood:
# For the full (unconstrained) model, the MLE is Theta_full = S^(-1),
# and the log-likelihood becomes:
# ll_full = (n/2)[log(det(S^(-1))) - trace(S %*% S^(-1))]
# = (n/2)[-log(det(S)) - p]
ll_full <- (n/2)*(-log(det(S)) - p)
# Set up initial values for optimization:
# Use the full model estimate Theta_full as a guide.
Theta_full <- solve(S)
x0 <- numeric(11)
# Diagonals:
x0[1:p] <- diag(Theta_full)
# Off-diagonals corresponding to edges in G0:
x0[p+1] <- Theta_full[1,2]
x0[p+2] <- Theta_full[1,3]
x0[p+3] <- Theta_full[1,4]
x0[p+4] <- Theta_full[2,3]
x0[p+5] <- Theta_full[3,5]
x0[p+6] <- Theta_full[4,5]
# Optimize the objective function to obtain the constrained MLE
opt <- optim(x0, obj_fun, S = S, method = "BFGS",
control = list(maxit = 10000))
if (opt$convergence != 0) {
warning("Optimization did not converge at simulation ", b)
}
# The maximized value
max_val <- -opt$value
# Constrained log-likelihood:
ll_restr <- (n/2)*max_val
# Likelihood ratio statistic:
# LR = -2 * (ll_restr - ll_full)
lr_stats[b] <- -2*(ll_restr - ll_full)
}
# Plot the histogram of the LR statistics with the χ²₄ density overlay:
hist(lr_stats, breaks = 30, probability = TRUE,
main = "LR vs. χ² density df=4",
xlab = "Likelihood Ratio Statistic")
curve(dchisq(x, df = 4), add = TRUE, col = "red", lwd = 2)
# Generate a QQ-plot to compare the empirical distribution to χ² quantiles:
qqplot(qchisq(ppoints(B), df = 4), lr_stats,
main = "QQ-plot LR statistic vs. χ² df=4",
xlab = "Theoretical χ² quantiles", ylab = "Empirical quantiles")
abline(0, 1, col = "red")