survey creation and validation
Introduction
This code was generated with the help of Claude
Set up synthetic data
# Load necessary libraries
library(survey)
library(lavaan)
library(psych)
library(dplyr)
library(ggplot2)
library(missMethods)
# Set seed for reproducibility
set.seed(123)
# Create simulated survey data
<- 500
n <- data.frame(
survey_data id = 1:n,
q1 = sample(1:5, n, replace = TRUE),
q2 = sample(1:5, n, replace = TRUE),
q3 = sample(1:5, n, replace = TRUE),
q4 = sample(1:5, n, replace = TRUE),
q5 = sample(1:5, n, replace = TRUE),
q6 = sample(1:5, n, replace = TRUE)
)
<- impute_median(survey_data)
survey_data1
# Round to whole numbers and ensure they're within 1-5 range
2:7] <- round(pmin(pmax(survey_data1[, 2:7], 1), 5)) survey_data1[,
Descriptives
# Basic descriptive statistics
<- survey_data1 %>%
summary_stats select(-id) %>%
summarise_all(list(mean = mean, sd = sd, median = median))
print(summary_stats)
q1_mean q2_mean q3_mean q4_mean q5_mean q6_mean q1_sd q2_sd q3_sd
1 2.956 3.014 2.874 3.1 3.064 2.868 1.429038 1.412017 1.419208
q4_sd q5_sd q6_sd q1_median q2_median q3_median q4_median q5_median
1 1.413505 1.357789 1.400893 3 3 3 3 3
q6_median
1 3
# Correlation matrix
<- cor(survey_data1[, 2:7])
cor_matrix print(cor_matrix)
q1 q2 q3 q4 q5 q6
q1 1.00000000 -0.021543464 -0.043251986 0.01111160 0.103703120 -0.036942333
q2 -0.02154346 1.000000000 -0.003118096 0.02841507 0.015210726 -0.008181839
q3 -0.04325199 -0.003118096 1.000000000 0.05224656 -0.012446366 0.002705396
q4 0.01111160 0.028415068 0.052246558 1.00000000 -0.016915500 -0.010525205
q5 0.10370312 0.015210726 -0.012446366 -0.01691550 1.000000000 -0.007138971
q6 -0.03694233 -0.008181839 0.002705396 -0.01052520 -0.007138971 1.000000000
Reliability analysis Cronbachs alpha
<- psych::alpha(survey_data1[, 2:7], check.keys = T)
alpha_result print(alpha_result$total$raw_alpha)
[1] 0.1016285
Exploratory factor analysis
# Determine number of factors
fa.parallel(survey_data1[, 2:7], fa = "fa")
Parallel analysis suggests that the number of factors = 0 and the number of components = NA
# Perform EFA
<- fa(survey_data1[, 2:7], nfactors = 2, rotate = "varimax")
efa_result print(efa_result)
Factor Analysis using method = minres
Call: fa(r = survey_data1[, 2:7], nfactors = 2, rotate = "varimax")
Standardized loadings (pattern matrix) based upon correlation matrix
MR1 MR2 h2 u2 com
q1 0.98 -0.20 0.9950 0.005 1.1
q2 -0.01 0.03 0.0012 0.999 1.3
q3 -0.03 0.06 0.0047 0.995 1.5
q4 0.21 0.97 0.9950 0.005 1.1
q5 0.10 -0.04 0.0111 0.989 1.3
q6 -0.04 0.00 0.0015 0.999 1.0
MR1 MR2
SS loadings 1.01 1.00
Proportion Var 0.17 0.17
Cumulative Var 0.17 0.33
Proportion Explained 0.50 0.50
Cumulative Proportion 0.50 1.00
Mean item complexity = 1.2
Test of the hypothesis that 2 factors are sufficient.
df null model = 15 with the objective function = 0.02 with Chi Square = 9.52
df of the model are 4 and the objective function was 0
The root mean square of the residuals (RMSR) is 0.01
The df corrected root mean square of the residuals is 0.01
The harmonic n.obs is 500 with the empirical chi square 0.49 with prob < 0.97
The total n.obs was 500 with Likelihood Chi Square = 0.25 with prob < 0.99
Tucker Lewis Index of factoring reliability = -1.553
RMSEA index = 0 and the 90 % confidence intervals are 0 0
BIC = -24.61
Fit based upon off diagonal values = 0.97
Measures of factor score adequacy
MR1 MR2
Correlation of (regression) scores with factors 1.00 1.00
Multiple R square of scores with factors 1.00 1.00
Minimum correlation of possible factor scores 0.99 0.99
Confirmatory factor analysis
# Define the model
<- '
cfa_model factor1 =~ q1 + q2 + q3
factor2 =~ q4 + q5 + q6
'
Fit the model
library(ltm)
<- grm(survey_data1[, 2:7])
irt_model summary(irt_model)
Call:
grm(data = survey_data1[, 2:7])
Model Summary:
log.Lik AIC BIC
-4810.717 9681.434 9807.872
Coefficients:
$q1
value
Extrmt1 -0.883
Extrmt2 -0.227
Extrmt3 0.320
Extrmt4 0.930
Dscrmn 3.737
$q2
value
Extrmt1 25.554
Extrmt2 7.573
Extrmt3 -7.290
Extrmt4 -24.441
Dscrmn -0.056
$q3
value
Extrmt1 12.353
Extrmt2 3.359
Extrmt3 -6.256
Extrmt4 -16.193
Dscrmn -0.095
$q4
value
Extrmt1 -65.687
Extrmt2 -24.995
Extrmt3 13.505
Extrmt4 55.298
Dscrmn 0.023
$q5
value
Extrmt1 -7.941
Extrmt2 -2.195
Extrmt3 1.753
Extrmt4 6.721
Dscrmn 0.214
$q6
value
Extrmt1 15.955
Extrmt2 2.275
Extrmt3 -6.823
Extrmt4 -19.249
Dscrmn -0.082
Integration:
method: Gauss-Hermite
quadrature points: 21
Optimization:
Convergence: 0
max(|grad|): 0.0074
quasi-Newton: BFGS
# Plot Item Characteristic Curves
plot(irt_model, type = "ICC")
Survey design and construct weighting
# Create a survey design object (assuming equal weights for simplicity)
<- svydesign(ids = ~1, weights = ~1, data = survey_data1)
svy_design
# Calculate weighted means
<- svymean(~q1 + q2 + q3 + q4 + q5 + q6, design = svy_design)
svy_means print(svy_means)
mean SE
q1 2.956 0.0639
q2 3.014 0.0631
q3 2.874 0.0635
q4 3.100 0.0632
q5 3.064 0.0607
q6 2.868 0.0626
Construct validity
# Convergent validity (using AVE - Average Variance Extracted)
<- '
lavaan_model factor1 =~ q1 + q2 + q3
factor2 =~ q4 + q5 + q6
'
<- sem(lavaan_model, data = survey_data1)
fit summary(fit, standardized = TRUE)
lavaan 0.6-19 ended normally after 134 iterations
Estimator ML
Optimization method NLMINB
Number of model parameters 13
Number of observations 500
Model Test User Model:
Test statistic 2.198
Degrees of freedom 8
P-value (Chi-square) 0.974
Parameter Estimates:
Standard errors Standard
Information Expected
Information saturated (h1) model Structured
Latent Variables:
Estimate Std.Err z-value P(>|z|) Std.lv Std.all
factor1 =~
q1 1.000 NA NA
q2 0.053 0.232 0.228 0.820 NA NA
q3 0.078 0.333 0.235 0.814 NA NA
factor2 =~
q4 1.000 NA NA
q5 4.722 10.214 0.462 0.644 NA NA
q6 -1.702 4.167 -0.408 0.683 NA NA
Covariances:
Estimate Std.Err z-value P(>|z|) Std.lv Std.all
factor1 ~~
factor2 0.042 0.089 0.472 0.637 4.188 4.188
Variances:
Estimate Std.Err z-value P(>|z|) Std.lv Std.all
.q1 3.063 4.532 0.676 0.499 3.063 1.503
.q2 1.993 0.127 15.692 0.000 1.993 1.001
.q3 2.016 0.131 15.409 0.000 2.016 1.003
.q4 1.994 0.126 15.771 0.000 1.994 1.000
.q5 1.842 0.232 7.935 0.000 1.842 1.001
.q6 1.959 0.127 15.472 0.000 1.959 1.000
factor1 -1.025 4.525 -0.227 0.821 NA NA
factor2 -0.000 0.009 -0.011 0.991 NA NA
# Calculate AVE
<- standardizedSolution(fit)
std_loadings <- std_loadings %>%
ave filter(op == "=~") %>%
group_by(lhs) %>%
summarise(ave = mean(est.std^2))
print(ave)
# A tibble: 2 × 2
lhs ave
<chr> <dbl>
1 factor1 NA
2 factor2 NA
Comments
Post a Comment