usarrests-NIW-vignette

# Import the TIP library 
library(tip)

# Import the US Arrests dataset
data(USArrests)

# Convert the data to a matrix 
X <- data.matrix(USArrests)

# Compute the distance matrix
distance_matrix <- data.matrix(dist(X))

# Compute the temperature parameter estiamte
temperature <- 1/median(distance_matrix[upper.tri(distance_matrix)])

# For each subject, compute the point estimate for the number of similar 
# subjects using  univariate multiple change point detection (i.e.)
init_num_neighbors = get_cpt_neighbors(.distance_matrix = distance_matrix)
#> Warning in BINSEG(sumstat, pen = pen.value, cost_func = costfunc, minseglen =
#> minseglen, : The number of changepoints identified is Q, it is advised to
#> increase Q to make sure changepoints have not been missed.

# Set the number of burn-in iterations in the Gibbs samlper
# RECOMENDATION: *** burn >= 1000 ***
burn <- 10

# Set the number of sampling iterations in the Gibbs sampler
# RECOMENDATION: *** samples >= 1000 ***
samples <- 10

# Extract the state names 
names_subjects <- rownames(USArrests)

# Run TIP clustering using only the prior
# --> That is, the likelihood function is constant
tip1 <- tip(.data = data.matrix(X),
            .burn = burn,
            .samples = samples,
            .similarity_matrix = exp(-1.0*temperature*distance_matrix),
            .init_num_neighbors = init_num_neighbors,
            .likelihood_model = "NIW",
            .subject_names = names_subjects,
            .num_cores = 1)
#> Bayesian Clustering: Table Invitation Prior Gibbs Sampler
#> burn-in: 10
#> samples: 10
#> Likelihood Model: NIW
#>   |                                                                              |                                                                      |   0%  |                                                                              |====                                                                  |   6%  |                                                                              |========                                                              |  11%  |                                                                              |============                                                          |  17%  |                                                                              |================                                                      |  22%  |                                                                              |===================                                                   |  28%  |                                                                              |=======================                                               |  33%  |                                                                              |===========================                                           |  39%  |                                                                              |===============================                                       |  44%  |                                                                              |===================================                                   |  50%  |                                                                              |=======================================                               |  56%  |                                                                              |===========================================                           |  61%  |                                                                              |===============================================                       |  67%  |                                                                              |===================================================                   |  72%  |                                                                              |======================================================                |  78%  |                                                                              |==========================================================            |  83%  |                                                                              |==============================================================        |  89%  |                                                                              |==================================================================    |  94%  |                                                                              |======================================================================| 100%

# Produce plots for the Bayesian Clustering Model
tip_plots <- plot(tip1)

# View the posterior distribution of the number of clusters
tip_plots$trace_plot_posterior_number_of_clusters

# View the trace plot with respect to the posterior number of clusters
tip_plots$trace_plot_posterior_number_of_clusters

# Extract posterior cluster assignments using the Posterior Expected Adjusted Rand (PEAR) index
cluster_assignments <- mcclust::maxpear(psm = tip1@posterior_similarity_matrix)$cl

# Create a list where each element stores the cluster assignments
cluster_assignment_list <- list()
for(k in 1:length(unique(cluster_assignments))){
  cluster_assignment_list[[k]] <- names_subjects[which(cluster_assignments == k)]
}
cluster_assignment_list
#> [[1]]
#>  [1] "Alabama"        "Alaska"         "Arizona"        "California"    
#>  [5] "Colorado"       "Delaware"       "Georgia"        "Illinois"      
#>  [9] "Louisiana"      "Maryland"       "Michigan"       "Mississippi"   
#> [13] "Nevada"         "New Jersey"     "New Mexico"     "New York"      
#> [17] "North Carolina" "Rhode Island"   "South Carolina" "Tennessee"     
#> [21] "Texas"         
#> 
#> [[2]]
#>  [1] "Arkansas"      "Connecticut"   "Hawaii"        "Kentucky"     
#>  [5] "Maine"         "Massachusetts" "Oregon"        "West Virginia"
#>  [9] "Wisconsin"     "Wyoming"      
#> 
#> [[3]]
#> [1] "Florida"    "Missouri"   "Washington"
#> 
#> [[4]]
#> [1] "Idaho"        "Indiana"      "Minnesota"    "Ohio"         "Pennsylvania"
#> [6] "Utah"        
#> 
#> [[5]]
#> [1] "Iowa"          "Montana"       "New Hampshire" "North Dakota" 
#> [5] "South Dakota"  "Vermont"      
#> 
#> [[6]]
#> [1] "Kansas"   "Nebraska"
#> 
#> [[7]]
#> [1] "Oklahoma" "Virginia"

# Create the one component graph with minimum entropy
partition_list <- partition_undirected_graph(.graph_matrix = tip1@posterior_similarity_matrix,
                                             .num_components = 1,
                                             .step_size = 0.001)

# View the state names
# names_subjects

# Create a vector of true region labels to see if there is a pattern
true_region <- c("Southeast", "West", "Southwest", "Southeast", "West", "West",
                 "Northeast", "Northeast", "Southeast", "Southeast", "West", "West",
                 "Midwest", "Midwest", "Midwest", "Midwest", "Southeast", "Southeast",
                 "Northeast", "Northeast", "Northeast", "Midwest", "Midwest", "Southeast",
                 "Midwest", "West", "Midwest", "West", "Northeast", "Northeast",
                 "Southwest", "Northeast", "Southeast", "Midwest", "Midwest", "Southwest",
                 "West", "Northeast", "Northeast", "Southeast", "Midwest", "Southeast",
                 "Southwest", "West", "Northeast", "Southeast", "West", "Southeast",
                 "Midwest", "West")

names_subjects
#>  [1] "Alabama"        "Alaska"         "Arizona"        "Arkansas"      
#>  [5] "California"     "Colorado"       "Connecticut"    "Delaware"      
#>  [9] "Florida"        "Georgia"        "Hawaii"         "Idaho"         
#> [13] "Illinois"       "Indiana"        "Iowa"           "Kansas"        
#> [17] "Kentucky"       "Louisiana"      "Maine"          "Maryland"      
#> [21] "Massachusetts"  "Michigan"       "Minnesota"      "Mississippi"   
#> [25] "Missouri"       "Montana"        "Nebraska"       "Nevada"        
#> [29] "New Hampshire"  "New Jersey"     "New Mexico"     "New York"      
#> [33] "North Carolina" "North Dakota"   "Ohio"           "Oklahoma"      
#> [37] "Oregon"         "Pennsylvania"   "Rhode Island"   "South Carolina"
#> [41] "South Dakota"   "Tennessee"      "Texas"          "Utah"          
#> [45] "Vermont"        "Virginia"       "Washington"     "West Virginia" 
#> [49] "Wisconsin"      "Wyoming"

# Associate class labels and colors for the plot
class_palette_colors <- c("Northeast" = "blue",
                          "Southeast" = "red",
                          "Midwest" = "purple",
                          "Southwest" = "orange",
                          "West" = "green")

# Associate class labels and shapes for the plot
class_palette_shapes <- c("Northeast" = 15,
                          "Southeast" = 16,
                          "Midwest" = 17,
                          "Southwest" = 18,
                          "West" = 19)

# Visualize the posterior similarity matrix by constructing a graph plot of 
# the one-cluster graph. The true labels are used here (below they are not).
ggnet2_network_plot(.matrix_graph = partition_list$partitioned_graph_matrix,
                .subject_names = names_subjects,
                .subject_class_names = true_region,
                .class_colors = class_palette_colors,
                .class_shapes = class_palette_shapes,
                .node_size = 2,
                .add_node_labels = TRUE)
#> Warning: Duplicated `override.aes` is ignored.

# Visualize the posterior similarity matrix by constructing a graph plot of 
# the one-cluster graph. The true labels are used here (below they are not).
# Remove the subject names with .add_node_labels = FALSE
ggnet2_network_plot(.matrix_graph = partition_list$partitioned_graph_matrix,
                .subject_names = names_subjects,
                .subject_class_names = true_region,
                .class_colors = class_palette_colors,
                .class_shapes = class_palette_shapes,
                .node_size = 2,
                .add_node_labels = FALSE)
#> Warning: Duplicated `override.aes` is ignored.

# Construct a network plot without class labels
# Note: Subject labels may be suppressed using .add_node_labels = FALSE.  
ggnet2_network_plot(.matrix_graph = partition_list$partitioned_graph_matrix,
                .subject_names = names_subjects,
                .node_size = 2,
                .add_node_labels = TRUE)