Hierarchical Clustering of Dissimilarity Analysis Data Frames
Source: R/distantia_cluster_hclust.R
distantia_cluster_hclust.Rd
This function combines the dissimilarity scores computed by distantia()
, the agglomerative clustering methods provided by stats::hclust()
, and the clustering optimization method implemented in utils_cluster_hclust_optimizer()
to help group together time series with similar features.
When clusters = NULL
, the function utils_cluster_hclust_optimizer()
is run underneath to perform a parallelized grid search to find the number of clusters maximizing the overall silhouette width of the clustering solution (see utils_cluster_silhouette()
). When method = NULL
as well, the optimization also includes all methods available in stats::hclust()
in the grid search. The grid search function supports parallelization via future::plan()
and a progress bar generated by the progressr
package (see Examples).
Arguments
- df
(required, data frame) Output of
distantia()
. Default: NULL
- clusters
(required, integer) Number of groups to generate. If NULL (default),
utils_cluster_hclust_optimizer()
is used to find the number of clusters that maximizes the mean silhouette width of the clustering solution (see utils_cluster_silhouette()
). Default: NULL
- method
(optional, character string) Argument of
stats::hclust()
defining the agglomerative method. One of: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC). Unambiguous abbreviations are accepted as well. If NULL, utils_cluster_hclust_optimizer()
finds the optimal method. Default: "complete".
Value
list:
cluster_object
: hclust object for further analyses and custom plotting.
clusters
: integer, number of clusters.
silhouette_width
: mean silhouette width of the clustering solution.
df
: data frame with time series names, their cluster label, and their individual silhouette width scores.
d
: psi distance matrix used for clustering.
optimization
: only if clusters = NULL
, data frame with optimization results from utils_cluster_hclust_optimizer()
.
See also
Other dissimilarity_analysis:
distantia_aggregate()
,
distantia_boxplot()
,
distantia_cluster_kmeans()
,
distantia_matrix()
,
distantia_plot()
,
distantia_to_sf()
Examples
#for large datasets, parallelization accelerates cluster optimization
future::plan(
future::multisession,
workers = 2 #set to parallelly::availableCores() - 1
)
#progress bar
# progressr::handlers(global = TRUE)
#weekly covid prevalence in California counties
data("covid_prevalence")
#load as tsl
tsl <- tsl_initialize(
x = covid_prevalence,
name_column = "name",
time_column = "time"
)
#subset 10 elements to accelerate example execution
tsl <- tsl_subset(
tsl = tsl,
names = 1:10
)
#aggregate to monthly data to accelerate example execution
tsl <- tsl_aggregate(
tsl = tsl,
new_time = "months",
fun = sum
)
if(interactive()){
#plotting first three time series
tsl_plot(
tsl = tsl_subset(
tsl = tsl,
names = 1:3
),
guide_columns = 3
)
}
#dissimilarity analysis
distantia_df <- distantia(
tsl = tsl,
lock_step = TRUE
)
#hierarchical clustering with a given number of clusters
#-------------------------------------------------------
distantia_clust <- distantia_cluster_hclust(
df = distantia_df,
clusters = 5, #arbitrary number!
method = "complete"
)
#names of the output object
names(distantia_clust)
#> [1] "cluster_object" "clusters" "silhouette_width" "df"
#> [5] "d"
#cluster object
distantia_clust$cluster_object
#>
#> Call:
#> stats::hclust(d = d_dist, method = method)
#>
#> Cluster method : complete
#> Number of objects: 10
#>
#distance matrix used for clustering
distantia_clust$d
#> Alameda Butte Contra_Costa El_Dorado Fresno Humboldt
#> Butte 1.0467873
#> Contra_Costa 0.3651620 0.8229583
#> El_Dorado 0.8964242 0.8303951 0.8740113
#> Fresno 1.0683761 1.0541750 0.9433786 1.4500246
#> Humboldt 1.4417508 1.4913793 1.3337416 1.2920760 1.5195354
#> Imperial 1.5714863 1.5658263 1.4824695 1.6810961 1.1057192 1.5212096
#> Kern 1.1551233 0.8667292 0.9884752 1.2460465 0.6881208 1.3363140
#> Kings 1.8671845 1.7690531 1.7131840 1.9159343 1.2878686 1.7665323
#> Los_Angeles 1.1665947 1.2619658 1.0206897 1.4942772 0.6815847 1.5526553
#> Imperial Kern Kings
#> Butte
#> Contra_Costa
#> El_Dorado
#> Fresno
#> Humboldt
#> Imperial
#> Kern 1.1944075
#> Kings 0.6761740 1.4009024
#> Los_Angeles 1.2219110 0.9391916 1.2182971
#number of clusters
distantia_clust$clusters
#> [1] 5
#clustering data frame
#group label in column "cluster"
#negatives in column "silhouette_width" highlight anomalous cluster assignment
distantia_clust$df
#> name cluster silhouette_width
#> 1 Alameda 1 0.6241664
#> 2 Butte 2 0.1117560
#> 3 Contra_Costa 1 0.5696304
#> 4 El_Dorado 2 0.0619312
#> 5 Fresno 3 0.3191488
#> 6 Humboldt 4 0.0000000
#> 7 Imperial 5 0.4240487
#> 8 Kern 3 0.2297752
#> 9 Kings 5 0.4808071
#> 10 Los_Angeles 3 0.2590007
#mean silhouette width of the clustering solution
distantia_clust$silhouette_width
#> [1] 0.3080265
#plot
if(interactive()){
clust <- distantia_clust$cluster_object
k <- distantia_clust$clusters
#tree plot
plot(
x = clust,
hang = -1
)
#highlight groups
stats::rect.hclust(
tree = clust,
k = k,
cluster = stats::cutree(
tree = clust,
k = k
)
)
}
#optimized hierarchical clustering
#---------------------------------
#auto-optimization of clusters and method
distantia_clust <- distantia_cluster_hclust(
df = distantia_df,
clusters = NULL,
method = NULL
)
#names of the output object
#a new object named "optimization" should appear
names(distantia_clust)
#> [1] "cluster_object" "clusters" "silhouette_width" "df"
#> [5] "d" "optimization"
#first rows of the optimization data frame
#optimized clustering in first row
head(distantia_clust$optimization)
#> clusters method silhouette_mean
#> 1 2 average 0.3175009
#> 2 2 mcquitty 0.3175009
#> 3 2 median 0.3175009
#> 4 5 ward.D 0.3080265
#> 5 5 ward.D2 0.3080265
#> 6 5 complete 0.3080265
#plot
if(interactive()){
clust <- distantia_clust$cluster_object
k <- distantia_clust$clusters
#tree plot
plot(
x = clust,
hang = -1
)
#highlight groups
stats::rect.hclust(
tree = clust,
k = k,
cluster = stats::cutree(
tree = clust,
k = k
)
)
}
#disable parallelization
future::plan(
future::sequential
)