K-Means Clustering of Dissimilarity Analysis Data Frames
Source:R/distantia_cluster_kmeans.R
distantia_cluster_kmeans.Rd
This function combines the dissimilarity scores computed by distantia()
, the K-means clustering method implemented in stats::kmeans()
, and the clustering optimization method implemented in utils_cluster_hclust_optimizer()
to help group together time series with similar features.
When clusters = NULL
, the function utils_cluster_hclust_optimizer()
is run underneath to perform a parallelized grid search to find the number of clusters maximizing the overall silhouette width of the clustering solution (see utils_cluster_silhouette()
). The grid search function supports parallelization via future::plan()
and a progress bar generated by the progressr
package (see Examples).
Arguments
- df
(required, data frame) Output of
distantia()
. Default: NULL- clusters
(required, integer) Number of groups to generate. If NULL (default),
utils_cluster_kmeans_optimizer()
is used to find the number of clusters that maximizes the mean silhouette width of the clustering solution (seeutils_cluster_silhouette()
). Default: NULL- seed
(optional, integer) Random seed to be used during the K-means computation. Default: 1
Value
list:
cluster_object
: kmeans object object for further analyses and custom plotting.clusters
: integer, number of clusters.silhouette_width
: mean silhouette width of the clustering solution.df
: data frame with time series names, their cluster label, and their individual silhouette width scores.d
: psi distance matrix used for clustering.optimization
: only ifclusters = NULL
, data frame with optimization results fromutils_cluster_hclust_optimizer()
.
See also
Other dissimilarity_analysis:
distantia_aggregate()
,
distantia_boxplot()
,
distantia_cluster_hclust()
,
distantia_matrix()
,
distantia_plot()
,
distantia_to_sf()
Examples
#for large datasets, parallelization accelerates cluster optimization
future::plan(
future::multisession,
workers = 2 #set to parallelly::availableWorkers() - 1
)
#progress bar
# progressr::handlers(global = TRUE)
#weekly covid prevalence in California counties
data("covid_prevalence")
#load as tsl and aggregate to monthly data to accelerate example execution
tsl <- tsl_initialize(
x = covid_prevalence,
name_column = "name",
time_column = "time"
)
#subset 10 elements to accelerate example execution
tsl <- tsl_subset(
tsl = tsl,
names = 1:10
)
#aggregateto monthly data to accelerate example execution
tsl <- tsl_aggregate(
tsl = tsl,
new_time = "months",
fun = sum
)
if(interactive()){
#plotting first three time series
tsl_plot(
tsl = tsl[1:3],
guide_columns = 3
)
}
#dissimilarity analysis
distantia_df <- distantia(
tsl = tsl,
lock_step = TRUE
)
#kmeans with a given number of clusters
#-------------------------------------------------------
distantia_kmeans <- distantia_cluster_kmeans(
df = distantia_df,
clusters = 5 #arbitrary number!
)
#names of the output object
names(distantia_kmeans)
#> [1] "cluster_object" "clusters" "silhouette_width" "df"
#> [5] "d"
#kmeans object
distantia_kmeans$cluster_object
#> K-means clustering with 5 clusters of sizes 2, 1, 3, 2, 2
#>
#> Cluster means:
#> Alameda Butte Contra_Costa El_Dorado Fresno Humboldt Imperial
#> 1 1.7193354 1.6674397 1.5978268 1.7985152 1.1967939 1.643871 0.338087
#> 2 1.4417508 1.4913793 1.3337416 1.2920760 1.5195354 0.000000 1.521210
#> 3 1.1300314 1.0609567 0.9841811 1.3967828 0.4565685 1.469502 1.174013
#> 4 0.1825810 0.9348728 0.1825810 0.8852177 1.0058773 1.387746 1.526978
#> 5 0.9716057 0.4151976 0.8484848 0.4151976 1.2520998 1.391728 1.623461
#> Kern Kings Los_Angeles
#> 1 1.2976549 0.338087 1.2201040
#> 2 1.3363140 1.766532 1.5526553
#> 3 0.5424374 1.302356 0.5402588
#> 4 1.0717993 1.790184 1.0936422
#> 5 1.0563879 1.842494 1.3781215
#>
#> Clustering vector:
#> Alameda Butte Contra_Costa El_Dorado Fresno Humboldt
#> 4 5 4 5 3 2
#> Imperial Kern Kings Los_Angeles
#> 1 3 1 3
#>
#> Within cluster sum of squares by cluster:
#> [1] 0.6437771 0.0000000 1.4301178 0.2126394 0.9167289
#> (between_SS / total_SS = 86.0 %)
#>
#> Available components:
#>
#> [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
#> [6] "betweenss" "size" "iter" "ifault"
#distance matrix used for clustering
distantia_kmeans$d
#> Alameda Butte Contra_Costa El_Dorado Fresno Humboldt
#> Alameda 0.0000000 1.0467873 0.3651620 0.8964242 1.0683761 1.441751
#> Butte 1.0467873 0.0000000 0.8229583 0.8303951 1.0541750 1.491379
#> Contra_Costa 0.3651620 0.8229583 0.0000000 0.8740113 0.9433786 1.333742
#> El_Dorado 0.8964242 0.8303951 0.8740113 0.0000000 1.4500246 1.292076
#> Fresno 1.0683761 1.0541750 0.9433786 1.4500246 0.0000000 1.519535
#> Humboldt 1.4417508 1.4913793 1.3337416 1.2920760 1.5195354 0.000000
#> Imperial 1.5714863 1.5658263 1.4824695 1.6810961 1.1057192 1.521210
#> Kern 1.1551233 0.8667292 0.9884752 1.2460465 0.6881208 1.336314
#> Kings 1.8671845 1.7690531 1.7131840 1.9159343 1.2878686 1.766532
#> Los_Angeles 1.1665947 1.2619658 1.0206897 1.4942772 0.6815847 1.552655
#> Imperial Kern Kings Los_Angeles
#> Alameda 1.571486 1.1551233 1.867184 1.1665947
#> Butte 1.565826 0.8667292 1.769053 1.2619658
#> Contra_Costa 1.482470 0.9884752 1.713184 1.0206897
#> El_Dorado 1.681096 1.2460465 1.915934 1.4942772
#> Fresno 1.105719 0.6881208 1.287869 0.6815847
#> Humboldt 1.521210 1.3363140 1.766532 1.5526553
#> Imperial 0.000000 1.1944075 0.676174 1.2219110
#> Kern 1.194407 0.0000000 1.400902 0.9391916
#> Kings 0.676174 1.4009024 0.000000 1.2182971
#> Los_Angeles 1.221911 0.9391916 1.218297 0.0000000
#> attr(,"distantia_args")
#> distance diagonal weighted ignore_blocks lock_step group
#> 1 euclidean TRUE TRUE FALSE TRUE 1
#> attr(,"type")
#> [1] "distantia_matrix"
#> attr(,"distance")
#> [1] "psi"
#number of clusters
distantia_kmeans$clusters
#> [1] 5
#clustering data frame
#group label in column "cluster"
distantia_kmeans$df
#> name cluster silhouette_width
#> 1 Alameda 4 0.6241664
#> 2 Butte 5 0.1117560
#> 3 Contra_Costa 4 0.5696304
#> 4 El_Dorado 5 0.0619312
#> 5 Fresno 3 0.3191488
#> 6 Humboldt 2 0.0000000
#> 7 Imperial 1 0.4240487
#> 8 Kern 3 0.2297752
#> 9 Kings 1 0.4808071
#> 10 Los_Angeles 3 0.2590007
#mean silhouette width of the clustering solution
distantia_kmeans$silhouette_width
#> [1] 0.3080265
#kmeans plot
# factoextra::fviz_cluster(
# object = distantia_kmeans$cluster_object,
# data = distantia_kmeans$d,
# repel = TRUE
# )
#optimized kmeans
#---------------------------------
#auto-optimization of clusters and method
distantia_kmeans <- distantia_cluster_kmeans(
df = distantia_df,
clusters = NULL
)
#names of the output object
#a new object named "optimization" should appear
names(distantia_kmeans)
#> [1] "cluster_object" "clusters" "silhouette_width" "df"
#> [5] "d" "optimization"
#first rows of the optimization data frame
#optimized clustering in first row
head(distantia_kmeans$optimization)
#> clusters silhouette_mean
#> 1 2 0.3175009
#> 2 5 0.3080265
#> 3 4 0.2970758
#> 4 3 0.2618570
#> 5 6 0.2460827
#> 6 7 0.2312105
#disable parallelization
future::plan(
future::sequential
)