Skip to contents

This function combines the dissimilarity scores computed by distantia(), the K-means clustering method implemented in stats::kmeans(), and the clustering optimization method implemented in utils_cluster_kmeans_optimizer() to help group together time series with similar features.

When clusters = NULL, the function utils_cluster_kmeans_optimizer() is run under the hood to perform a parallelized grid search to find the number of clusters maximizing the overall silhouette width of the clustering solution (see utils_cluster_silhouette()). The grid search function supports parallelization via future::plan() and a progress bar generated by the progressr package (see Examples).

Usage

distantia_cluster_kmeans(df = NULL, clusters = NULL, seed = 1)

Arguments

df

(required, data frame) Output of distantia(). Default: NULL

clusters

(optional, integer) Number of groups to generate. If NULL (default), utils_cluster_kmeans_optimizer() is used to find the number of clusters that maximizes the mean silhouette width of the clustering solution (see utils_cluster_silhouette()). Default: NULL

seed

(optional, integer) Random seed to be used during the K-means computation. Default: 1

Value

list:

  • cluster_object: kmeans object for further analyses and custom plotting.

  • clusters: integer, number of clusters.

  • silhouette_width: mean silhouette width of the clustering solution.

  • df: data frame with time series names, their cluster label, and their individual silhouette width scores.

  • d: psi distance matrix used for clustering.

  • optimization: only if clusters = NULL, data frame with optimization results from utils_cluster_kmeans_optimizer().

Examples

#for large datasets, parallelization accelerates cluster optimization
future::plan(
  future::multisession,
  workers = 2 #set to parallelly::availableCores() - 1
)

#progress bar
# progressr::handlers(global = TRUE)

#weekly covid prevalence in California counties
data("covid_prevalence")

#load as tsl
tsl <- tsl_initialize(
  x = covid_prevalence,
  name_column = "name",
  time_column = "time"
)

#subset 10 elements to accelerate example execution
tsl <- tsl_subset(
  tsl = tsl,
  names = 1:10
)

#aggregate to monthly data to accelerate example execution
tsl <- tsl_aggregate(
  tsl = tsl,
  new_time = "months",
  fun = sum
)

if(interactive()){
  #plotting first three time series
  tsl_plot(
    tsl = tsl[1:3],
    guide_columns = 3
    )
}

#dissimilarity analysis
distantia_df <- distantia(
  tsl = tsl,
  lock_step = TRUE
)

#kmeans with a given number of clusters
#-------------------------------------------------------
distantia_kmeans <- distantia_cluster_kmeans(
  df = distantia_df,
  clusters = 5 #arbitrary number!
)

#names of the output object
names(distantia_kmeans)
#> [1] "cluster_object"   "clusters"         "silhouette_width" "df"              
#> [5] "d"               

#kmeans object
distantia_kmeans$cluster_object
#> K-means clustering with 5 clusters of sizes 2, 1, 3, 2, 2
#> 
#> Cluster means:
#>     Alameda     Butte Contra_Costa El_Dorado    Fresno Humboldt Imperial
#> 1 1.7193354 1.6674397    1.5978268 1.7985152 1.1967939 1.643871 0.338087
#> 2 1.4417508 1.4913793    1.3337416 1.2920760 1.5195354 0.000000 1.521210
#> 3 1.1300314 1.0609567    0.9841811 1.3967828 0.4565685 1.469502 1.174013
#> 4 0.1825810 0.9348728    0.1825810 0.8852177 1.0058773 1.387746 1.526978
#> 5 0.9716057 0.4151976    0.8484848 0.4151976 1.2520998 1.391728 1.623461
#>        Kern    Kings Los_Angeles
#> 1 1.2976549 0.338087   1.2201040
#> 2 1.3363140 1.766532   1.5526553
#> 3 0.5424374 1.302356   0.5402588
#> 4 1.0717993 1.790184   1.0936422
#> 5 1.0563879 1.842494   1.3781215
#> 
#> Clustering vector:
#>      Alameda        Butte Contra_Costa    El_Dorado       Fresno     Humboldt 
#>            4            5            4            5            3            2 
#>     Imperial         Kern        Kings  Los_Angeles 
#>            1            3            1            3 
#> 
#> Within cluster sum of squares by cluster:
#> [1] 0.6437771 0.0000000 1.4301178 0.2126394 0.9167289
#>  (between_SS / total_SS =  86.0 %)
#> 
#> Available components:
#> 
#> [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
#> [6] "betweenss"    "size"         "iter"         "ifault"      

#distance matrix used for clustering
distantia_kmeans$d
#>                Alameda     Butte Contra_Costa El_Dorado    Fresno Humboldt
#> Alameda      0.0000000 1.0467873    0.3651620 0.8964242 1.0683761 1.441751
#> Butte        1.0467873 0.0000000    0.8229583 0.8303951 1.0541750 1.491379
#> Contra_Costa 0.3651620 0.8229583    0.0000000 0.8740113 0.9433786 1.333742
#> El_Dorado    0.8964242 0.8303951    0.8740113 0.0000000 1.4500246 1.292076
#> Fresno       1.0683761 1.0541750    0.9433786 1.4500246 0.0000000 1.519535
#> Humboldt     1.4417508 1.4913793    1.3337416 1.2920760 1.5195354 0.000000
#> Imperial     1.5714863 1.5658263    1.4824695 1.6810961 1.1057192 1.521210
#> Kern         1.1551233 0.8667292    0.9884752 1.2460465 0.6881208 1.336314
#> Kings        1.8671845 1.7690531    1.7131840 1.9159343 1.2878686 1.766532
#> Los_Angeles  1.1665947 1.2619658    1.0206897 1.4942772 0.6815847 1.552655
#>              Imperial      Kern    Kings Los_Angeles
#> Alameda      1.571486 1.1551233 1.867184   1.1665947
#> Butte        1.565826 0.8667292 1.769053   1.2619658
#> Contra_Costa 1.482470 0.9884752 1.713184   1.0206897
#> El_Dorado    1.681096 1.2460465 1.915934   1.4942772
#> Fresno       1.105719 0.6881208 1.287869   0.6815847
#> Humboldt     1.521210 1.3363140 1.766532   1.5526553
#> Imperial     0.000000 1.1944075 0.676174   1.2219110
#> Kern         1.194407 0.0000000 1.400902   0.9391916
#> Kings        0.676174 1.4009024 0.000000   1.2182971
#> Los_Angeles  1.221911 0.9391916 1.218297   0.0000000
#> attr(,"distantia_args")
#>    distance diagonal weighted ignore_blocks lock_step group
#> 1 euclidean     TRUE     TRUE         FALSE      TRUE     1
#> attr(,"type")
#> [1] "distantia_matrix"
#> attr(,"distance")
#> [1] "psi"

#number of clusters
distantia_kmeans$clusters
#> [1] 5

#clustering data frame
#group label in column "cluster"
distantia_kmeans$df
#>            name cluster silhouette_width
#> 1       Alameda       4        0.6241664
#> 2         Butte       5        0.1117560
#> 3  Contra_Costa       4        0.5696304
#> 4     El_Dorado       5        0.0619312
#> 5        Fresno       3        0.3191488
#> 6      Humboldt       2        0.0000000
#> 7      Imperial       1        0.4240487
#> 8          Kern       3        0.2297752
#> 9         Kings       1        0.4808071
#> 10  Los_Angeles       3        0.2590007

#mean silhouette width of the clustering solution
distantia_kmeans$silhouette_width
#> [1] 0.3080265

#kmeans plot
# factoextra::fviz_cluster(
#   object = distantia_kmeans$cluster_object,
#   data = distantia_kmeans$d,
#   repel = TRUE
# )

#optimized kmeans
#---------------------------------
#auto-optimization of clusters and method
distantia_kmeans <- distantia_cluster_kmeans(
  df = distantia_df,
  clusters = NULL
)

#names of the output object
#a new object named "optimization" should appear
names(distantia_kmeans)
#> [1] "cluster_object"   "clusters"         "silhouette_width" "df"              
#> [5] "d"                "optimization"    

#first rows of the optimization data frame
#optimized clustering in first row
head(distantia_kmeans$optimization)
#>   clusters silhouette_mean
#> 1        2       0.3175009
#> 2        5       0.3080265
#> 3        4       0.2970758
#> 4        3       0.2618570
#> 5        6       0.2460827
#> 6        7       0.2312105

#disable parallelization
future::plan(
  future::sequential
)