Skip to contents

This function combines the dissimilarity scores computed by distantia(), the agglomerative clustering methods provided by stats::hclust(), and the clustering optimization method implemented in utils_cluster_hclust_optimizer() to help group together time series with similar features.

When clusters = NULL, the function utils_cluster_hclust_optimizer() is run underneath to perform a parallelized grid search to find the number of clusters maximizing the overall silhouette width of the clustering solution (see utils_cluster_silhouette()). When method = NULL as well, the optimization also includes all methods available in stats::hclust() in the grid search. The grid search function supports parallelization via future::plan() and a progress bar generated by the progressr package (see Examples).

Usage

distantia_cluster_hclust(df = NULL, clusters = NULL, method = "complete")

Arguments

df

(required, data frame) Output of distantia(). Default: NULL

clusters

(optional, integer) Number of groups to generate. If NULL (default), utils_cluster_hclust_optimizer() is used to find the number of clusters that maximizes the mean silhouette width of the clustering solution (see utils_cluster_silhouette()). Default: NULL

method

(optional, character string) Argument of stats::hclust() defining the agglomerative method. One of: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC). Unambiguous abbreviations are accepted as well. If NULL, utils_cluster_hclust_optimizer() finds the optimal method. Default: "complete".

Value

list:

  • cluster_object: hclust object for further analyses and custom plotting.

  • clusters: integer, number of clusters.

  • silhouette_width: mean silhouette width of the clustering solution.

  • df: data frame with time series names, their cluster label, and their individual silhouette width scores.

  • d: psi distance matrix used for clustering.

  • optimization: only if clusters = NULL, data frame with optimization results from utils_cluster_hclust_optimizer().

Examples

#parallelization accelerates cluster optimization for large datasets
#workers: set to parallelly::availableWorkers() - 1
future::plan(future::multisession, workers = 2)

#progress bar
# progressr::handlers(global = TRUE)

#example data: weekly covid prevalence in California counties
data("covid_prevalence")

#load as tsl
tsl <- tsl_initialize(
  x = covid_prevalence, name_column = "name", time_column = "time"
)

#keep 10 elements to accelerate example execution
tsl <- tsl_subset(tsl = tsl, names = 1:10)

#aggregate to monthly data to accelerate example execution
tsl <- tsl_aggregate(tsl = tsl, new_time = "months", fun = sum)

#plot the first three time series (interactive sessions only)
if (interactive()) {
  tsl_plot(
    tsl = tsl_subset(tsl = tsl, names = 1:3),
    guide_columns = 3
  )
}

#dissimilarity analysis of the time series in tsl
distantia_df <- distantia(tsl = tsl, lock_step = TRUE)

#hierarchical clustering with a given number of clusters
#-------------------------------------------------------
#clusters = 5 is an arbitrary number!
distantia_clust <- distantia_cluster_hclust(
  df = distantia_df, clusters = 5, method = "complete"
)

#names of the elements of the output list
#("optimization" is not returned because clusters was not NULL)
names(distantia_clust)
#> [1] "cluster_object"   "clusters"         "silhouette_width" "df"              
#> [5] "d"               

#cluster object (class hclust) for further analyses and custom plotting
distantia_clust$cluster_object
#> 
#> Call:
#> stats::hclust(d = d_dist, method = method)
#> 
#> Cluster method   : complete 
#> Number of objects: 10 
#> 

#psi distance matrix used for clustering
distantia_clust$d
#>                Alameda     Butte Contra_Costa El_Dorado    Fresno  Humboldt
#> Butte        1.0467873                                                     
#> Contra_Costa 0.3651620 0.8229583                                           
#> El_Dorado    0.8964242 0.8303951    0.8740113                              
#> Fresno       1.0683761 1.0541750    0.9433786 1.4500246                    
#> Humboldt     1.4417508 1.4913793    1.3337416 1.2920760 1.5195354          
#> Imperial     1.5714863 1.5658263    1.4824695 1.6810961 1.1057192 1.5212096
#> Kern         1.1551233 0.8667292    0.9884752 1.2460465 0.6881208 1.3363140
#> Kings        1.8671845 1.7690531    1.7131840 1.9159343 1.2878686 1.7665323
#> Los_Angeles  1.1665947 1.2619658    1.0206897 1.4942772 0.6815847 1.5526553
#>               Imperial      Kern     Kings
#> Butte                                     
#> Contra_Costa                              
#> El_Dorado                                 
#> Fresno                                    
#> Humboldt                                  
#> Imperial                                  
#> Kern         1.1944075                    
#> Kings        0.6761740 1.4009024          
#> Los_Angeles  1.2219110 0.9391916 1.2182971

#number of clusters (here, the value passed to the argument "clusters")
distantia_clust$clusters
#> [1] 5

#clustering data frame
#group label in column "cluster"
#negative values in column "silhouette_width" highlight anomalous cluster assignments
distantia_clust$df
#>            name cluster silhouette_width
#> 1       Alameda       1        0.6241664
#> 2         Butte       2        0.1117560
#> 3  Contra_Costa       1        0.5696304
#> 4     El_Dorado       2        0.0619312
#> 5        Fresno       3        0.3191488
#> 6      Humboldt       4        0.0000000
#> 7      Imperial       5        0.4240487
#> 8          Kern       3        0.2297752
#> 9         Kings       5        0.4808071
#> 10  Los_Angeles       3        0.2590007

#mean silhouette width of the clustering solution
#(average of the column "silhouette_width" in the clustering data frame)
distantia_clust$silhouette_width
#> [1] 0.3080265

#plot dendrogram and highlight groups (interactive sessions only)
if (interactive()) {

  #clustering solution and its number of groups
  clust <- distantia_clust[["cluster_object"]]
  k <- distantia_clust[["clusters"]]

  #tree plot
  plot(x = clust, hang = -1)

  #rectangles around the k groups
  stats::rect.hclust(
    tree = clust,
    k = k,
    cluster = stats::cutree(tree = clust, k = k)
  )

}


#optimized hierarchical clustering
#---------------------------------

#clusters = NULL and method = NULL: both are selected by the optimizer
distantia_clust <- distantia_cluster_hclust(
  df = distantia_df, clusters = NULL, method = NULL
)

#names of the elements of the output list
#a new element named "optimization" appears because clusters = NULL
names(distantia_clust)
#> [1] "cluster_object"   "clusters"         "silhouette_width" "df"              
#> [5] "d"                "optimization"    

#first rows of the optimization data frame
#one row per combination of clusters and method
#optimized clustering (highest silhouette_mean) in first row
head(distantia_clust$optimization)
#>   clusters   method silhouette_mean
#> 1        2  average       0.3175009
#> 2        2 mcquitty       0.3175009
#> 3        2   median       0.3175009
#> 4        5   ward.D       0.3080265
#> 5        5  ward.D2       0.3080265
#> 6        5 complete       0.3080265

#plot dendrogram and highlight groups (interactive sessions only)
if (interactive()) {

  #optimized clustering solution and its number of groups
  clust <- distantia_clust[["cluster_object"]]
  k <- distantia_clust[["clusters"]]

  #tree plot
  plot(x = clust, hang = -1)

  #rectangles around the k groups
  stats::rect.hclust(
    tree = clust,
    k = k,
    cluster = stats::cutree(tree = clust, k = k)
  )

}

#disable parallelization (back to sequential execution)
future::plan(future::sequential)