Skip to contents

Generates model formulas from a dataframe, a response name, and a vector of predictors, which can be the output of a multicollinearity management function such as collinear_select() and the like. Intended to help fit exploratory models from the result of a multicollinearity analysis.

The types of formulas it can generate are:

  • additive: y ~ x + z

  • polynomial: y ~ poly(x, ...) + poly(z, ...)

  • GAM: y ~ s(x) + s(z)

  • random effect: y ~ x + (1 | z)

Usage

model_formula(
  df = NULL,
  response = NULL,
  predictors = NULL,
  term_f = NULL,
  term_args = NULL,
  random_effects = NULL,
  quiet = FALSE,
  ...
)

Arguments

df

(required; dataframe, tibble, or sf) A dataframe with responses (optional) and predictors. Must have at least 10 rows for pairwise correlation analysis, and at least 10 * (length(predictors) - 1) rows for VIF. Default: NULL.

response

(optional, character string) Name of a response variable in df. Default: NULL.

predictors

(optional; character vector or NULL) Names of the predictors in df. If NULL, all columns except responses and constant/near-zero-variance columns are used. Default: NULL.

term_f

(optional; string). Name of function to apply to each term in the formula, such as "s" for mgcv::s() or any other smoothing function, "poly" for stats::poly(). Default: NULL

term_args

(optional; string). Arguments of the function applied to each term. For example, for "poly" it can be "degree = 2, raw = TRUE". Default: NULL

random_effects

(optional; string or character vector). Names of variables to be used as random effects. Each element is added to the final formula as + (1 | random_effect_name). Default: NULL

quiet

(optional; logical) If FALSE, messages are printed. Default: FALSE.

...

(optional) Internal args (e.g. function_name for validate_arg_function_name, a precomputed correlation matrix m, or cross-validation args for preference_order).

Value

A list if predictors is a list or response has more than one element; otherwise a single formula object (as shown in the examples below, where the result is passed directly to stats::lm()).

See also

Examples

data(vi_smol, package = "spatialData")
data(vi_predictors, package = "spatialData")
vi_predictors_numeric <- identify_numeric_variables(
  df = vi_smol,
  predictors = vi_predictors
)$valid

#reduce collinearity
x <- collinear_select(
  df = vi_smol,
  predictors = vi_predictors_numeric
)
#> 
#> collinear::collinear_select()
#> └── collinear::validate_arg_df()
#>     └── collinear::drop_geometry_column(): dropping geometry column from 'df'.
#> 
#> collinear::collinear_select()
#> └── collinear::validate_arg_preference_order()
#>     └── collinear::preference_order(): ranking 47 'predictors' from lower to higher multicollinearity.

#additive formula
y <- model_formula(
  df = vi_smol,
  response = "vi_numeric",
  predictors = x
)
#> 
#> collinear::model_formula()
#> └── collinear::drop_geometry_column(): dropping geometry column from 'df'.

y
#> vi_numeric ~ topo_elevation + topo_slope + soil_clay + topo_diversity + 
#>     humidity_range + cloud_cover_range + swi_range + soil_silt + 
#>     rainfall_min + rainfall_range + soil_soc + swi_mean + solar_rad_range + 
#>     solar_rad_max + temperature_max
#> <environment: 0x559e835c1ad0>

#using a formula in a model
m <- stats::lm(
 formula = y,
 data = vi_smol
 )

summary(m)
#> 
#> Call:
#> stats::lm(formula = y, data = vi_smol)
#> 
#> Residuals:
#>       Min        1Q    Median        3Q       Max 
#> -0.280253 -0.053727  0.004544  0.053260  0.268436 
#> 
#> Coefficients:
#>                     Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)        1.407e-01  6.682e-02   2.106 0.035645 *  
#> topo_elevation    -2.859e-05  8.769e-06  -3.260 0.001180 ** 
#> topo_slope         3.367e-03  1.319e-03   2.553 0.010940 *  
#> soil_clay         -1.859e-04  5.068e-04  -0.367 0.713921    
#> topo_diversity     4.302e-03  9.956e-04   4.322 1.83e-05 ***
#> humidity_range    -4.322e-03  6.997e-04  -6.177 1.25e-09 ***
#> cloud_cover_range  7.861e-04  3.645e-04   2.157 0.031427 *  
#> swi_range          2.869e-03  4.160e-04   6.897 1.43e-11 ***
#> soil_silt         -1.354e-03  3.848e-04  -3.519 0.000467 ***
#> rainfall_min       7.734e-04  1.243e-04   6.223 9.54e-10 ***
#> rainfall_range     1.297e-04  3.911e-05   3.316 0.000971 ***
#> soil_soc           3.653e-04  2.260e-04   1.616 0.106588    
#> swi_mean           5.618e-03  4.375e-04  12.840  < 2e-16 ***
#> solar_rad_range   -2.840e-03  1.091e-03  -2.603 0.009477 ** 
#> solar_rad_max      1.744e-03  1.940e-03   0.899 0.369004    
#> temperature_max   -3.778e-03  1.357e-03  -2.784 0.005544 ** 
#> ---
#> Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#> 
#> Residual standard error: 0.08573 on 564 degrees of freedom
#> Multiple R-squared:  0.7792,	Adjusted R-squared:  0.7733 
#> F-statistic: 132.7 on 15 and 564 DF,  p-value: < 2.2e-16
#> 

#classification formula (character response)
y <- model_formula(
  df = vi_smol,
  response = "vi_categorical",
  predictors = x
)
#> 
#> collinear::model_formula()
#> └── collinear::drop_geometry_column(): dropping geometry column from 'df'.

y
#> vi_categorical ~ topo_elevation + topo_slope + soil_clay + topo_diversity + 
#>     humidity_range + cloud_cover_range + swi_range + soil_silt + 
#>     rainfall_min + rainfall_range + soil_soc + swi_mean + solar_rad_range + 
#>     solar_rad_max + temperature_max
#> <environment: 0x559e83bba510>


#polynomial formula (3rd degree)
y <- model_formula(
  df = vi_smol,
  response = "vi_numeric",
  predictors = x,
  term_f = "poly",
  term_args = "degree = 3, raw = TRUE"
)
#> 
#> collinear::model_formula()
#> └── collinear::drop_geometry_column(): dropping geometry column from 'df'.

y
#> vi_numeric ~ poly(topo_elevation, degree = 3, raw = TRUE) + poly(topo_slope, 
#>     degree = 3, raw = TRUE) + poly(soil_clay, degree = 3, raw = TRUE) + 
#>     poly(topo_diversity, degree = 3, raw = TRUE) + poly(humidity_range, 
#>     degree = 3, raw = TRUE) + poly(cloud_cover_range, degree = 3, 
#>     raw = TRUE) + poly(swi_range, degree = 3, raw = TRUE) + poly(soil_silt, 
#>     degree = 3, raw = TRUE) + poly(rainfall_min, degree = 3, 
#>     raw = TRUE) + poly(rainfall_range, degree = 3, raw = TRUE) + 
#>     poly(soil_soc, degree = 3, raw = TRUE) + poly(swi_mean, degree = 3, 
#>     raw = TRUE) + poly(solar_rad_range, degree = 3, raw = TRUE) + 
#>     poly(solar_rad_max, degree = 3, raw = TRUE) + poly(temperature_max, 
#>     degree = 3, raw = TRUE)
#> <environment: 0x559e84842378>

#gam formula
y <- model_formula(
  df = vi_smol,
  response = "vi_numeric",
  predictors = x,
  term_f = "s"
)
#> 
#> collinear::model_formula()
#> └── collinear::drop_geometry_column(): dropping geometry column from 'df'.

y
#> vi_numeric ~ s(topo_elevation) + s(topo_slope) + s(soil_clay) + 
#>     s(topo_diversity) + s(humidity_range) + s(cloud_cover_range) + 
#>     s(swi_range) + s(soil_silt) + s(rainfall_min) + s(rainfall_range) + 
#>     s(soil_soc) + s(swi_mean) + s(solar_rad_range) + s(solar_rad_max) + 
#>     s(temperature_max)
#> <environment: 0x559e82c3f740>

#random effect
y <- model_formula(
  df = vi_smol,
  response = "vi_numeric",
  predictors = x,
  random_effects = "country_name" #from vi_smol$country_name
)
#> 
#> collinear::model_formula()
#> └── collinear::drop_geometry_column(): dropping geometry column from 'df'.

y
#> vi_numeric ~ topo_elevation + topo_slope + soil_clay + topo_diversity + 
#>     humidity_range + cloud_cover_range + swi_range + soil_silt + 
#>     rainfall_min + rainfall_range + soil_soc + swi_mean + solar_rad_range + 
#>     solar_rad_max + temperature_max + (1 | country_name)
#> <environment: 0x559e8606f1d0>