Generates model formulas from a dataframe, a response name, and a vector of predictors that can be the output of a multicollinearity management function such as collinear_select() and the likes. Intended to help fit exploratory models from the result of a multicollinearity analysis.
The types of formulas it can generate are:
additive:
y ~ x + zpolynomial:
y ~ poly(x, ...) + poly(z, ...)GAM:
y ~ s(x) + s(z)random effect:
y ~ x + (1 \ z)
Usage
model_formula(
df = NULL,
response = NULL,
predictors = NULL,
term_f = NULL,
term_args = NULL,
random_effects = NULL,
quiet = FALSE,
...
)Arguments
- df
(required; dataframe, tibble, or sf) A dataframe with responses (optional) and predictors. Must have at least 10 rows for pairwise correlation analysis, and
10 * (length(predictors) - 1)for VIF. Default: NULL.- response
(optional, character string) Name of a response variable in
df. Default: NULL.- predictors
(optional; character vector or NULL) Names of the predictors in
df. If NULL, all columns exceptresponsesand constant/near-zero-variance columns are used. Default: NULL.- term_f
(optional; string). Name of function to apply to each term in the formula, such as "s" for
mgcv::s()or any other smoothing function, "poly" forstats::poly(). Default: NULL- term_args
(optional; string). Arguments of the function applied to each term. For example, for "poly" it can be "degree = 2, raw = TRUE". Default: NULL
- random_effects
(optional, string or character vector). Names of variables to be used as random effects. Each element is added to the final formula as
+(1 | random_effect_name). Default: NULL- quiet
(optional; logical) If FALSE, messages are printed. Default: FALSE.
- ...
(optional) Internal args (e.g.
function_nameforvalidate_arg_function_name, a precomputed correlation matrixm, or cross-validation args forpreference_order).
Value
list if predictors is a list or length of response is higher than one, and character vector otherwise.
See also
Other modelling_tools:
case_weights(),
score_auc(),
score_cramer(),
score_r2()
Examples
data(vi_smol, package = "spatialData")
data(vi_predictors, package = "spatialData")
vi_predictors_numeric <- identify_numeric_variables(
df = vi_smol,
predictors = vi_predictors
)$valid
#reduce collinearity
x <- collinear_select(
df = vi_smol,
predictors = vi_predictors_numeric
)
#>
#> collinear::collinear_select()
#> └── collinear::validate_arg_preference_order()
#> └── collinear::preference_order(): ranking 47 'predictors' from lower to higher multicollinearity.
#additive formula
y <- model_formula(
df = vi_smol,
response = "vi_numeric",
predictors = x
)
y
#> vi_numeric ~ topo_elevation + humidity_range + topo_diversity +
#> topo_slope + soil_clay + cloud_cover_range + soil_silt +
#> rainfall_min + growing_season_temperature + swi_max + soil_nitrogen +
#> evapotranspiration_range
#> <environment: 0x55e8ba899028>
#using a formula in a model
m <- stats::lm(
formula = y,
data = vi_smol
)
summary(m)
#>
#> Call:
#> stats::lm(formula = y, data = vi_smol)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -0.43415 -0.06399 -0.00387 0.06028 0.28264
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 2.881e-01 5.376e-02 5.359 1.22e-07 ***
#> topo_elevation -6.188e-05 8.563e-06 -7.227 1.61e-12 ***
#> humidity_range -7.066e-03 7.210e-04 -9.800 < 2e-16 ***
#> topo_diversity 4.616e-03 1.053e-03 4.385 1.38e-05 ***
#> topo_slope 1.544e-03 1.664e-03 0.928 0.354033
#> soil_clay 9.599e-04 5.623e-04 1.707 0.088324 .
#> cloud_cover_range 1.457e-03 3.878e-04 3.758 0.000189 ***
#> soil_silt -1.000e-03 4.447e-04 -2.249 0.024927 *
#> rainfall_min 3.613e-04 1.013e-04 3.567 0.000392 ***
#> growing_season_temperature -2.778e-03 1.052e-03 -2.641 0.008495 **
#> swi_max 4.817e-03 2.792e-04 17.250 < 2e-16 ***
#> soil_nitrogen 2.561e-03 4.217e-03 0.607 0.543967
#> evapotranspiration_range -1.375e-03 1.440e-04 -9.546 < 2e-16 ***
#> ---
#> Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#>
#> Residual standard error: 0.0966 on 567 degrees of freedom
#> Multiple R-squared: 0.7946, Adjusted R-squared: 0.7903
#> F-statistic: 182.8 on 12 and 567 DF, p-value: < 2.2e-16
#>
#classification formula (character response)
y <- model_formula(
df = vi_smol,
response = "vi_categorical",
predictors = x
)
y
#> vi_categorical ~ topo_elevation + humidity_range + topo_diversity +
#> topo_slope + soil_clay + cloud_cover_range + soil_silt +
#> rainfall_min + growing_season_temperature + swi_max + soil_nitrogen +
#> evapotranspiration_range
#> <environment: 0x55e8ba899028>
#polynomial formula (3rd degree)
y <- model_formula(
df = vi_smol,
response = "vi_numeric",
predictors = x,
term_f = "poly",
term_args = "degree = 3, raw = TRUE"
)
y
#> vi_numeric ~ poly(topo_elevation, degree = 3, raw = TRUE) + poly(humidity_range,
#> degree = 3, raw = TRUE) + poly(topo_diversity, degree = 3,
#> raw = TRUE) + poly(topo_slope, degree = 3, raw = TRUE) +
#> poly(soil_clay, degree = 3, raw = TRUE) + poly(cloud_cover_range,
#> degree = 3, raw = TRUE) + poly(soil_silt, degree = 3, raw = TRUE) +
#> poly(rainfall_min, degree = 3, raw = TRUE) + poly(growing_season_temperature,
#> degree = 3, raw = TRUE) + poly(swi_max, degree = 3, raw = TRUE) +
#> poly(soil_nitrogen, degree = 3, raw = TRUE) + poly(evapotranspiration_range,
#> degree = 3, raw = TRUE)
#> <environment: 0x55e8ba899028>
#gam formula
y <- model_formula(
df = vi_smol,
response = "vi_numeric",
predictors = x,
term_f = "s"
)
y
#> vi_numeric ~ s(topo_elevation) + s(humidity_range) + s(topo_diversity) +
#> s(topo_slope) + s(soil_clay) + s(cloud_cover_range) + s(soil_silt) +
#> s(rainfall_min) + s(growing_season_temperature) + s(swi_max) +
#> s(soil_nitrogen) + s(evapotranspiration_range)
#> <environment: 0x55e8ba899028>
#random effect
y <- model_formula(
df = vi_smol,
response = "vi_numeric",
predictors = x,
random_effects = "country_name" #from vi_smol$country_name
)
y
#> vi_numeric ~ topo_elevation + humidity_range + topo_diversity +
#> topo_slope + soil_clay + cloud_cover_range + soil_silt +
#> rainfall_min + growing_season_temperature + swi_max + soil_nitrogen +
#> evapotranspiration_range + (1 | country_name)
#> <environment: 0x55e8ba899028>
