Generates model formulas from a dataframe, a response name, and a vector of predictors that can be the output of a multicollinearity management function such as collinear_select() and the like. Intended to help fit exploratory models from the result of a multicollinearity analysis.
The types of formulas it can generate are:
- additive:
y ~ x + z
- polynomial:
y ~ poly(x, ...) + poly(z, ...)
- GAM:
y ~ s(x) + s(z)
- random effect:
y ~ x + (1 | z)
Usage
model_formula(
df = NULL,
response = NULL,
predictors = NULL,
term_f = NULL,
term_args = NULL,
random_effects = NULL,
quiet = FALSE,
...
)
Arguments
- df
(required; dataframe, tibble, or sf) A dataframe with responses (optional) and predictors. Must have at least 10 rows for pairwise correlation analysis, and
10 * (length(predictors) - 1) for VIF. Default: NULL.
- response
(optional, character string) Name of a response variable in
df. Default: NULL.
- predictors
(optional; character vector or NULL) Names of the predictors in
df. If NULL, all columns except responses and constant/near-zero-variance columns are used. Default: NULL.
- term_f
(optional; string). Name of function to apply to each term in the formula, such as "s" for
mgcv::s() or any other smoothing function, or "poly" for stats::poly(). Default: NULL.
- term_args
(optional; string). Arguments of the function applied to each term. For example, for "poly" it can be "degree = 2, raw = TRUE". Default: NULL
- random_effects
(optional, string or character vector). Names of variables to be used as random effects. Each element is added to the final formula as
+ (1 | random_effect_name). Default: NULL.
- quiet
(optional; logical) If FALSE, messages are printed. Default: FALSE.
- ...
(optional) Internal args (e.g.
function_name for validate_arg_function_name, a precomputed correlation matrix m, or cross-validation args for preference_order).
Value
A list if predictors is a list or if response has more than one element; otherwise, a single formula object (printed with its environment, as in the examples below).
See also
Other modelling_tools:
case_weights(),
score_auc(),
score_cramer(),
score_r2()
Examples
data(vi_smol, package = "spatialData")
data(vi_predictors, package = "spatialData")
vi_predictors_numeric <- identify_numeric_variables(
df = vi_smol,
predictors = vi_predictors
)$valid
#reduce collinearity
x <- collinear_select(
df = vi_smol,
predictors = vi_predictors_numeric
)
#>
#> collinear::collinear_select()
#> └── collinear::validate_arg_df()
#> └── collinear::drop_geometry_column(): dropping geometry column from 'df'.
#>
#> collinear::collinear_select()
#> └── collinear::validate_arg_preference_order()
#> └── collinear::preference_order(): ranking 47 'predictors' from lower to higher multicollinearity.
#additive formula
y <- model_formula(
df = vi_smol,
response = "vi_numeric",
predictors = x
)
#>
#> collinear::model_formula()
#> └── collinear::drop_geometry_column(): dropping geometry column from 'df'.
y
#> vi_numeric ~ topo_elevation + topo_slope + soil_clay + topo_diversity +
#> humidity_range + cloud_cover_range + swi_range + soil_silt +
#> rainfall_min + rainfall_range + soil_soc + swi_mean + solar_rad_range +
#> solar_rad_max + temperature_max
#> <environment: 0x559e835c1ad0>
#using a formula in a model
m <- stats::lm(
formula = y,
data = vi_smol
)
summary(m)
#>
#> Call:
#> stats::lm(formula = y, data = vi_smol)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -0.280253 -0.053727 0.004544 0.053260 0.268436
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 1.407e-01 6.682e-02 2.106 0.035645 *
#> topo_elevation -2.859e-05 8.769e-06 -3.260 0.001180 **
#> topo_slope 3.367e-03 1.319e-03 2.553 0.010940 *
#> soil_clay -1.859e-04 5.068e-04 -0.367 0.713921
#> topo_diversity 4.302e-03 9.956e-04 4.322 1.83e-05 ***
#> humidity_range -4.322e-03 6.997e-04 -6.177 1.25e-09 ***
#> cloud_cover_range 7.861e-04 3.645e-04 2.157 0.031427 *
#> swi_range 2.869e-03 4.160e-04 6.897 1.43e-11 ***
#> soil_silt -1.354e-03 3.848e-04 -3.519 0.000467 ***
#> rainfall_min 7.734e-04 1.243e-04 6.223 9.54e-10 ***
#> rainfall_range 1.297e-04 3.911e-05 3.316 0.000971 ***
#> soil_soc 3.653e-04 2.260e-04 1.616 0.106588
#> swi_mean 5.618e-03 4.375e-04 12.840 < 2e-16 ***
#> solar_rad_range -2.840e-03 1.091e-03 -2.603 0.009477 **
#> solar_rad_max 1.744e-03 1.940e-03 0.899 0.369004
#> temperature_max -3.778e-03 1.357e-03 -2.784 0.005544 **
#> ---
#> Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#>
#> Residual standard error: 0.08573 on 564 degrees of freedom
#> Multiple R-squared: 0.7792, Adjusted R-squared: 0.7733
#> F-statistic: 132.7 on 15 and 564 DF, p-value: < 2.2e-16
#>
#classification formula (character response)
y <- model_formula(
df = vi_smol,
response = "vi_categorical",
predictors = x
)
#>
#> collinear::model_formula()
#> └── collinear::drop_geometry_column(): dropping geometry column from 'df'.
y
#> vi_categorical ~ topo_elevation + topo_slope + soil_clay + topo_diversity +
#> humidity_range + cloud_cover_range + swi_range + soil_silt +
#> rainfall_min + rainfall_range + soil_soc + swi_mean + solar_rad_range +
#> solar_rad_max + temperature_max
#> <environment: 0x559e83bba510>
#polynomial formula (3rd degree)
y <- model_formula(
df = vi_smol,
response = "vi_numeric",
predictors = x,
term_f = "poly",
term_args = "degree = 3, raw = TRUE"
)
#>
#> collinear::model_formula()
#> └── collinear::drop_geometry_column(): dropping geometry column from 'df'.
y
#> vi_numeric ~ poly(topo_elevation, degree = 3, raw = TRUE) + poly(topo_slope,
#> degree = 3, raw = TRUE) + poly(soil_clay, degree = 3, raw = TRUE) +
#> poly(topo_diversity, degree = 3, raw = TRUE) + poly(humidity_range,
#> degree = 3, raw = TRUE) + poly(cloud_cover_range, degree = 3,
#> raw = TRUE) + poly(swi_range, degree = 3, raw = TRUE) + poly(soil_silt,
#> degree = 3, raw = TRUE) + poly(rainfall_min, degree = 3,
#> raw = TRUE) + poly(rainfall_range, degree = 3, raw = TRUE) +
#> poly(soil_soc, degree = 3, raw = TRUE) + poly(swi_mean, degree = 3,
#> raw = TRUE) + poly(solar_rad_range, degree = 3, raw = TRUE) +
#> poly(solar_rad_max, degree = 3, raw = TRUE) + poly(temperature_max,
#> degree = 3, raw = TRUE)
#> <environment: 0x559e84842378>
#gam formula
y <- model_formula(
df = vi_smol,
response = "vi_numeric",
predictors = x,
term_f = "s"
)
#>
#> collinear::model_formula()
#> └── collinear::drop_geometry_column(): dropping geometry column from 'df'.
y
#> vi_numeric ~ s(topo_elevation) + s(topo_slope) + s(soil_clay) +
#> s(topo_diversity) + s(humidity_range) + s(cloud_cover_range) +
#> s(swi_range) + s(soil_silt) + s(rainfall_min) + s(rainfall_range) +
#> s(soil_soc) + s(swi_mean) + s(solar_rad_range) + s(solar_rad_max) +
#> s(temperature_max)
#> <environment: 0x559e82c3f740>
#random effect
y <- model_formula(
df = vi_smol,
response = "vi_numeric",
predictors = x,
random_effects = "country_name" #from vi_smol$country_name
)
#>
#> collinear::model_formula()
#> └── collinear::drop_geometry_column(): dropping geometry column from 'df'.
y
#> vi_numeric ~ topo_elevation + topo_slope + soil_clay + topo_diversity +
#> humidity_range + cloud_cover_range + swi_range + soil_silt +
#> rainfall_min + rainfall_range + soil_soc + swi_mean + solar_rad_range +
#> solar_rad_max + temperature_max + (1 | country_name)
#> <environment: 0x559e8606f1d0>
