Quick Start Guide: Heuristic Regressors

This is an R version of the corresponding Heuristics quick start guide.

In this example we will use regressors from Heuristics on the yacht hydrodynamics dataset. First we load the data and split it into training and test datasets:

# Load the yacht hydrodynamics data with read.table's defaults and name the
# seven columns explicitly; the last column, `resistance`, is the response
# used in the rest of the guide.
df <- read.table(
  "yacht_hydrodynamics.data",
  col.names = c(
    "position", "prismatic", "length_displacement",
    "beam_draught", "length_beam", "froude", "resistance"
  )
)
  position prismatic length_displacement beam_draught length_beam froude
1     -2.3     0.568                4.78         3.99        3.17  0.125
2     -2.3     0.568                4.78         3.99        3.17  0.150
3     -2.3     0.568                4.78         3.99        3.17  0.175
4     -2.3     0.568                4.78         3.99        3.17  0.200
5     -2.3     0.568                4.78         3.99        3.17  0.225
6     -2.3     0.568                4.78         3.99        3.17  0.250
7     -2.3     0.568                4.78         3.99        3.17  0.275
8     -2.3     0.568                4.78         3.99        3.17  0.300
  resistance
1       0.11
2       0.27
3       0.47
4       0.78
5       1.18
6       1.82
7       2.61
8       3.76
 [ reached 'max' / getOption("max.print") -- omitted 300 rows ]
# Column 7 (`resistance`) is the response; the first six columns are the
# features.
y <- df[[7]]
X <- df[, seq_len(6)]

# Seeded split of the data for a regression task.
split <- iai::split_data("regression", X, y, seed = 1)

# Unpack both partitions into the names used throughout the guide.
test_X <- split$test$X
test_y <- split$test$y
train_X <- split$train$X
train_y <- split$train$y

Random Forest Regressor

We will use a grid_search to fit a random_forest_regressor:

# Grid search over max_depth values 5 through 10 for a random forest
# regressor created with a fixed random_seed, then fit on the training split.
grid <- iai::grid_search(
  iai::random_forest_regressor(random_seed = 1),
  max_depth = 5:10
)
iai::fit(grid, train_X, train_y)

We can make predictions on new data using predict:

# Predictions from the fitted grid search for the rows of the test features.
iai::predict(grid, test_X)
 [1]  0.09691999  0.28367277  1.27294232  2.77383666  5.08356500 12.91213286
 [7] 21.03160167  0.09782665  0.28199285  0.49682971  1.77856218  3.39283254
[13]  4.93092667 12.81255286 51.64830833  0.49504705  0.79381551  5.07728643
[19] 21.20838500 33.37613595  0.09628776  0.51801915  1.30719161  2.77196261
[25]  5.07719833  7.69981254  1.85259290  4.99798000 33.56582500  1.70427105
[31]  2.50171214  4.74597167 12.75123619 33.39946667  5.06899833  3.97601311
[37]  0.25943276  0.49114647  1.28675125 12.90923952 33.14091500 49.32153667
[43]  0.25665538  0.76506468  2.80367111  5.33649476 34.41753095  7.75091869
[49] 12.66252405  1.33414134  5.32317333 14.43079667  0.27903480  0.51743243
[55]  1.23306934  1.90676091  2.78081567 34.09669762  0.51744731 13.63693500
 [ reached getOption("max.print") -- omitted 32 entries ]

We can evaluate the quality of the model using score with any of the supported loss functions. For example, the $R^2$ on the training set (this is what score reports when called with criterion = "mse"):

# Score on the training split; the guide text describes the value reported
# for criterion = "mse" as the R^2.
iai::score(grid, train_X, train_y, criterion = "mse")
[1] 0.9993066

Or on the test set:

# Same criterion, evaluated on the held-out test split.
iai::score(grid, test_X, test_y, criterion = "mse")
[1] 0.9937779

We can also look at the variable importance:

# Extract the learner from the grid search with get_learner(), then report
# its per-feature importance.
iai::variable_importance(iai::get_learner(grid))
              Feature   Importance
1              froude 0.9906817399
2           prismatic 0.0040447210
3        beam_draught 0.0024306702
4            position 0.0014151009
5 length_displacement 0.0012264214
6         length_beam 0.0002013466

XGBoost Regressor

We will use a grid_search to fit an xgboost_regressor:

# Grid search over max_depth (2 through 5) and num_round (20, 50, 100) for an
# XGBoost regressor created with a fixed random_seed, then fit on the
# training split.
grid <- iai::grid_search(
  iai::xgboost_regressor(random_seed = 1),
  max_depth = 2:5,
  num_round = c(20, 50, 100)
)
iai::fit(grid, train_X, train_y)

We can make predictions on new data using predict:

# Predictions from the fitted grid search for the rows of the test features.
iai::predict(grid, test_X)
 [1]  0.2334633  0.3780546  1.2658739  2.8110404  5.4151907 12.5989904
 [7] 20.5250225  0.2109690  0.2571363  0.4242792  1.6431427  3.3103671
[13]  4.9388671 12.4505177 51.1063309  0.5053463  0.8192463  5.6796961
[19] 21.4824142 33.4262848  0.2369986  0.5487328  1.2674398  2.8126078
[25]  5.4332857  7.7935810  2.1362286  5.8265414 33.6861458  1.7849541
[31]  2.5079226  5.2016802 12.5041161 34.9638176  5.3456426  3.5210896
[37]  0.1147327  0.3478556  1.1607790 12.9112959 33.2657661 51.1251106
[43]  0.2578640  0.8772039  2.9592719  6.8994789 36.7811089  7.9863768
[49] 12.8663092  1.2572222  6.6050653 14.7291107  0.2380447  0.4051876
[55]  1.1617727  1.7950068  2.7069402 32.5592384  0.3430033 13.7243605
 [ reached getOption("max.print") -- omitted 32 entries ]

We can evaluate the quality of the model using score with any of the supported loss functions. For example, the $R^2$ on the training set (this is what score reports when called with criterion = "mse"):

# Score on the training split; the guide text describes the value reported
# for criterion = "mse" as the R^2.
iai::score(grid, train_X, train_y, criterion = "mse")
[1] 0.9995068

Or on the test set:

# Same criterion, evaluated on the held-out test split.
iai::score(grid, test_X, test_y, criterion = "mse")
[1] 0.9973451

We can also look at the variable importance:

# Extract the learner from the grid search with get_learner(), then report
# its per-feature importance.
iai::variable_importance(iai::get_learner(grid))
              Feature   Importance
1              froude 0.9936649042
2           prismatic 0.0025468329
3        beam_draught 0.0013717075
4 length_displacement 0.0011539730
5            position 0.0006484114
6         length_beam 0.0006141710

GLMNet Regressor

We can use a glmnetcv_regressor to fit a GLMNet model using cross-validation:

# Create a cross-validated GLMNet regressor with a fixed random_seed and fit
# it directly on the training split (no grid search is used here).
# NOTE(review): confirm the fold-count parameter name (`nfolds` vs `n_folds`)
# against the iai reference for glmnetcv_regressor.
lnr <- iai::glmnetcv_regressor(
  random_seed = 1,
  nfolds = 10
)
iai::fit(lnr, train_X, train_y)
Julia Object of type GLMNetCVRegressor.
Fitted GLMNetCVRegressor:
  Constant: -22.0757
  Weights:
    froude:  113.256

We can access the coefficients from the fitted model with get_prediction_weights and get_prediction_constant:

# Retrieve the fitted weights, then show the `numeric` entry of the result.
weights <- iai::get_prediction_weights(lnr)
weights$numeric
$froude
[1] 113.2565
# The `categoric` entry of the weights (printed as an empty named list here).
weights$categoric
named list()
# Constant term of the fitted model (the "Constant" in the fit summary).
iai::get_prediction_constant(lnr)
[1] -22.0757

We can make predictions on new data using predict:

# Predictions from the fitted GLMNet learner for the rows of the test features.
iai::predict(lnr, test_X)
 [1] -7.9186331 -5.0872206  3.4070168  9.0698417 14.7326667 20.3954916
 [7] 23.2269041 -7.9186331 -5.0872206 -2.2558082  6.2384293 11.9012542
[13] 14.7326667 20.3954916 28.8897291 -2.2558082  0.5756043 14.7326667
[19] 23.2269041 26.0583166 -7.9186331 -2.2558082  3.4070168  9.0698417
[25] 14.7326667 17.5640792  6.2384293 14.7326667 26.0583166  6.2384293
[31]  9.0698417 14.7326667 20.3954916 26.0583166 14.7326667 11.9012542
[37] -5.0872206 -2.2558082  3.4070168 20.3954916 26.0583166 28.8897291
[43] -5.0872206  0.5756043  9.0698417 14.7326667 26.0583166 17.5640792
[49] 20.3954916  3.4070168 14.7326667 20.3954916 -5.0872206 -2.2558082
[55]  3.4070168  6.2384293  9.0698417 26.0583166 -2.2558082 20.3954916
 [ reached getOption("max.print") -- omitted 32 entries ]

We can evaluate the quality of the model using score with any of the supported loss functions. For example, the $R^2$ on the training set (this is what score reports when called with criterion = "mse"):

# Score on the training split; the guide text describes the value reported
# for criterion = "mse" as the R^2.
iai::score(lnr, train_X, train_y, criterion = "mse")
[1] 0.654152

Or on the test set:

# Same criterion, evaluated on the held-out test split.
iai::score(lnr, test_X, test_y, criterion = "mse")
[1] 0.6504196

We can also look at the variable importance:

# `lnr` was fitted directly rather than through a grid search, so it is passed
# as-is with no get_learner() call.
iai::variable_importance(lnr)
              Feature Importance
1              froude          1
2        beam_draught          0
3         length_beam          0
4 length_displacement          0
5            position          0
6           prismatic          0