Quick Start Guide: Heuristic Regressors

This is an R version of the corresponding Heuristics quick start guide.

In this example, we will use regressors from Heuristics on the yacht hydrodynamics dataset. First, we load in the data and split it into training and test datasets:

df <- read.table(
    "yacht_hydrodynamics.data",
    col.names = c("position", "prismatic", "length_displacement",
                  "beam_draught", "length_beam", "froude", "resistance"),
)
  position prismatic length_displacement beam_draught length_beam froude
1     -2.3     0.568                4.78         3.99        3.17  0.125
2     -2.3     0.568                4.78         3.99        3.17  0.150
3     -2.3     0.568                4.78         3.99        3.17  0.175
4     -2.3     0.568                4.78         3.99        3.17  0.200
5     -2.3     0.568                4.78         3.99        3.17  0.225
6     -2.3     0.568                4.78         3.99        3.17  0.250
7     -2.3     0.568                4.78         3.99        3.17  0.275
8     -2.3     0.568                4.78         3.99        3.17  0.300
  resistance
1       0.11
2       0.27
3       0.47
4       0.78
5       1.18
6       1.82
7       2.61
8       3.76
 [ reached 'max' / getOption("max.print") -- omitted 300 rows ]
X <- df[, 1:6]
y <- df[, 7]
split <- iai::split_data("regression", X, y, seed = 1)
train_X <- split$train$X
train_y <- split$train$y
test_X <- split$test$X
test_y <- split$test$y
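
As a quick sanity check, we can confirm how many of the 308 rows ended up in each split:

c(train = nrow(train_X), test = nrow(test_X))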

Random Forest Regressor

We will use a grid_search to fit a random_forest_regressor:

grid <- iai::grid_search(
    iai::random_forest_regressor(
        random_seed = 1,
    ),
    max_depth = 5:10,
)
iai::fit(grid, train_X, train_y)
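
The fitted learner chosen by the grid search can be extracted with get_learner if we want to work with it directly (we use the same call below when computing variable importance):

# Extract the final fitted random forest learner from the grid search
lnr_rf <- iai::get_learner(grid)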

We can make predictions on new data using predict:

iai::predict(grid, test_X)
 [1]  0.1283619  0.2459160  1.2930502  2.8546143  5.2011561 13.5863069
 [7] 20.9891182  0.1275619  0.2459160  0.4974986  1.9201760  3.6621406
[13]  5.1676178 13.4662440 51.1435831  0.4928556  0.7681789  5.2331082
[19] 21.4792283 34.3279356  0.1283619  0.4953686  1.2913152  2.8518560
[25]  5.1682077  8.1514252  1.9243579  5.2273678 33.5052606  1.9201760
[31]  2.6493058  5.1731095 13.0934685 35.6403919  5.2331082  3.8209244
[37]  0.2459160  0.4963656  1.2852632 13.8610744 35.0543452 50.9581238
[43]  0.2459160  0.7642915  2.8761564  5.1849077 34.3279356  8.2681760
[49] 13.2135314  1.2913152  5.1571194 13.4662440  0.2459160  0.4974253
[55]  1.2892152  1.9263162  2.8348060 34.3635092  0.4988786 13.4647702
 [ reached getOption("max.print") -- omitted 32 entries ]

We can evaluate the quality of the model using score with any of the supported loss functions. For example, the $R^2$ on the training set:

iai::score(grid, train_X, train_y, criterion = "mse")
[1] 0.9954617

Or on the test set:

iai::score(grid, test_X, test_y, criterion = "mse")
[1] 0.9898502

We can also look at the variable importance:

iai::variable_importance(iai::get_learner(grid))
              Feature   Importance
1              froude 0.9944054062
2           prismatic 0.0016570417
3 length_displacement 0.0016171265
4        beam_draught 0.0014286269
5         length_beam 0.0006639340
6            position 0.0002278647

XGBoost Regressor

We will use a grid_search to fit an xgboost_regressor:

grid <- iai::grid_search(
    iai::xgboost_regressor(
        random_seed = 1,
    ),
    max_depth = 2:5,
    num_round = c(20, 50, 100),
)
iai::fit(grid, train_X, train_y)
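
Since this grid searches over both max_depth and num_round, it can be useful to check which parameter combination was selected. A minimal sketch, assuming the installed version of the iai package provides get_best_params for grid searches:

# Assumed iai helper: report the parameter combination chosen by the grid search
iai::get_best_params(grid)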

We can make predictions on new data using predict:

iai::predict(grid, test_X)
 [1]  0.2334659  0.3780577  1.2658763  2.8110390  5.4151912 12.5989885
 [7] 20.5250244  0.2109738  0.2571414  0.4242853  1.6431477  3.3103681
[13]  4.9388676 12.4505129 51.1063232  0.5053502  0.8192500  5.6796966
[19] 21.4824181 33.4262810  0.2370025  0.5487384  1.2674444  2.8126078
[25]  5.4332862  7.7935791  2.1362300  5.8265419 33.6861458  1.7849571
[31]  2.5079229  5.2016807 12.5041151 34.9638062  5.3456430  3.5210898
[37]  0.1147331  0.3478574  1.1607797 12.9112940 33.2657661 51.1251106
[43]  0.2578683  0.8772089  2.9592717  6.8994780 36.7811012  7.9863753
[49] 12.8663063  1.2572267  6.6050649 14.7291059  0.2380494  0.4051932
[55]  1.1617763  1.7950108  2.7069402 32.5592308  0.3430066 13.7243576
 [ reached getOption("max.print") -- omitted 32 entries ]

We can evaluate the quality of the model using score with any of the supported loss functions. For example, the $R^2$ on the training set:

iai::score(grid, train_X, train_y, criterion = "mse")
[1] 0.9995068

Or on the test set:

iai::score(grid, test_X, test_y, criterion = "mse")
[1] 0.9973451

We can also look at the variable importance:

iai::variable_importance(iai::get_learner(grid))
              Feature  Importance
1              froude 0.954647255
2 length_displacement 0.018292905
3           prismatic 0.013921611
4        beam_draught 0.006395428
5            position 0.004111475
6         length_beam 0.002631326

We can calculate the SHAP values:

iai::predict_shap(grid, test_X)
$expected_value
[1] 10.31493

$features
   position prismatic length_displacement beam_draught length_beam froude
1      -2.3     0.568                4.78         3.99        3.17  0.125
2      -2.3     0.568                4.78         3.99        3.17  0.150
3      -2.3     0.568                4.78         3.99        3.17  0.225
4      -2.3     0.568                4.78         3.99        3.17  0.275
5      -2.3     0.568                4.78         3.99        3.17  0.325
6      -2.3     0.568                4.78         3.99        3.17  0.375
7      -2.3     0.568                4.78         3.99        3.17  0.400
8      -2.3     0.569                4.78         3.04        3.64  0.125
9      -2.3     0.569                4.78         3.04        3.64  0.150
10     -2.3     0.569                4.78         3.04        3.64  0.175
 [ reached 'max' / getOption("max.print") -- omitted 82 rows ]

$shap_values
             [,1]          [,2]         [,3]          [,4]         [,5]
 [1,] -0.06328079 -0.0114607625 -0.039164387 -0.0184658561 -0.081526875
 [2,] -0.06645517 -0.0114607625 -0.037850615 -0.0076839048 -0.081526875
 [3,] -0.07239565 -0.0127419746 -0.043004155  0.0076098423 -0.134582460
 [4,] -0.07239565 -0.0052559059 -0.043004155  0.0375998765 -0.134582460
 [5,] -0.17576683  0.0002705725 -0.039003402  0.0418154374 -0.175719991
 [6,] -0.21455890 -0.1960725635 -0.034859344  0.0003986559 -0.160192043
 [7,] -0.20534906 -0.4937427938 -0.034859344 -0.0975698680 -0.204385787
 [8,] -0.06328079 -0.0534273535 -0.038051382  0.0455295406  0.026065968
 [9,] -0.06645517 -0.0534273535 -0.036737613  0.0070993965  0.026065968
[10,] -0.07239565 -0.0534273535 -0.036737613  0.0070993965  0.008674378
            [,6]
 [1,]  -9.867568
 [2,]  -9.731897
 [3,]  -8.793941
 [4,]  -7.286256
 [5,]  -4.551339
 [6,]   2.889342
 [7,]  11.245992
 [8,] -10.020795
 [9,]  -9.934337
[10,]  -9.743861
 [ reached getOption("max.print") -- omitted 82 rows ]

We can then visualize these results in whichever way we prefer, for example with the SHAP library or directly in R.
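
For example, a quick base R sketch that plots the SHAP values for froude against the froude values themselves (froude is the sixth feature, so its SHAP values are in the sixth column of the matrix above):

shap <- iai::predict_shap(grid, test_X)
# froude is the sixth feature column, matching the output above
plot(shap$features$froude, shap$shap_values[, 6],
     xlab = "froude", ylab = "SHAP value for froude")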

GLMNet Regressor

We can use a glmnetcv_regressor to fit a GLMNet model using cross-validation:

lnr <- iai::glmnetcv_regressor(
    random_seed = 1,
    n_folds = 10,
)
iai::fit(lnr, train_X, train_y)
Julia Object of type GLMNetCVRegressor.
Fitted GLMNetCVRegressor:
  Constant: -22.2638
  Weights:
    froude:  113.914

We can access the coefficients from the fitted model with get_prediction_weights and get_prediction_constant:

weights <- iai::get_prediction_weights(lnr)
weights$numeric
$froude
[1] 113.9142
weights$categoric
named list()
iai::get_prediction_constant(lnr)
[1] -22.2638
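
Because the fitted model uses only froude, each prediction is just this constant plus the froude weight times the froude value. For example, we can reproduce the prediction for the first test observation by hand:

# Reproduce the linear prediction manually: constant + weight * froude
iai::get_prediction_constant(lnr) + weights$numeric$froude * test_X$froude[1]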

We can make predictions on new data using predict:

iai::predict(lnr, test_X)
 [1] -8.0245221 -5.1766668  3.3668992  9.0626099 14.7583206 20.4540313
 [7] 23.3018866 -8.0245221 -5.1766668 -2.3288115  6.2147546 11.9104653
[13] 14.7583206 20.4540313 28.9975973 -2.3288115  0.5190439 14.7583206
[19] 23.3018866 26.1497420 -8.0245221 -2.3288115  3.3668992  9.0626099
[25] 14.7583206 17.6061759  6.2147546 14.7583206 26.1497420  6.2147546
[31]  9.0626099 14.7583206 20.4540313 26.1497420 14.7583206 11.9104653
[37] -5.1766668 -2.3288115  3.3668992 20.4540313 26.1497420 28.9975973
[43] -5.1766668  0.5190439  9.0626099 14.7583206 26.1497420 17.6061759
[49] 20.4540313  3.3668992 14.7583206 20.4540313 -5.1766668 -2.3288115
[55]  3.3668992  6.2147546  9.0626099 26.1497420 -2.3288115 20.4540313
 [ reached getOption("max.print") -- omitted 32 entries ]

We can evaluate the quality of the model using score with any of the supported loss functions. For example, the $R^2$ on the training set:

iai::score(lnr, train_X, train_y, criterion = "mse")
[1] 0.6545717

Or on the test set:

iai::score(lnr, test_X, test_y, criterion = "mse")
[1] 0.6510068

We can also look at the variable importance:

iai::variable_importance(lnr)
              Feature Importance
1              froude          1
2        beam_draught          0
3         length_beam          0
4 length_displacement          0
5            position          0
6           prismatic          0
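
Finally, we can collect the test-set scores reported above into a small table to compare the three models side by side:

# Test-set scores reported by score() above
data.frame(
    model = c("random_forest", "xgboost", "glmnetcv"),
    test_score = c(0.9898502, 0.9973451, 0.6510068)
)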