Table 1 Summary of key tasks and accompanying R code for a machine learning analysis

From: Machine learning in medicine: a practical introduction to techniques for data pre-processing, hyperparameter tuning, and model comparison

Each task below is followed by the corresponding code in the R language.

1) Data preparation

 1.1) Import the dataset and call it "db"

db <- read.csv(url("https://archive.ics.uci.edu/ml/machine-learning-databases/mammographic-masses/mammographic_masses.data"),
               header = FALSE, na.strings = "?")  # the raw file has no header row and codes missing values as "?"

 1.2) Label the columns of "db"

colnames(db) <- c("BIRADS", "Age", "Shape", "Margin", "Density", "outcome")
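
The code in this table calls functions from several add-on packages without showing the corresponding library() calls. A minimal sketch of the packages assumed here, loaded before running the steps below:

library(caret)      # createDataPartition, trainControl, train, confusionMatrix, calibration
library(recipes)    # recipe() and the step_* pre-processing functions
library(plyr)       # revalue (used in step 4.7)
library(dplyr)      # %>% pipe and select()
library(pROC)       # roc, auc, ci.auc, roc.test
library(varhandle)  # unfactor (used in step 4.7)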

 1.3) Assign factor levels for factors (categorical variables) and set the reference category

db$Margin <- as.factor(ifelse(db$Margin == "1", "Circumscribed",
                       ifelse(db$Margin == "2", "Microlobulated",
                       ifelse(db$Margin == "3", "Obscured",
                       ifelse(db$Margin == "4", "Ill-defined", "Spiculated")))))

db$Margin <- relevel(db$Margin, ref = "Circumscribed")
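
Later steps assume the outcome has also been recoded as a labelled factor (classProbs = TRUE in 2.2 requires valid class names, and 4.5 uses positive = "Malignant"). A sketch of the analogous conversion, assuming the raw outcome is coded 0 = benign and 1 = malignant:

db$outcome <- as.factor(ifelse(db$outcome == "1", "Malignant", "Benign"))
db$outcome <- relevel(db$outcome, ref = "Benign")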

 1.4) Set numeric type for numerical variables

db$Age <- as.numeric(db$Age)

 1.5) Replace implausible age values with missing values

AGE_upper <- 120
AGE_lower <- 0
db$Age <- ifelse(db$Age >= AGE_upper, NA, db$Age)
db$Age <- ifelse(db$Age <= AGE_lower, NA, db$Age)

 1.6) Remove variables with 50% or more missing values

missing_col <- colMeans(is.na(db))
remove <- vector()
for (i in 1:length(missing_col)) {
  if (missing_col[i] >= 0.5) {remove <- append(remove, names(missing_col[i]))}
}
if (!is.logical(remove)) db <- db %>% dplyr::select(-!!remove)

 1.7) Split dataset into development and validation sets

train_index <- createDataPartition(db$outcome, p = 0.8, list = FALSE, times = 1)
db_train <- db[train_index, ]
db_test <- db[-train_index, ]
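
The split is random; a fixed seed, set before the createDataPartition() call above, makes it reproducible (a sketch; the original table does not include this step):

set.seed(123)  # any fixed integer; chosen arbitrarily here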

 1.8) Define recipe for data pre-processing

recipe <- recipe(outcome ~ Age + Shape + Margin + Density, data = db_train)

recipe <- recipe %>%
  step_knnimpute(all_predictors(), neighbors = 5) %>%
  step_BoxCox(all_numeric(), -all_outcomes()) %>%
  step_other(all_nominal(), threshold = .1, other = "other") %>%
  step_zv(all_predictors(), -all_outcomes()) %>%
  step_nzv(all_predictors(), -all_outcomes()) %>%
  step_normalize(all_numeric(), -all_outcomes()) %>%
  step_dummy(all_nominal(), -all_outcomes()) %>%
  step_corr(all_predictors(), -all_outcomes(), threshold = 0.9)

 1.9) Show all steps in the recipe

prep <- prep(recipe, db_train)
tidy(prep)

 1.10) Examine changes of a specific pre-processing step (number 6 = normalize)

tidy (prep, number = 6)

 1.11) Examine pre-processed training data

prep[["template"]]
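
prep[["template"]] reads an internal slot of the prepared recipe; an equivalent, documented way to inspect the pre-processed training data is to bake the prepared recipe (a sketch, assuming the default retain = TRUE in prep()):

bake(prep, new_data = NULL)  # returns the retained, pre-processed training set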

2) Algorithm Development

 2.1) Define multiple performance metrics for model training

MySummary <- function(data, lev = NULL, model = NULL) {
  a1 <- defaultSummary(data, lev, model)
  b1 <- twoClassSummary(data, lev, model)
  c1 <- prSummary(data, lev, model)
  out <- c(a1, b1, c1)
  out
}

 2.2) Define general training parameters for cross-validation and hypergrid search

cv <- trainControl(method = "repeatedcv",
                   number = 10,
                   repeats = 3,
                   search = "grid",
                   verboseIter = TRUE,
                   classProbs = TRUE,
                   returnResamp = "final",
                   savePredictions = "final",
                   summaryFunction = MySummary,
                   selectionFunction = "tolerance",
                   allowParallel = TRUE)

 2.3) Define general training parameters for cross-validation and random grid search

cv <- trainControl(method = "repeatedcv",
                   number = 10,
                   repeats = 3,
                   search = "random",
                   verboseIter = TRUE,
                   classProbs = TRUE,
                   returnResamp = "final",
                   savePredictions = "final",
                   summaryFunction = MySummary,
                   selectionFunction = "tolerance",
                   allowParallel = TRUE)

 2.4) Define general training parameters for adaptive resampling for hyperparameter tuning

adaptControl <- trainControl(method = "adaptive_cv",
                             number = 10,
                             repeats = 3,
                             adaptive = list(min = 5, alpha = 0.05, method = "gls", complete = TRUE),
                             search = "random",
                             verboseIter = TRUE,
                             classProbs = TRUE,
                             returnResamp = "final",
                             savePredictions = "final",
                             summaryFunction = MySummary,
                             selectionFunction = "tolerance",
                             allowParallel = TRUE)
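
allowParallel = TRUE in the trainControl() calls above only takes effect when a parallel backend has been registered. A minimal sketch using the doParallel package (an assumption; the original table does not show this step):

library(doParallel)
cl <- makePSOCKcluster(parallel::detectCores() - 1)  # leave one core free
registerDoParallel(cl)
# ... run the caret::train() calls below ...
# stopCluster(cl)  # release the workers when training is finished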

 2.5) Hypergrid for Logistic Regression with Elastic Net Penalty

hyper_grid_glm <- expand.grid(alpha = seq(from = 0.01, to = 1, by = 0.01),
                              lambda = seq(from = 0.01, to = 1, by = 0.01))

 2.6) Hypergrid for XGBoost Tree

hyper_grid_xgboost <- expand.grid(nrounds = seq(from = 25, to = 100, by = 25),
                                  max_depth = seq(from = 5, to = 35, by = 10),
                                  eta = seq(from = 0.2, to = 1, by = 0.2),
                                  gamma = seq(from = 1, to = 10, by = 1),
                                  colsample_bytree = seq(from = 0.6, to = 1, by = 0.2),
                                  min_child_weight = seq(from = 2, to = 5, by = 1),
                                  subsample = 1)

 2.7) Hypergrid for MARS algorithm

hyper_grid_mars <- expand.grid(degree = seq(from = 1, to = 3, by = 1),
                               nprune = seq(from = 1, to = 10, by = 1))

 2.8) Hypergrid for SVM with polynomial kernel

hyper_grid_svm <- expand.grid(degree = seq(from = 1, to = 11, by = 2),
                              scale = seq(from = 0.1, to = 1, by = 0.1),
                              C = seq(from = 0.5, to = 8, by = 0.5))

 2.9) Hypergrid for multi-layer perceptron with dropout cost (deep neural network)

hyper_grid_nn <- expand.grid(size = seq(from = 1, to = 21, by = 10),
                             dropout = seq(from = 0.1, to = 0.3, by = 0.1),
                             batch_size = seq(from = 1, to = 11, by = 5),
                             lr = seq(from = 0.25, to = 1, by = 0.25),
                             rho = seq(from = 0.25, to = 1, by = 0.25),
                             decay = seq(from = 0.1, to = 0.5, by = 0.2),
                             cost = seq(from = 0.25, to = 1, by = 0.25),
                             activation = 'relu')

 2.10) Train Logistic Regression with Elastic Net Penalty (hypergrid search)

cv_glm <- caret::train(recipe,
                       data = db_train,
                       method = "glmnet",
                       metric = "Kappa",
                       trControl = cv,
                       tuneGrid = hyper_grid_glm)

 2.11) Train XGBoost Tree (hypergrid search)

cv_xgboost <- caret::train(recipe,
                           data = db_train,
                           method = "xgbTree",
                           metric = "Kappa",
                           trControl = cv,
                           tuneGrid = hyper_grid_xgboost)

 2.12) Train MARS algorithm (hypergrid search)

cv_mars <- caret::train(recipe,
                        data = db_train,
                        method = "earth",
                        metric = "Kappa",
                        trControl = cv,
                        tuneGrid = hyper_grid_mars)

 2.13) Train SVM with polynomial kernel (random grid search)

cv_svm <- caret::train(recipe,
                       data = db_train,
                       method = "svmPoly",
                       metric = "Kappa",
                       tuneLength = 30,
                       trControl = cv)  # uses the random-search trainControl defined in 2.3

 2.14) Train multi-layer perceptron with dropout cost (deep neural network) (random grid search)

cv_nn <- caret::train(recipe,
                      data = db_train,
                      method = "mlpKerasDropoutCost",
                      metric = "Kappa",
                      tuneLength = 30,
                      trControl = cv)  # uses the random-search trainControl defined in 2.3

3) Internal Testing

 3.1) Show final model with optimal hyperparameters for Logistic Regression with Elastic Net Penalty

cv_glm$bestTune

 3.2) Show internal testing results for the final model with optimal hyperparameters (Logistic Regression with Elastic Net Penalty)

merge(cv_glm$bestTune, cv_glm$results)  # the results row matching the best hyperparameter combination

4) (External) Validation

 4.1) Use trained model to predict outcome probabilities in the validation set

predict (cv_glm, db_test, type = "prob")

 4.2) Use trained model to predict outcome classes in the validation set

predict (cv_glm, db_test)

 4.3) Calculate area under the ROC curve for outcome predictions in the validation set

roc_glm_validation <- roc(db_test$outcome, predict(cv_glm, db_test, type = "prob")$Malignant)

auc_glm_validation <- pROC::auc(roc_glm_validation)
auc_CI_glm_validation <- pROC::ci.auc(roc_glm_validation, method = "bootstrap", boot.stratified = TRUE)
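
The plotting code in 4.4 also references ROC objects for the other algorithms; they can be built analogously (a sketch, assuming cv_xgboost, cv_mars, cv_svm, and cv_nn were trained as in steps 2.11 to 2.14):

roc_xgboost_validation <- roc(db_test$outcome, predict(cv_xgboost, db_test, type = "prob")$Malignant)
roc_mars_validation <- roc(db_test$outcome, predict(cv_mars, db_test, type = "prob")$Malignant)
roc_svm_validation <- roc(db_test$outcome, predict(cv_svm, db_test, type = "prob")$Malignant)
roc_nn_validation <- roc(db_test$outcome, predict(cv_nn, db_test, type = "prob")$Malignant)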

 4.4) Plot ROC curves

plot.roc (roc_glm_validation, legacy.axes = TRUE)

lines (roc_glm_validation, col = "blue")

lines (roc_xgboost_validation, col = "red")

lines (roc_mars_validation, col = "orange")

lines (roc_svm_validation, col = "black")

lines (roc_nn_validation, col = "grey60")

legend("bottomright", legend = c("LR with Elastic Net Penalty", "XGBoost Tree", "MARS", "SVM", "neural network"), col = c("blue", "red", "orange", "black", "grey60"), lty = 1)

 4.5) Create confusion matrix

confusionMatrix (as.factor (predict (cv_glm, db_test)), factor (db_test$outcome), positive = "Malignant")

 4.6) Create calibration plot

glm_calplot_validation <- calibration(factor(db_test$outcome) ~ predict(cv_glm, db_test, type = "prob")$Malignant, data = db_test, cuts = 10)

xyplot(glm_calplot_validation, auto.key = list(columns = 2))

 4.7) Calculate Spiegelhalter’s Z score to assess model calibration

Spiegelhalter_z <- function(y, prob) {
  alpha <- 0.05
  z_score <- sum((y - prob) * (1 - 2*prob)) / sqrt(sum(((1 - 2*prob)^2) * prob * (1 - prob)))
  print(z_score)
  if (abs(z_score) > qnorm(1 - alpha/2)) {
    print('reject null. NOT calibrated')
  } else {
    print('fail to reject. calibrated')
  }
  cat('z score: ', z_score, '\n')
  cat('p value: ', 2 * (1 - pnorm(abs(z_score))), '\n')  # two-sided p value, matching the decision rule above
  return(z_score)
}

Spiegelhalter_z(unfactor(revalue(db_test$outcome, c("Malignant" = 1, "Benign" = 0))), predict(cv_glm, db_test, type = "prob")$Malignant)

 4.8) Test for differences in area under the curve between two algorithms

roc.test (roc_glm_validation, roc_mars_validation, method = "bootstrap", alternative = "two.sided", boot.n = 2000, boot.stratified = TRUE)

 4.9) Test for differences in diagnostic performance between two algorithms using a McNemar test

mcnemar.test (predict (cv_glm, db_test), predict(cv_mars, db_test), correct = TRUE)