binary classification using the ionosphere data

The following examples illustrate the functionality of the KernelKnn package for classification tasks. I’ll make use of the ionosphere data set,

# load the ionosphere data set that ships with the KernelKnn package
data(ionosphere, package = 'KernelKnn')

# number of distinct values in every column; vapply() visits the columns one by
# one and keeps each column's own type, whereas apply() would first coerce the
# whole data frame to a single-type matrix before counting
vapply(ionosphere, function(x) length(unique(x)), integer(1))

##    V1    V2    V3    V4    V5    V6    V7    V8    V9   V10   V11   V12   V13 
##     2     1   219   269   204   259   231   260   244   267   246   269   238 
##   V14   V15   V16   V17   V18   V19   V20   V21   V22   V23   V24   V25   V26 
##   266   234   270   254   280   254   266   248   265   248   264   256   273 
##   V27   V28   V29   V30   V31   V32   V33   V34 class 
##   256   281   244   266   243   263   245   263     2

# V2 holds one single value for every row (see the counts above), so it carries
# no information for the classifier: drop it

ionosphere = ionosphere[, -2, drop = FALSE]

When using an algorithm where the output depends on distance calculations (as is the case in k-nearest-neighbors), it is recommended to first scale the data,

# the last column is the response; all remaining columns are the predictors,
# which are centred and scaled before any distance is computed

y = ionosphere[[ncol(ionosphere)]]
X = scale(ionosphere[, -ncol(ionosphere), drop = FALSE])

Important note: In classification, both functions KernelKnn and KernelKnnCV accept a numeric vector as a response variable (here y) and the unique values of the labels should begin from 1. This is important, otherwise the internal functions do not work. Furthermore, both functions (by default) return predictions in the form of probabilities, which can be converted to labels by using either a threshold (if binary classification) or the column with the maximum probability in each row (if multiclass classification).

# recode the class labels as integers 1, 2, ...: every label is replaced by its
# position in the sorted set of distinct labels

y = match(ionosphere$class, sort(unique(ionosphere$class)))

# random 75% / 25% split of the observation indices into train and test;
# seq_along(y) is used instead of 1:length(y) so that an empty y yields an empty
# index set rather than c(1, 0)
# NOTE: no seed is set in this snippet, so the indices printed below will
# differ from run to run

spl_train = sample(seq_along(y), round(length(y) * 0.75))
spl_test = setdiff(seq_along(y), spl_train)
str(spl_train)

##  int [1:263] 154 183 270 147 251 301 252 85 329 87 ...

str(spl_test)

##  int [1:88] 2 3 7 9 12 17 21 22 27 35 ...

# evaluation metric

# Classification accuracy of a matrix of class probabilities.
#   y_true : true labels coded 1, 2, ... so that they match the column indices
#            of preds
#   preds  : matrix with one row per observation and one column per class
# Returns the proportion of observations whose highest-probability column
# equals the true label (pairs that contain an NA are ignored).
acc = function (y_true, preds) {
  
  # column index of the largest probability in every row; ties are broken at random
  pred_labels = max.col(preds, ties.method = "random")
  
  # compare element-wise instead of taking the diagonal of table(): when a class
  # is missing from the labels or from the predictions the table is not square
  # and its diagonal no longer lines up with the correct predictions
  mean(y_true == pred_labels, na.rm = TRUE)
}

The KernelKnn function

The KernelKnn function takes a number of arguments. To read details for each one of the arguments type ?KernelKnn::KernelKnn in the console.

A simple k-nearest-neighbors can be run with weights_function = NULL and the parameter ‘regression’ should be set to FALSE. In classification the Levels parameter takes the unique values of the response variable,

library(KernelKnn)

# plain (unweighted) k-nearest-neighbors classification with k = 5 and the
# euclidean metric, fitted on the train rows and evaluated on the test rows;
# FALSE is spelled out because the shorthand F is an ordinary, reassignable variable
preds_TEST = KernelKnn(X[spl_train, ], TEST_data = X[spl_test, ], y[spl_train],
                       k = 5, method = 'euclidean', weights_function = NULL,
                       regression = FALSE, Levels = unique(y))
head(preds_TEST)

##      class_1 class_2
## [1,]     0.0     1.0
## [2,]     0.0     1.0
## [3,]     0.0     1.0
## [4,]     0.0     1.0
## [5,]     0.2     0.8
## [6,]     0.0     1.0

There are two ways to use a kernel in the KernelKnn function. The first option is to choose one of the existing kernels (uniform, triangular, epanechnikov, biweight, triweight, tricube, gaussian, cosine, logistic, silverman, inverse, gaussianSimple, exponential). Here, I use the canberra metric and the tricube kernel because they give optimal results (according to my RandomSearchR package),

# kernel-weighted classification: built-in tricube kernel, canberra metric, k = 10
# (FALSE is spelled out because the shorthand F is a reassignable variable)
preds_TEST_tric = KernelKnn(X[spl_train, ], TEST_data = X[spl_test, ], y[spl_train],
                            k = 10, method = 'canberra', weights_function = 'tricube',
                            regression = FALSE, Levels = unique(y))
head(preds_TEST_tric)

##         class_1   class_2
## [1,] 0.01745564 0.9825444
## [2,] 0.00000000 1.0000000
## [3,] 0.00000000 1.0000000
## [4,] 0.00000000 1.0000000
## [5,] 0.52660920 0.4733908
## [6,] 0.00000000 1.0000000

The second option is to give a self defined kernel function. Here, I’ll pick the density function of the normal distribution with mean = 0.0 and standard deviation = 1.0 (the data are scaled to have mean zero and unit variance),

# User-defined weighting kernel: evaluates the standard normal density at every
# entry of W and rescales each row so that its weights sum to one.
#   W : numeric matrix (rowSums() is applied to it) -- presumably the
#       nearest-neighbor distances handed over by KernelKnn; confirm in the
#       package documentation
norm_kernel = function(W) {
  
  dens = dnorm(W, mean = 0, sd = 1.0)
  
  dens / rowSums(dens)
}


# same setting as before (canberra metric, k = 10) but with the user-defined
# kernel function passed as weights_function
# (FALSE is spelled out because the shorthand F is a reassignable variable)
preds_TEST_norm = KernelKnn(X[spl_train, ], TEST_data = X[spl_test, ], y[spl_train],
                            k = 10, method = 'canberra', weights_function = norm_kernel,
                            regression = FALSE, Levels = unique(y))
head(preds_TEST_norm)

##        class_1   class_2
## [1,] 0.2615000 0.7385000
## [2,] 0.0000000 1.0000000
## [3,] 0.0000000 1.0000000
## [4,] 0.0000000 1.0000000
## [5,] 0.4257038 0.5742962
## [6,] 0.0000000 1.0000000

The computations can be sped up by using the parameter threads (multiple cores can be run in parallel). There is also the option to exclude extrema (minimum and maximum distances) during the calculation of the k-nearest-neighbor distances using extrema = TRUE. The bandwidth of the existing kernels can be tuned using the h parameter.

K-nearest-neighbor calculations in the KernelKnn function can be accomplished using the following distance metrics: euclidean, manhattan, chebyshev, canberra, braycurtis, minkowski (by default the order ‘p’ of the minkowski parameter equals k), hamming, mahalanobis, pearson_correlation, simple_matching_coefficient, jaccard_coefficient and Rao_coefficient. The last four are similarity measures and are appropriate for binary data [0,1].

I employed my RandomSearchR package to find the optimal parameters for the KernelKnn function and the following two pairs of parameters give an optimal accuracy,

k	method	kernel
10	canberra	tricube
9	canberra	epanechnikov

The KernelKnnCV function

I’ll use the KernelKnnCV function to calculate the accuracy using 5-fold cross-validation for the previously mentioned parameter pairs,

# 5-fold cross-validation for the first parameter pair (k = 10, canberra, tricube)
# (FALSE is spelled out because the shorthand F is a reassignable variable)
fit_cv_pair1 = KernelKnnCV(X, y, k = 10, folds = 5, method = 'canberra',
                           weights_function = 'tricube', regression = FALSE,
                           Levels = unique(y), threads = 5, seed_num = 5)

str(fit_cv_pair1)

## List of 2
##  $ preds:List of 5
##   ..$ : num [1:71, 1:2] 0.00 5.02e-01 5.90e-18 9.16e-01 0.00 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : NULL
##   .. .. ..$ : chr [1:2] "class_1" "class_2"
##   ..$ : num [1:70, 1:2] 0 0.0846 0.2507 0.0112 1 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : NULL
##   .. .. ..$ : chr [1:2] "class_1" "class_2"
##   ..$ : num [1:70, 1:2] 0 0 0 0.056 0 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : NULL
##   .. .. ..$ : chr [1:2] "class_1" "class_2"
##   ..$ : num [1:70, 1:2] 0 0 0 0 0 0 0 0 0 0 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : NULL
##   .. .. ..$ : chr [1:2] "class_1" "class_2"
##   ..$ : num [1:70, 1:2] 0 0.986 1 1 0 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : NULL
##   .. .. ..$ : chr [1:2] "class_1" "class_2"
##  $ folds:List of 5
##   ..$ fold_1: int [1:71] 242 237 7 20 25 24 232 31 44 42 ...
##   ..$ fold_2: int [1:70] 267 59 84 75 253 269 270 50 73 257 ...
##   ..$ fold_3: int [1:70] 102 293 114 124 120 279 113 288 281 132 ...
##   ..$ fold_4: int [1:70] 142 302 160 304 321 305 172 138 325 319 ...
##   ..$ fold_5: int [1:70] 204 217 219 211 194 190 183 343 225 196 ...

# 5-fold cross-validation for the second parameter pair (k = 9, canberra, epanechnikov)
# (FALSE is spelled out because the shorthand F is a reassignable variable)
fit_cv_pair2 = KernelKnnCV(X, y, k = 9, folds = 5, method = 'canberra',
                           weights_function = 'epanechnikov', regression = FALSE,
                           Levels = unique(y), threads = 5, seed_num = 5)

str(fit_cv_pair2)

## List of 2
##  $ preds:List of 5
##   ..$ : num [1:71, 1:2] 0 0.459 0 0.913 0 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : NULL
##   .. .. ..$ : chr [1:2] "class_1" "class_2"
##   ..$ : num [1:70, 1:2] 0.00 1.18e-01 2.01e-01 3.42e-07 1.00 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : NULL
##   .. .. ..$ : chr [1:2] "class_1" "class_2"
##   ..$ : num [1:70, 1:2] 0 0 0 0.0825 0 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : NULL
##   .. .. ..$ : chr [1:2] "class_1" "class_2"
##   ..$ : num [1:70, 1:2] 0 0 0 0 0 0 0 0 0 0 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : NULL
##   .. .. ..$ : chr [1:2] "class_1" "class_2"
##   ..$ : num [1:70, 1:2] 0 0.957 1 1 0 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : NULL
##   .. .. ..$ : chr [1:2] "class_1" "class_2"
##  $ folds:List of 5
##   ..$ fold_1: int [1:71] 242 237 7 20 25 24 232 31 44 42 ...
##   ..$ fold_2: int [1:70] 267 59 84 75 253 269 270 50 73 257 ...
##   ..$ fold_3: int [1:70] 102 293 114 124 120 279 113 288 281 132 ...
##   ..$ fold_4: int [1:70] 142 302 160 304 321 305 172 138 325 319 ...
##   ..$ fold_5: int [1:70] 204 217 219 211 194 190 183 343 225 196 ...

Each cross-validated object returns a list of length 2 (the first sublist includes the predictions for each fold, whereas the second gives the indices of the folds),

# accuracy per fold: the labels of the observations held out in fold i are
# compared with the class probabilities predicted for that fold;
# vapply() guarantees one numeric value per fold and seq_along() stays correct
# even if the list of predictions is empty
acc_pair1 = vapply(seq_along(fit_cv_pair1$preds),
                   function(i) acc(y[fit_cv_pair1$folds[[i]]],
                                   fit_cv_pair1$preds[[i]]),
                   numeric(1))
acc_pair1

## [1] 0.9154930 0.9142857 0.9142857 0.9285714 0.9571429

cat('accuracy for params_pair1 is :', mean(acc_pair1), '\n')

## accuracy for params_pair1 is : 0.9259557

# accuracy per fold for the second parameter pair; vapply() guarantees one
# numeric value per fold and seq_along() stays correct even if the list of
# predictions is empty
acc_pair2 = vapply(seq_along(fit_cv_pair2$preds),
                   function(i) acc(y[fit_cv_pair2$folds[[i]]],
                                   fit_cv_pair2$preds[[i]]),
                   numeric(1))
acc_pair2

## [1] 0.9014085 0.9142857 0.9000000 0.9142857 0.9571429

cat('accuracy for params_pair2 is :', mean(acc_pair2), '\n')

## accuracy for params_pair2 is : 0.9174245

Adding or multiplying kernels

In the KernelKnn package there is also the option to combine kernels (adding or multiplying) from the existing ones. For instance, if I want to multiply the tricube with the gaussian kernel, then I’ll give the following character string to the weights_function, “tricube_gaussian_MULT”. On the other hand, if I want to add the same kernels, then the weights_function will be “tricube_gaussian_ADD”. I experimented with my RandomSearchR package combining the different kernels and the following two parameter settings gave optimal results,

k	method	kernel
16	canberra	biweight_triweight_gaussian_MULT
5	canberra	triangular_triweight_MULT

# 5-fold cross-validation with a product of three built-in kernels
# (biweight * triweight * gaussian), canberra metric, k = 16
# (FALSE is spelled out because the shorthand F is a reassignable variable)
fit_cv_pair1 = KernelKnnCV(X, y, k = 16, folds = 5, method = 'canberra',
                           weights_function = 'biweight_triweight_gaussian_MULT',
                           regression = FALSE, Levels = unique(y), threads = 5,
                           seed_num = 5)

str(fit_cv_pair1)

## List of 2
##  $ preds:List of 5
##   ..$ : num [1:71, 1:2] 0.00 4.70e-01 5.79e-05 9.26e-01 4.85e-06 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : NULL
##   .. .. ..$ : chr [1:2] "class_1" "class_2"
##   ..$ : num [1:70, 1:2] 0 0.0476 0.2602 0.0132 0.9991 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : NULL
##   .. .. ..$ : chr [1:2] "class_1" "class_2"
##   ..$ : num [1:70, 1:2] 0.00 7.39e-08 0.00 1.78e-02 0.00 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : NULL
##   .. .. ..$ : chr [1:2] "class_1" "class_2"
##   ..$ : num [1:70, 1:2] 0 0 0 0 0 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : NULL
##   .. .. ..$ : chr [1:2] "class_1" "class_2"
##   ..$ : num [1:70, 1:2] 0 0.99181 1 1 0.00127 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : NULL
##   .. .. ..$ : chr [1:2] "class_1" "class_2"
##  $ folds:List of 5
##   ..$ fold_1: int [1:71] 242 237 7 20 25 24 232 31 44 42 ...
##   ..$ fold_2: int [1:70] 267 59 84 75 253 269 270 50 73 257 ...
##   ..$ fold_3: int [1:70] 102 293 114 124 120 279 113 288 281 132 ...
##   ..$ fold_4: int [1:70] 142 302 160 304 321 305 172 138 325 319 ...
##   ..$ fold_5: int [1:70] 204 217 219 211 194 190 183 343 225 196 ...

# 5-fold cross-validation with a product of two built-in kernels
# (triangular * triweight), canberra metric, k = 5
# (FALSE is spelled out because the shorthand F is a reassignable variable)
fit_cv_pair2 = KernelKnnCV(X, y, k = 5, folds = 5, method = 'canberra',
                           weights_function = 'triangular_triweight_MULT',
                           regression = FALSE, Levels = unique(y), threads = 5,
                           seed_num = 5)

str(fit_cv_pair2)

## List of 2
##  $ preds:List of 5
##   ..$ : num [1:71, 1:2] 0 0.363 0 1 0 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : NULL
##   .. .. ..$ : chr [1:2] "class_1" "class_2"
##   ..$ : num [1:70, 1:2] 0 0.00076 0.18937 0 1 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : NULL
##   .. .. ..$ : chr [1:2] "class_1" "class_2"
##   ..$ : num [1:70, 1:2] 0.00 0.00 0.00 3.44e-24 0.00 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : NULL
##   .. .. ..$ : chr [1:2] "class_1" "class_2"
##   ..$ : num [1:70, 1:2] 0 0 0 0 0 0 0 0 0 0 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : NULL
##   .. .. ..$ : chr [1:2] "class_1" "class_2"
##   ..$ : num [1:70, 1:2] 0 1 1 1 0 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : NULL
##   .. .. ..$ : chr [1:2] "class_1" "class_2"
##  $ folds:List of 5
##   ..$ fold_1: int [1:71] 242 237 7 20 25 24 232 31 44 42 ...
##   ..$ fold_2: int [1:70] 267 59 84 75 253 269 270 50 73 257 ...
##   ..$ fold_3: int [1:70] 102 293 114 124 120 279 113 288 281 132 ...
##   ..$ fold_4: int [1:70] 142 302 160 304 321 305 172 138 325 319 ...
##   ..$ fold_5: int [1:70] 204 217 219 211 194 190 183 343 225 196 ...

# accuracy per fold for the first combined-kernel setting; vapply() guarantees
# one numeric value per fold and seq_along() stays correct even if the list of
# predictions is empty
acc_pair1 = vapply(seq_along(fit_cv_pair1$preds),
                   function(i) acc(y[fit_cv_pair1$folds[[i]]],
                                   fit_cv_pair1$preds[[i]]),
                   numeric(1))
acc_pair1

## [1] 0.9014085 0.9142857 0.9285714 0.9285714 0.9571429

cat('accuracy for params_pair1 is :', mean(acc_pair1), '\n')

## accuracy for params_pair1 is : 0.925996

# accuracy per fold for the second combined-kernel setting; vapply() guarantees
# one numeric value per fold and seq_along() stays correct even if the list of
# predictions is empty
acc_pair2 = vapply(seq_along(fit_cv_pair2$preds),
                   function(i) acc(y[fit_cv_pair2$folds[[i]]],
                                   fit_cv_pair2$preds[[i]]),
                   numeric(1))
acc_pair2

## [1] 0.9014085 0.9285714 0.9285714 0.9142857 0.9714286

cat('accuracy for params_pair2 is :', mean(acc_pair2), '\n')

## accuracy for params_pair2 is : 0.9288531