cmahalanobis: An R package to calculate distances between factors

Introduction

The package provides a suite of functions for computing various distance metrics between pairs of groups within a list of data frames. Each data frame holds the observations of one group (for example, a species), measured on multiple variables (factors). In addition to the Mahalanobis distance (cmahalanobis), which is a dissimilarity measure based on the covariance matrix and useful for statistical matching or data merging, the package includes the Euclidean distance (ceuclide), the Manhattan distance (cmanhattan), and the Chebyshev distance (cchebyshev).

These metrics are fundamental in various fields, such as cluster analysis, classification, and other applications of machine learning and data mining, where assessing similarity or dissimilarity between data is crucial. The package is designed to be flexible and easily integrated into data analysis workflows, providing reliable tools for evaluating distances in multidimensional contexts.

Applications

Application on iris dataset

library(cmahalanobis)
# Make the built-in iris data available
data(iris)
# One data frame per species: keep that species' rows and drop the fifth
# column (Species) so only the four numeric measurements remain
setosa <- iris[iris$Species == "setosa", -5]
versicolor <- iris[iris$Species == "versicolor", -5]
virginica <- iris[iris$Species == "virginica", -5]
# Collect the three groups of flowers in a list
groups <- list(setosa, versicolor, virginica)
cmahalanobis(groups, plot = TRUE, p.value = TRUE)

#> $distances
#>          [,1]      [,2]      [,3]
#> [1,]   0.0000 335.19989 727.42056
#> [2,] 107.1736   0.00000  26.71618
#> [3,] 171.7689  16.88654   0.00000
#> 
#> $p_values
#>              [,1]         [,2]          [,3]
#> [1,]           NA 2.748687e-71 4.023276e-156
#> [2,] 2.915001e-22           NA  2.268568e-05
#> [3,] 4.363119e-36 2.033555e-03            NA
# Euclidean distance between the groups. The p-values are requested with only
# 10 permutations, so the printed p-values move in steps of 0.1.
ceuclide(groups, plot = TRUE, p.value = TRUE, num.permutations = 10)

#> $distances
#>          [,1]     [,2]     [,3]
#> [1,] 0.000000 3.208281 4.754507
#> [2,] 3.208281 0.000000 1.620489
#> [3,] 4.754507 1.620489 0.000000
#> 
#> $p_values
#>      [,1] [,2] [,3]
#> [1,]   NA  0.6    0
#> [2,]  0.9   NA    1
#> [3,]  0.0  1.0   NA
# Manhattan distance between the groups. The p-values are requested with only
# 10 permutations, so the printed p-values move in steps of 0.1.
cmanhattan(groups, plot = TRUE, p.value = TRUE, num.permutations = 10)

#> $distances
#>       [,1]  [,2]  [,3]
#> [1,] 0.000 5.466 7.906
#> [2,] 5.466 0.000 2.848
#> [3,] 7.906 2.848 0.000
#> 
#> $p_values
#>      [,1] [,2] [,3]
#> [1,]   NA  0.0  0.0
#> [2,]    0   NA  0.8
#> [3,]    0  0.9   NA
# Chebyshev distance between the groups. With 50 permutations the printed
# p-values move in steps of 0.02.
cchebyshev(groups, plot = TRUE, p.value = TRUE, num.permutations = 50)

#> $distances
#>       [,1]  [,2]  [,3]
#> [1,] 0.000 2.798 4.090
#> [2,] 2.798 0.000 1.292
#> [3,] 4.090 1.292 0.000
#> 
#> $p_values
#>      [,1] [,2] [,3]
#> [1,]   NA 0.98 0.42
#> [2,] 0.96   NA 1.00
#> [3,] 0.44 1.00   NA

Application on mtcars dataset

# One data frame per transmission type (am == 0 and am == 1). Column 9 of
# mtcars is am itself, which is constant within each group, so it is dropped.
auto <- mtcars[mtcars$am == 0, -9]
manual <- mtcars[mtcars$am == 1, -9]

# Collect the two groups of cars in a list
groups <- list(auto, manual)
cmahalanobis(groups, plot = TRUE, p.value = TRUE)

#> $distances
#>          [,1]     [,2]
#> [1,]   0.0000 156.1163
#> [2,] 735.5919   0.0000
#> 
#> $p_values
#>               [,1]         [,2]
#> [1,]            NA 2.050145e-28
#> [2,] 1.429549e-151           NA
# ceuclide() on the two transmission groups; p-values are requested with
# num.permutations = 10.
ceuclide(groups, plot = TRUE, p.value = TRUE, num.permutations = 10)

#> $distances
#>          [,1]     [,2]
#> [1,]   0.0000 150.8032
#> [2,] 150.8032   0.0000
#> 
#> $p_values
#>      [,1] [,2]
#> [1,]   NA    1
#> [2,]    0   NA
# cmanhattan() on the two transmission groups; p-values are requested with
# num.permutations = 10.
cmanhattan(groups, plot = TRUE, p.value = TRUE, num.permutations = 10)

#> $distances
#>          [,1]     [,2]
#> [1,]   0.0000 193.8557
#> [2,] 193.8557   0.0000
#> 
#> $p_values
#>      [,1] [,2]
#> [1,]   NA    1
#> [2,]    0   NA
# cchebyshev() on the two transmission groups; p-values are requested with
# num.permutations = 50.
cchebyshev(groups, plot = TRUE, p.value = TRUE, num.permutations = 50)

#> $distances
#>          [,1]     [,2]
#> [1,]   0.0000 146.8482
#> [2,] 146.8482   0.0000
#> 
#> $p_values
#>      [,1] [,2]
#> [1,]   NA    1
#> [2,]    0   NA

Application on simulated data

# Attach the cmahalanobis package
library(cmahalanobis)
# Size of every simulated group: rows (observations) and columns (variables)
num_observations <- 100
num_variables <- 5
# Fix the seed so the simulated data are reproducible
set.seed(123)
# Draw three groups of standard-normal data, one after the other, so the
# random stream is consumed in the order group1, group2, group3
groups <- replicate(
  3,
  as.data.frame(
    matrix(rnorm(num_observations * num_variables), nrow = num_observations)
  ),
  simplify = FALSE
)
group1 <- groups[[1]]
group2 <- groups[[2]]
group3 <- groups[[3]]
# Mahalanobis distances between the groups via cmahalanobis()
cmahalanobis(groups, plot = TRUE, p.value = TRUE)

#> $distances
#>          [,1]     [,2]     [,3]
#> [1,] 0.000000 5.639257 5.567479
#> [2,] 4.722923 0.000000 5.029954
#> [3,] 5.329901 5.783087 0.000000
#> 
#> $p_values
#>           [,1]      [,2]      [,3]
#> [1,]        NA 0.3429174 0.3506032
#> [2,] 0.4506217        NA 0.4122355
#> [3,] 0.3769584 0.3279009        NA
# ceuclide() on the simulated groups. With 190 permutations the printed
# p-values are multiples of 1/190 (e.g. 0.32105263 = 61/190).
ceuclide(groups, plot = TRUE, p.value = TRUE, num.permutations = 190)

#> $distances
#>           [,1]      [,2]     [,3]
#> [1,] 0.0000000 0.2282174 0.156693
#> [2,] 0.2282174 0.0000000 0.302220
#> [3,] 0.1566930 0.3022200 0.000000
#> 
#> $p_values
#>           [,1]       [,2]       [,3]
#> [1,]        NA 0.32105263 0.65789474
#> [2,] 0.3789474         NA 0.06315789
#> [3,] 0.5736842 0.02105263         NA
# cmanhattan() on the simulated groups. Only 10 permutations are requested,
# so the printed p-values move in steps of 0.1.
cmanhattan(groups, plot = TRUE, p.value = TRUE, num.permutations = 10)

#> $distances
#>           [,1]      [,2]      [,3]
#> [1,] 0.0000000 0.4442511 0.2777603
#> [2,] 0.4442511 0.0000000 0.6671049
#> [3,] 0.2777603 0.6671049 0.0000000
#> 
#> $p_values
#>      [,1] [,2] [,3]
#> [1,]   NA    0  0.3
#> [2,]  0.0   NA  0.0
#> [3,]  0.1    0   NA
# cchebyshev() on the simulated groups. With 50 permutations the printed
# p-values move in steps of 0.02.
cchebyshev(groups, plot = TRUE, p.value = TRUE, num.permutations = 50)

#> $distances
#>           [,1]      [,2]      [,3]
#> [1,] 0.0000000 0.1327059 0.1230044
#> [2,] 0.1327059 0.0000000 0.1622405
#> [3,] 0.1230044 0.1622405 0.0000000
#> 
#> $p_values
#>      [,1] [,2] [,3]
#> [1,]   NA 0.94 0.88
#> [2,] 0.92   NA 0.66
#> [3,] 0.80 0.54   NA