Skip to content

Commit

Permalink
Merge pull request #99 from JanLinnenbrink/master
Browse files Browse the repository at this point in the history
Mahalanobis distances added for kNNDM (experimental)
  • Loading branch information
HannaMeyer authored Mar 13, 2024
2 parents faed78b + e4b57e3 commit f8583c2
Show file tree
Hide file tree
Showing 3 changed files with 151 additions and 19 deletions.
113 changes: 96 additions & 17 deletions R/knndm.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
#' Only required if modeldomain is used instead of predpoints.
#' @param sampling character. How to draw prediction points from the modeldomain? See `sf::st_sample`.
#' Only required if modeldomain is used instead of predpoints.
#' @param useMD boolean. Only for `space`=feature: shall the Mahalanobis distance be calculated instead of Euclidean?
#' Only works with numerical variables.
#'
#' @return An object of class \emph{knndm} consisting of a list of eight elements:
#' indx_train, indx_test (indices of the observations to use as
Expand Down Expand Up @@ -61,10 +63,11 @@
#' When using either `modeldomain` or `predpoints`, we advise to plot the study area polygon and the training/prediction points as a previous step to ensure they are aligned.
#'
#' `knndm` can also be performed in the feature space by setting `space` to "feature".
#' Euclidean distances or Mahalanobis distances can be used for distance calculation, but only Euclidean are tested.
#' In this case, nearest neighbour distances are calculated in n-dimensional feature space rather than in geographical space.
#' `tpoints` and `predpoints` can be data frames or sf objects containing the values of the features. Note that the names of `tpoints` and `predpoints` must be the same.
#' `predpoints` can also be missing, if `modeldomain` is of class SpatRaster. In this case, the values of of the SpatRaster will be extracted to the `predpoints`.
#' In the case of any categorical features, Gower distances will be used to calculate the Nearest Neighbour distances. If categorical
#' In the case of any categorical features, Gower distances will be used to calculate the Nearest Neighbour distances [Experimental]. If categorical
#' features are present, and `clustering` = "kmeans", K-Prototype clustering will be performed instead.
#'
#' @references
Expand Down Expand Up @@ -202,7 +205,7 @@ knndm <- function(tpoints, modeldomain = NULL, predpoints = NULL,
space = "geographical",
k = 10, maxp = 0.5,
clustering = "hierarchical", linkf = "ward.D2",
samplesize = 1000, sampling = "regular"){
samplesize = 1000, sampling = "regular", useMD=FALSE){

# create sample points from modeldomain
if(is.null(predpoints)&!is.null(modeldomain)){
Expand Down Expand Up @@ -314,9 +317,9 @@ knndm <- function(tpoints, modeldomain = NULL, predpoints = NULL,
} else if (isTRUE(space == "feature")) {

# prior checks
check_knndm_feature(tpoints, predpoints, space, k, maxp, clustering, islonglat, catVars)
check_knndm_feature(tpoints, predpoints, space, k, maxp, clustering, islonglat, catVars,useMD)
# kNNDM in feature space
knndm_res <- knndm_feature(tpoints, predpoints, k, maxp, clustering, linkf, catVars)
knndm_res <- knndm_feature(tpoints, predpoints, k, maxp, clustering, linkf, catVars, useMD)

}

Expand Down Expand Up @@ -346,7 +349,12 @@ check_knndm_geo <- function(tpoints, predpoints, space, k, maxp, clustering, isl
}
}

check_knndm_feature <- function(tpoints, predpoints, space, k, maxp, clustering, islonglat, catVars){
check_knndm_feature <- function(tpoints, predpoints, space, k, maxp, clustering, islonglat, catVars, useMD){

if(!is.null(catVars) & isTRUE(useMD)) {
warning("Mahalanobis distances not supported for categorical features, Gower distances will be used")
useMD <- FALSE
}

if (!(maxp < 1 & maxp > 1/k)) {
stop("maxp must be strictly between 1/k and 1")
Expand Down Expand Up @@ -501,7 +509,7 @@ knndm_geo <- function(tpoints, predpoints, k, maxp, clustering, linkf, islonglat


# kNNDM in the feature space
knndm_feature <- function(tpoints, predpoints, k, maxp, clustering, linkf, catVars) {
knndm_feature <- function(tpoints, predpoints, k, maxp, clustering, linkf, catVars, useMD) {

# rescale data
if(is.null(catVars)) {
Expand Down Expand Up @@ -533,9 +541,45 @@ knndm_feature <- function(tpoints, predpoints, k, maxp, clustering, linkf, catVa
# Gj and Gij calculation
if(is.null(catVars)) {

# use FNN with Euclidean distances if no categorical variables are present
Gj <- c(FNN::knn.dist(tpoints, k = 1))
Gij <- c(FNN::knnx.dist(query = predpoints, data = tpoints, k = 1))

if(isTRUE(useMD)) {

tpoints_mat <- as.matrix(tpoints)
predpoints_mat <- as.matrix(predpoints)

# use Mahalanobis distances
if (dim(tpoints_mat)[2] == 1) {
S <- matrix(stats::var(tpoints_mat), 1, 1)
tpoints_mat <- as.matrix(tpoints_mat, ncol = 1)
} else {
S <- stats::cov(tpoints_mat)
}
S_inv <- MASS::ginv(S)

# calculate distance matrix
distmat <- matrix(nrow=nrow(tpoints), ncol=nrow(tpoints))
distmat <- sapply(1:nrow(distmat), function(i) {
sapply(1:nrow(distmat), function(j) {
sqrt(t(tpoints_mat[i,] - tpoints_mat[j,]) %*% S_inv %*% (tpoints_mat[i,] - tpoints_mat[j,]))
})
})
diag(distmat) <- NA

Gj <- apply(distmat, 1, min, na.rm=TRUE)

Gij <- sapply(1:dim(predpoints_mat)[1], function(y) {
min(sapply(1:dim(tpoints_mat)[1], function(x) {
sqrt(t(predpoints_mat[y,] - tpoints_mat[x,]) %*% S_inv %*% (predpoints_mat[y,] - tpoints_mat[x,]))
}))
})


} else {
# use FNN with Euclidean distances if no categorical variables are present
Gj <- c(FNN::knn.dist(tpoints, k = 1))
Gij <- c(FNN::knnx.dist(query = predpoints, data = tpoints, k = 1))
}


} else {

Expand All @@ -553,7 +597,12 @@ knndm_feature <- function(tpoints, predpoints, k, maxp, clustering, linkf, catVa
clust <- sample(rep(1:k, ceiling(nrow(tpoints)/k)), size = nrow(tpoints), replace=F)

if(is.null(catVars)) {
Gjstar <- distclust_euclidean(tpoints, clust)
if(isTRUE(useMD)) {
Gjstar <- distclust_MD(tpoints, clust)
} else {
Gjstar <- distclust_euclidean(tpoints, clust)
}

} else {
Gjstar <- distclust_gower(tpoints, clust)
}
Expand All @@ -569,9 +618,12 @@ knndm_feature <- function(tpoints, predpoints, k, maxp, clustering, linkf, catVa
# calculate distance matrix which is needed for hierarchical clustering
if(is.null(catVars)) {

# calculate distance matrix with Euclidean distances if no categorical variables are present
distmat <- stats::dist(tpoints, upper=TRUE, diag=TRUE) |> as.matrix()
diag(distmat) <- NA
if(isFALSE(useMD)) {
# calculate distance matrix with Euclidean distances if no categorical variables are present
# for MD: distance matrix was already calculated
distmat <- stats::dist(tpoints, upper=TRUE, diag=TRUE) |> as.matrix()
diag(distmat) <- NA
}

} else {

Expand Down Expand Up @@ -672,7 +724,11 @@ knndm_feature <- function(tpoints, predpoints, k, maxp, clustering, linkf, catVa

if(clustering == "kmeans") {
if(is.null(catVars)) {
Gjstar_i <- distclust_euclidean(tpoints, clust_k)
if(isTRUE(useMD)){
Gjstar_i <- distclust_MD(tpoints, clust_k)
} else {
Gjstar_i <- distclust_euclidean(tpoints, clust_k)
}
} else {
Gjstar_i <- distclust_gower(tpoints, clust_k)
}
Expand All @@ -696,7 +752,12 @@ knndm_feature <- function(tpoints, predpoints, k, maxp, clustering, linkf, catVa

if(clustering == "kmeans") {
if(is.null(catVars)) {
Gjstar <- distclust_euclidean(tpoints, clust)
if(isTRUE(useMD)) {
Gjstar <- distclust_MD(tpoints, clust)
} else {
Gjstar <- distclust_euclidean(tpoints, clust)
}

} else {
Gjstar <- distclust_gower(tpoints, clust)
}
Expand All @@ -707,8 +768,6 @@ knndm_feature <- function(tpoints, predpoints, k, maxp, clustering, linkf, catVa
}




# Output
cfolds <- CAST::CreateSpacetimeFolds(data.frame(clust=clust), spacevar = "clust", k = k)
res <- list(clusters = clust,
Expand Down Expand Up @@ -749,3 +808,23 @@ distclust_gower <- function(tr_coords, folds){
}
unlist(alldist)
}

# Helper function: Compute out-of-fold NN distance (Mahalanobian distance)
distclust_MD <- function(tr_coords, folds){

tr_mat <- as.matrix(tr_coords)

S <- stats::cov(tr_mat)
S_inv <- MASS::ginv(S)

alldist <- rep(NA, length(folds))
for(f in unique(folds)) {

alldist[f == folds] <- apply(tr_mat[f==folds,,drop=FALSE], 1, function(y) {
min(apply(tr_mat[f!=folds,,drop=FALSE], 1, function(x) {
sqrt(t(y - x) %*% S_inv %*% (y - x))
}))
})
}
unlist(alldist)
}
9 changes: 7 additions & 2 deletions man/knndm.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

48 changes: 48 additions & 0 deletions tests/testthat/test-knndm.R
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,7 @@ test_that("kNNDM works in feature space with hierarchical clustering and raster

expect_equal(round(as.numeric(knndm_folds$Gjstar[40]),4), 0.2132)


})

test_that("kNNDM works in feature space with clustered training points", {
Expand Down Expand Up @@ -336,3 +337,50 @@ test_that("kNNDM works in feature space with clustered training points, categori

})


test_that("kNNDM works in feature space with Mahalanobis distance", {


data(splotdata)
splotdata <- splotdata[splotdata$Country == "Chile",]

predictors <- c("bio_1", "bio_4", "bio_5", "bio_6",
"bio_8", "bio_9", "bio_12", "bio_13",
"bio_14", "bio_15", "elev")
trainDat <- sf::st_drop_geometry(splotdata)
predictors_sp <- terra::rast(system.file("extdata", "predictors_chile.tif",package="CAST"))

set.seed(1234)
knndm_folds <- knndm(trainDat[,predictors], modeldomain = predictors_sp, space = "feature",
clustering="kmeans", k=4, maxp=0.8, useMD=TRUE)

expect_equal(round(as.numeric(knndm_folds$Gjstar[40]),4), 1.1258)

})


test_that("kNNDM works in feature space with Mahalanobis distance without clustering", {


set.seed(1234)

# prepare sample data
dat <- readRDS(system.file("extdata","Cookfarm.RDS",package="CAST"))
dat <- terra::aggregate(dat[,c("DEM","TWI", "NDRE.M", "Easting", "Northing","VW")],
by=list(as.character(dat$SOURCEID)),mean)
pts <- dat[,-1]
pts <- sf::st_as_sf(pts,coords=c("Easting","Northing"))
sf::st_crs(pts) <- 26911
studyArea <- terra::rast(system.file("extdata","predictors_2012-03-25.tif",package="CAST"))
pts <- sf::st_transform(pts, crs = sf::st_crs(studyArea))

studyArea <- studyArea[[names(studyArea) %in% names(pts)]]
train_points <- pts[,names(pts) %in% names(studyArea)]

expect_message(knndm(train_points, modeldomain = studyArea, space="feature", clustering = "kmeans", useMD = TRUE),
"Gij <= Gj; a random CV assignment is returned")

expect_message(knndm(train_points, modeldomain = studyArea, space="feature", clustering = "hierarchical", useMD = TRUE),
"Gij <= Gj; a random CV assignment is returned")

})

0 comments on commit f8583c2

Please sign in to comment.