diff --git a/R/classInt.R b/R/classInt.R index d06803d..651d07a 100644 --- a/R/classInt.R +++ b/R/classInt.R @@ -105,7 +105,7 @@ classIntervals <- function(var, n, style="quantile", rtimes=3, ..., intervalClos nobs <- length(unique(var)) if (nobs == 1) stop("single unique value") # Fix 22: Diego Hernangómez - needn <- !(style %in% c("dpih", "headtails")) + needn <- !(style %in% c("dpih", "headtails", "box")) if (missing(n)) n <- nclass.Sturges(var) if (n < 2 & needn) stop("n less than 2") @@ -349,6 +349,42 @@ classIntervals <- function(var, n, style="quantile", rtimes=3, ..., intervalClos brks <- c(min(x_sort), rowSums(m) / 2, max(x_sort)) + } else if (style == "box"){ + # 2022-09-22 Diego Hernangomez, see: + # https://github.com/r-spatial/classInt/issues/18 + # Adapted from: + # https://spatialanalysis.github.io/lab_tutorials/4_R_Mapping.html#box-map + + dots <- list(...) + iqr_mult <- ifelse(is.null(dots$iqr_mult), 1.5, dots$iqr_mult) + + qv <- unname(quantile(var)) + iqr <- iqr_mult * (qv[4] - qv[2]) + upfence <- qv[4] + iqr + lofence <- qv[2] - iqr + + # initialize break points vector + bb <- vector(mode="numeric",length=7) + + # logic for lower and upper fences + if (lofence < qv[1]) { # no lower outliers + bb[1] <- lofence + bb[2] <- floor(qv[1]) + } else { + bb[2] <- lofence + bb[1] <- qv[1] + } + if (upfence > qv[5]) { # no upper outliers + bb[7] <- upfence + bb[6] <- ceiling(qv[5]) + } else { + bb[6] <- upfence + bb[7] <- qv[5] + } + bb[3:5] <- qv[2:4] + + brks <- bb + } else stop(paste(style, "unknown")) } if (is.null(brks)) stop("Null breaks") diff --git a/man/classIntervals.Rd b/man/classIntervals.Rd index b3e0641..fb0e005 100644 --- a/man/classIntervals.Rd +++ b/man/classIntervals.Rd @@ -69,6 +69,13 @@ classIntervals2shingle(x) The "headtails" style uses the algorithm proposed by Bin Jiang (2013), in order to find groupings or hierarchy for data with a heavy-tailed distribution. This classification scheme partitions all of the data values around the mean into two parts and continues the process iteratively for the values (above the mean) in the head until the head part values are no longer heavy-tailed distributed. Thus, the number of classes and the class intervals are both naturally determined. By default the algorithm uses \code{thr = 0.4}, meaning that when the head represents more than 40\% of the observations the distribution is not considered heavy-tailed. The threshold argument \code{thr} may be modified through \code{\dots} (see Examples). The "maximum" style uses the Maximum Breaks method of classification finding the k - 1 largest differences in \code{var}. The mean of the values that generated the largest splits is used as the interval boundary. + + The "box" style generate 7 breaks (therefore 6 categories) based on a box-and-whisker plot. First and last categories + includes the data values considered as outliers, and the four remaining categories are defined by the percentiles 25, + 50 and 75 of the data distribution. By default, the identification of outliers is based on the interquantile range + (IQR), so values lower than percentile 25 - 1.5 * IQR or higher than percentile 75 + 1.5 * IQR are considered as outliers. + The multiplier applied to the IQR \code{iqr_mult = 1.5} may be modified through \code{\dots}. + } \value{ @@ -140,7 +147,13 @@ plot(classIntervals(jenks71$jenks71, n=5, style="jenks"), pal=pal1, main="dpih method") plot(classIntervals(jenks71$jenks71, style="headtails", thr = 1), pal=pal1, main="Head Tails method") -par(opar) + } +if (run) { + plot(classIntervals(jenks71$jenks71, style="maximum"), pal=pal1, + main="Maximum method") + plot(classIntervals(jenks71$jenks71, style="box"), pal=pal1, + main="Box method") + par(opar) } if (run) { print(classIntervals(jenks71$jenks71, n=5, style="fixed", @@ -203,6 +216,15 @@ if (run) { if (run) { print(classIntervals(jenks71$jenks71, style="headtails", thr = .45)) } +if (run) { + print(classIntervals(jenks71$jenks71, style="maximum")) +} +if (run) { + print(classIntervals(jenks71$jenks71, style="box")) +} +if (run) { + print(classIntervals(jenks71$jenks71, style="box", iqr_mult = 0.25)) +} x <- c(0, 0, 0, 1, 2, 50) print(classIntervals(x, n=3, style="fisher")) print(classIntervals(x, n=3, style="jenks")) @@ -262,6 +284,9 @@ classIntervals(x_units, n=5, style="fisher") if (have_units) { classIntervals(x_units, style="headtails") } +if (have_units) { +classIntervals(x_units, style="box") +} \dontrun{ st <- Sys.time() x_POSIXt <- sample(st+((0:500)*3600), 100) @@ -274,6 +299,8 @@ classIntervals(x_POSIXt, n=5, style="quantile") classIntervals(x_POSIXt, n=5, style="kmeans") classIntervals(x_POSIXt, n=5, style="fisher") classIntervals(x_POSIXt, style="headtails") +classIntervals(x_POSIXt, style="maximum") +classIntervals(x_POSIXt, style="box") } # see vignette for further details \dontrun{ diff --git a/tests/test_Unique.R b/tests/test_Unique.R index 9f78c32..6e6eded 100644 --- a/tests/test_Unique.R +++ b/tests/test_Unique.R @@ -39,6 +39,8 @@ print(classIntervals(data_censored, n=5, style="jenks"), unique=TRUE) print(classIntervals(data_censored, style="headtails"), unique=TRUE) print(classIntervals(data_censored, style="headtails", thr = 1)) print(classIntervals(data_censored, style="headtails", thr = 0)) +print(classIntervals(data_censored, style="box", iqr_mult = 0)) +print(classIntervals(data_censored, style="box")) x <- c(0, 0, 0, 1, 2, 50) print(classIntervals(x, n=3, style="fisher"), unique=TRUE) print(classIntervals(x, n=3, style="jenks"), unique=TRUE) diff --git a/tests/test_Unique.Rout.save b/tests/test_Unique.Rout.save index 3f2bd6a..84b33cc 100644 --- a/tests/test_Unique.Rout.save +++ b/tests/test_Unique.Rout.save @@ -1,7 +1,7 @@ -R version 3.5.3 (2020-03-20) -- "Great Truth" -Copyright (C) 2019 The R Foundation for Statistical Computing -Platform: x86_64-pc-linux-gnu (64-bit) +R version 4.2.1 (2022-06-23 ucrt) -- "Funny-Looking Kid" +Copyright (C) 2022 The R Foundation for Statistical Computing +Platform: x86_64-w64-mingw32/x64 (64-bit) R is free software and comes with ABSOLUTELY NO WARRANTY. You are welcome to redistribute it under certain conditions. @@ -223,6 +223,20 @@ style: headtails one of 101 possible partitions of this variable into 2 classes [0,18.92407) [18.92407,26] 20 100 +> print(classIntervals(data_censored, style="box", iqr_mult = 0)) +style: box + one of 79,208,745 possible partitions of this variable into 6 classes + [0,19.38567) [19.38567,19.38567) [19.38567,20.11391) [20.11391,20.77193) + 30 0 30 30 +[20.77193,20.77193) [20.77193,26] + 0 30 +> print(classIntervals(data_censored, style="box")) +style: box + one of 79,208,745 possible partitions of this variable into 6 classes + [0,17.30627) [17.30627,19.38567) [19.38567,20.11391) [20.11391,20.77193) + 10 20 30 30 +[20.77193,22.85133) [22.85133,26] + 20 10 > x <- c(0, 0, 0, 1, 2, 50) > print(classIntervals(x, n=3, style="fisher"), unique=TRUE) style: fisher @@ -276,4 +290,4 @@ Class found with one single (possibly repeated) value: changed label > > proc.time() user system elapsed - 0.164 0.028 0.183 + 0.37 0.06 0.68