diff --git a/collects/math/private/statistics/counting.rkt b/collects/math/private/statistics/counting.rkt index 69f4eeb578..00cc3fe433 100644 --- a/collects/math/private/statistics/counting.rkt +++ b/collects/math/private/statistics/counting.rkt @@ -12,7 +12,7 @@ sample-bin-compact sample-bin-total bin-samples - bin-weighted-samples) + bin-samples/key) ;; =================================================================================================== ;; Hashing @@ -100,65 +100,71 @@ (cond [(pred? x) (loop (rest xs) (cons x ys))] [else (values (reverse ys) xs)])]))) -(: bin-samples - (All (A B) - (case-> ((Sequenceof A) (A A -> Any) (Sequenceof A) -> (Listof (sample-bin A A))) - ((Sequenceof B) (B B -> Any) (Sequenceof A) (A -> B) -> (Listof (sample-bin A B)))))) -(define bin-samples - (case-lambda - [(bnds lte? xs) (bin-samples bnds lte? xs (λ: ([x : A]) x))] - [(bnds lte? xs key) - (let* ([bnds (sort (sequence->list bnds) (λ: ([b1 : B] [b2 : B]) (and (lte? b1 b2) #t)))] - [xs (sequence->list xs)] - [xks (map (λ: ([x : A]) (cons x (key x))) xs)] - [xks (sort xks (λ: ([xk1 : (Pair A B)] [xk2 : (Pair A B)]) - (and (lte? (cdr xk1) (cdr xk2)) #t)))]) - (cond - [(empty? bnds) - (cond [(empty? xks) empty] - [else (define min (cdr (first xks))) - (define max (cdr (last xks))) - (list (sample-bin min max (map (inst car A B) xks) #f))])] - [else - (let: loop : (Listof (sample-bin A B)) ([min : (U #f B) #f] - [max : B (first bnds)] - [bnds : (Listof B) (rest bnds)] - [xks : (Listof (Pair A B)) xks] - [bins : (Listof (sample-bin A B)) empty]) - (let-values ([(yks xks) (list-split-after xks (λ: ([xk : (Pair A B)]) - (lte? (cdr xk) max)))]) - (define maybe-bin - (cond [min (list (sample-bin min max (map (inst car A B) yks) #f))] - [(empty? yks) empty] - [else (list (sample-bin (cdr (first yks)) max (map (inst car A B) yks) #f))])) - (cond [(empty? bnds) - (cond [(empty? xks) (reverse (append maybe-bin bins))] - [else - (define bin2 - (sample-bin max (cdr (last xks)) (map (inst car A B) xks) #f)) - (reverse (append (cons bin2 maybe-bin) bins))])] - [else - (loop max (first bnds) (rest bnds) xks (append maybe-bin bins))])))]))])) +(: bin-unweighted-samples/key + (All (A B) ((Sequenceof B) (B B -> Any) (A -> B) (Sequenceof A) -> (Listof (sample-bin A B))))) +(define (bin-unweighted-samples/key bnds lte? key xs) + (: lt? (B B -> Boolean)) + (define (lt? x1 x2) + (and (lte? x1 x2) (not (lte? x2 x1)))) + (let* ([bnds (sort (sequence->list bnds) lt?)] + [xs (sequence->list xs)] + [xks (map (λ: ([x : A]) (cons x (key x))) xs)] + [xks (sort xks (λ: ([xk1 : (Pair A B)] [xk2 : (Pair A B)]) + (lt? (cdr xk1) (cdr xk2))))]) + (cond + [(empty? bnds) + (cond [(empty? xks) empty] + [else (define min (cdr (first xks))) + (define max (cdr (last xks))) + (list (sample-bin min max (map (inst car A B) xks) #f))])] + [else + (let: loop : (Listof (sample-bin A B)) ([min : (U #f B) #f] + [max : B (first bnds)] + [bnds : (Listof B) (rest bnds)] + [xks : (Listof (Pair A B)) xks] + [bins : (Listof (sample-bin A B)) empty]) + (let-values ([(yks xks) (list-split-after xks (λ: ([xk : (Pair A B)]) + (lte? (cdr xk) max)))]) + (define maybe-bin + (cond [min (list (sample-bin min max (map (inst car A B) yks) #f))] + [(empty? yks) empty] + [else (list (sample-bin (cdr (first yks)) max (map (inst car A B) yks) #f))])) + (cond [(empty? bnds) + (cond [(empty? xks) (reverse (append maybe-bin bins))] + [else + (define bin2 + (sample-bin max (cdr (last xks)) (map (inst car A B) xks) #f)) + (reverse (append (cons bin2 maybe-bin) bins))])] + [else + (loop max (first bnds) (rest bnds) xks (append maybe-bin bins))])))]))) -(: bin-weighted-samples - (All (A B) (case-> ((Sequenceof A) (A A -> Any) (Sequenceof A) (U #f (Sequenceof Real)) - -> (Listof (sample-bin A A))) - ((Sequenceof B) (B B -> Any) (Sequenceof A) (U #f (Sequenceof Real)) (A -> B) - -> (Listof (sample-bin A B)))))) -(define bin-weighted-samples - (case-lambda - [(bnds lte? xs ws) - (cond [ws (bin-weighted-samples bnds lte? xs ws (λ: ([x : A]) x))] - [else (bin-samples bnds lte? xs)])] - [(bnds lte? xs ws key) - (cond [ws (let-values ([(xs ws) (sequences->weighted-samples 'bin-samples xs ws)]) - (define xws (map (inst cons A Nonnegative-Real) xs ws)) - (define xw-key (λ: ([xw : (Pair A Nonnegative-Real)]) (key (car xw)))) - (map (λ: ([bin : (sample-bin (Pair A Nonnegative-Real) B)]) - (define xws (sample-bin-values bin)) - (sample-bin (sample-bin-min bin) - (sample-bin-max bin) - (map (inst car A Nonnegative-Real) xws) - (map (inst cdr A Nonnegative-Real) xws))) - (bin-samples bnds lte? xws xw-key)))] - [else (bin-samples bnds lte? xs key)])])) +(: bin-weighted-samples/key + (All (A B) ((Sequenceof B) (B B -> Any) (A -> B) (Sequenceof A) (Sequenceof Real) + -> (Listof (sample-bin A B))))) +(define (bin-weighted-samples/key bnds lte? key xs ws) + (let-values ([(xs ws) (sequences->weighted-samples 'bin-samples/key xs ws)]) + (define xws (map (inst cons A Nonnegative-Real) xs ws)) + (define xw-key (λ: ([xw : (Pair A Nonnegative-Real)]) (key (car xw)))) + (map (λ: ([bin : (sample-bin (Pair A Nonnegative-Real) B)]) + (define xws (sample-bin-values bin)) + (sample-bin (sample-bin-min bin) + (sample-bin-max bin) + (map (inst car A Nonnegative-Real) xws) + (map (inst cdr A Nonnegative-Real) xws))) + (bin-unweighted-samples/key bnds lte? xw-key xws)))) + +(: bin-samples/key + (All (A B) + (case-> ((Sequenceof B) (B B -> Any) (A -> B) (Sequenceof A) -> (Listof (sample-bin A B))) + ((Sequenceof B) (B B -> Any) (A -> B) (Sequenceof A) (U #f (Sequenceof Real)) + -> (Listof (sample-bin A B)))))) +(define (bin-samples/key bnds lte? key xs [ws #f]) + (cond [ws (bin-weighted-samples/key bnds lte? key xs ws)] + [else (bin-unweighted-samples/key bnds lte? key xs)])) + +(: bin-samples + (All (A) (case-> ((Sequenceof A) (A A -> Any) (Sequenceof A) -> (Listof (sample-bin A A))) + ((Sequenceof A) (A A -> Any) (Sequenceof A) (Sequenceof Real) + -> (Listof (sample-bin A A)))))) +(define (bin-samples bnds lte? xs [ws #f]) + (bin-samples/key bnds lte? (λ: ([x : A]) x) xs ws)) diff --git a/collects/math/private/statistics/order-statistics.rkt b/collects/math/private/statistics/order-statistics.rkt index 1baba8df78..10e9e4e431 100644 --- a/collects/math/private/statistics/order-statistics.rkt +++ b/collects/math/private/statistics/order-statistics.rkt @@ -1,69 +1,80 @@ #lang typed/racket/base (require racket/sequence + racket/list racket/fixnum "../../base.rkt" "quickselect.rkt" "statistics-utils.rkt") -(provide quantile +(provide sort-samples + quantile median - real-quantile - real-median - absolute-deviation/median - absolute-deviation) + absdev/median + absdev) + +(: sort-weighted-samples + (All (A) (Symbol (A A -> Any) (Sequenceof A) (Sequenceof Real) + -> (Values (Listof A) (Listof Nonnegative-Real))))) +(define (sort-weighted-samples name lt? xs ws) + (let-values ([(xs ws) (sequences->weighted-samples name xs ws)]) + (define xws ((inst sort (Pair A Nonnegative-Real) Real) + (map (inst cons A Nonnegative-Real) xs ws) + (λ: ([xw1 : (Pair A Nonnegative-Real)] + [xw2 : (Pair A Nonnegative-Real)]) + (and (lt? (car xw1) (car xw2)) #t)))) + (values (map (inst car A Nonnegative-Real) xws) + (map (inst cdr A Nonnegative-Real) xws)))) + +(: sort-samples (All (A) (case-> ((A A -> Any) (Sequenceof A) -> (Listof A)) + ((A A -> Any) (Sequenceof A) (U #f (Sequenceof Real)) + -> (Values (Listof A) (Listof Nonnegative-Real)))))) +(define sort-samples + (case-lambda + [(lt? xs) + (sort (sequence->list xs) + (λ: ([x1 : A] [x2 : A]) + (and (lt? x1 x2) #t)))] + [(lt? xs ws) + (cond [ws (sort-weighted-samples 'sort-samples lt? xs ws)] + [else (define ys (sort (sequence->list xs) + (λ: ([x1 : A] [x2 : A]) + (and (lt? x1 x2) #t)))) + (values ys (make-list (length ys) 1))])])) (: quantile (All (A) (case-> (Real (A A -> Any) (Sequenceof A) -> A) - (Real (A A -> Any) (Sequenceof A) (Option (Sequenceof Real)) -> A)))) + (Real (A A -> Any) (Sequenceof A) (U #f (Sequenceof Real)) -> A)))) (define (quantile p lt? xs [ws #f]) (cond [(or (p . < . 0) (p . > . 1)) - (raise-argument-error 'quantile "Real in [0,1]" 0 p < xs ws)] + (raise-argument-error 'quantile "Real in [0,1]" 0 p lt? xs ws)] [ws - (let-values ([(xs ws) (sequences->weighted-samples 'median xs ws)]) - (define xws ((inst sort (Pair A Nonnegative-Real) Real) - (map (inst cons A Nonnegative-Real) xs ws) - (λ: ([xw1 : (Pair A Nonnegative-Real)] - [xw2 : (Pair A Nonnegative-Real)]) - (if (lt? (car xw1) (car xw2)) #t #f)))) - (let ([xs (map (inst car A Nonnegative-Real) xws)] - [ws (map (inst cdr A Nonnegative-Real) xws)]) - (define total-w (sum ws)) - (cond [(zero? total-w) - (raise-argument-error 'quantile "weights with positive sum" 3 p lt? xs ws)] - [else - (define thresh (* p total-w)) - (let loop ([xs (cdr xs)] [ws (cdr ws)] [x (car xs)] [s (car ws)]) - (cond [(s . > . thresh) x] - [(null? xs) x] - [else (loop (cdr xs) (cdr ws) (car xs) (+ s (car ws)))]))])))] + (let-values ([(xs ws) (sort-weighted-samples 'quantile lt? xs ws)]) + (define total-w (sum ws)) + (cond [(zero? total-w) + (raise-argument-error 'quantile "weights with positive sum" 3 p lt? xs ws)] + [else + (let loop ([xs (cdr xs)] [ws (cdr ws)] [x (car xs)] [s (car ws)]) + (cond [((/ s total-w) . >= . p) x] + [(null? xs) x] + [else (loop (cdr xs) (cdr ws) (car xs) (+ s (car ws)))]))]))] [else (let ([xs (sequence->vector xs)]) (define n (vector-length xs)) (cond [(n . fx<= . 0) (raise-argument-error 'quantile "nonempty Sequence" 2 p lt? xs)] - [else (kth-value! xs (min (- n 1) (exact-ceiling (* p n))) lt?)]))])) + [else + (define i (max 0 (- (exact-ceiling (* p n)) 1))) + (kth-value! xs i lt?)]))])) (: median (All (A) (case-> ((A A -> Any) (Sequenceof A) -> A) - ((A A -> Any) (Sequenceof A) (Option (Sequenceof Real)) -> A)))) + ((A A -> Any) (Sequenceof A) (U #f (Sequenceof Real)) -> A)))) (define (median lt? xs [ws #f]) - (cond [ws (quantile 1/2 lt? xs ws)] - [else - (let ([xs (sequence->vector xs)]) - (define n (vector-length xs)) - (cond [(n . fx<= . 0) (raise-argument-error 'median "nonempty Sequence" xs)] - [else (kth-value! xs (quotient n 2) lt?)]))])) + (quantile 1/2 lt? xs ws)) -(: real-quantile (case-> (Real (Sequenceof Real) -> Real) - (Real (Sequenceof Real) (Option (Sequenceof Real)) -> Real))) -(define (real-quantile p xs [ws #f]) - (quantile p < xs ws)) +;; =================================================================================================== +;; Absolute deviation -(: real-median (case-> ((Sequenceof Real) -> Real) - ((Sequenceof Real) (Option (Sequenceof Real)) -> Real))) -(define (real-median xs [ws #f]) - (median < xs ws)) - -(: absolute-deviation* (Symbol Real (Sequenceof Real) (Option (Sequenceof Real)) -> Nonnegative-Real)) -(define (absolute-deviation* name m xs ws) +(: absdev* (Symbol Real (Sequenceof Real) (Option (Sequenceof Real)) -> Nonnegative-Real)) +(define (absdev* name m xs ws) (define-values (axs n) (cond [ws (let-values ([(xs ws) (sequences->weighted-samples name xs ws)]) (values (map (λ: ([x : Real] [w : Real]) (* w (abs (- x m)))) xs ws) @@ -75,14 +86,14 @@ [else (max 0 (/ (sum axs) n))])) -(: absolute-deviation/median +(: absdev/median (case-> (Real (Sequenceof Real) -> Nonnegative-Real) (Real (Sequenceof Real) (Option (Sequenceof Real)) -> Nonnegative-Real))) -(define (absolute-deviation/median m xs [ws #f]) - (absolute-deviation* 'absolute-deviation/median m xs ws)) - -(: absolute-deviation +(define (absdev/median m xs [ws #f]) + (absdev* 'absdev/median m xs ws)) + +(: absdev (case-> ((Sequenceof Real) -> Nonnegative-Real) ((Sequenceof Real) (Option (Sequenceof Real)) -> Nonnegative-Real))) -(define (absolute-deviation xs [ws #f]) - (absolute-deviation* 'absolute-deviation (real-median xs ws) xs ws)) +(define (absdev xs [ws #f]) + (absdev* 'absdev (median < xs ws) xs ws)) diff --git a/collects/math/private/statistics/statistics-struct.rkt b/collects/math/private/statistics/statistics-struct.rkt index 822ffae200..6fbdfb0c22 100644 --- a/collects/math/private/statistics/statistics-struct.rkt +++ b/collects/math/private/statistics/statistics-struct.rkt @@ -60,11 +60,12 @@ (case-> (statistics (Sequenceof Real) -> statistics) (statistics (Sequenceof Real) (Option (Sequenceof Real)) -> statistics))) (define (update-statistics* e xs [ws #f]) - (cond [ws (let-values ([(xs ws) (sequences->weighted-samples 'update-statistics* xs ws)]) - (for/fold: ([e : statistics e]) ([x (in-list xs)] [w (in-list ws)]) - (update-statistics e x w)))] - [else (for/fold: ([e : statistics e]) ([x xs]) - (update-statistics e x 1.0))])) + (cond [ws + (for/fold: ([e : statistics e]) ([x xs] [w ws]) + (update-statistics e x w))] + [else + (for/fold: ([e : statistics e]) ([x xs]) + (update-statistics e x 1.0))])) ;; =================================================================================================== diff --git a/collects/math/private/statistics/statistics-utils.rkt b/collects/math/private/statistics/statistics-utils.rkt index dbb3703a57..52c6f1d9bd 100644 --- a/collects/math/private/statistics/statistics-utils.rkt +++ b/collects/math/private/statistics/statistics-utils.rkt @@ -9,6 +9,10 @@ ;; =================================================================================================== +(: lte?->lt? (All (A) ((A A -> Any) -> (A A -> Boolean)))) +(define ((lte?->lt? lte?) x1 x2) + (and (lte? x1 x2) (not (lte? x2 x1)))) + (: find-near-pow2 (Real -> Nonnegative-Exact-Rational)) (define (find-near-pow2 x) (expt 2 (max -1073 (min 1023 (exact-round (/ (log (abs x)) (fllog 2.0))))))) diff --git a/collects/math/scribblings/math-statistics.scrbl b/collects/math/scribblings/math-statistics.scrbl index b052b638ce..f4f9eb31c8 100644 --- a/collects/math/scribblings/math-statistics.scrbl +++ b/collects/math/scribblings/math-statistics.scrbl @@ -5,10 +5,11 @@ (for-label racket/base racket/promise racket/list math plot (only-in typed/racket/base - ann + ann inst Flonum Real Boolean Any Listof Integer case-> -> U - Sequenceof Positive-Flonum Nonnegative-Flonum - HashTable Positive-Integer Nonnegative-Real Values)) + Sequenceof Positive-Flonum Nonnegative-Flonum Symbol + HashTable Positive-Integer Nonnegative-Real Values + String)) "utils.rkt") @(define typed-eval (make-math-eval)) @@ -34,89 +35,6 @@ See @secref{stats:expected-values} for a discussion. @local-table-of-contents[] -@section{Counting and Binning} - -@defproc*[([(samples->hash [xs (Sequenceof A)]) (HashTable A Positive-Integer)] - [(samples->hash [xs (Sequenceof A)] [ws (U #f (Sequenceof Real))]) - (HashTable A Nonnegative-Real)])]{ -@examples[#:eval typed-eval - (samples->hash '(1 2 3 4 4)) - (samples->hash '(1 1 2 3 4) '(1/2 1/2 1 1 2))] -} - -@defproc*[([(count-samples [xs (Sequenceof A)]) (Values (Listof A) (Listof Positive-Integer))] - [(count-samples [xs (Sequenceof A)] [ws (U #f (Sequenceof Real))]) - (Values (Listof A) (Listof Nonnegative-Real))])]{ -@examples[#:eval typed-eval - (count-samples '(1 2 3 4 4)) - (count-samples '(1 1 2 3 4) '(1/2 1/2 1 1 2))] -} - -@defstruct*[sample-bin ([min B] - [max B] - [values (Listof A)] - [weights (U #f (Listof Nonnegative-Real))])]{ -Represents a @italic{bin}, or a group of samples within an interval in a total order. -The values and bounds have a different type to allow @racket[bin-samples] and -@racket[bin-weighted-samples] to group elements based on a function of their values (a @racket[key], -like in @racket[sort]). -} - -@defproc*[([(bin-samples [bounds (Sequenceof A)] - [lte? (A A -> Any)] - [xs (Sequenceof A)]) - (Listof (sample-bin A A))] - [(bin-samples [bounds (Sequenceof B)] - [lte? (B B -> Any)] - [xs (Sequenceof A)] - [key (A -> B)]) - (Listof (sample-bin A B))])]{ -Like @racket[(sort xs lte? #:key key)], but additionally groups samples into bins. -Keys are always cached, and @racket[bounds] is sorted before binning. - -If @racket[n = (length bounds)], then @racket[bin-samples] returns @italic{at least} @racket[(- n 1)] -bins, one for each pair of adjacent (sorted) bounds. -If some values in @racket[xs] are less than the smallest bound, they are grouped into a single bin in -front. -If some are greater than the largest bound, they are grouped into a single bin at the end. - -@examples[#:eval typed-eval - (bin-samples '() <= '(0 1 2 3 4 5 6)) - (bin-samples '(3) <= '(0 1 2 3 4 5 6)) - (bin-samples '(2 4) <= '(0 1 2 3 4 5 6))] - -Note that @racket[bin-samples] always returns bins with @racket[#f] weights, meaning they contain -unweighted samples. -} - -@defproc*[([(bin-weighted-samples [xs (Sequenceof A)] - [ws (Sequenceof Real)] - [bounds (Sequenceof A)] - [lte? (A A -> Any)]) - (Listof (sample-bin A A))] - [(bin-weighted-samples [xs (Sequenceof A)] - [ws (Sequenceof Real)] - [bounds (Sequenceof B)] - [lte? (B B -> Any)] - [key (A -> B)]) - (Listof (sample-bin A B))])]{ -Like @racket[bin-samples], but for weighted samples. -} - -@defproc[(sample-bin-compact [bin (sample-bin A B)]) (sample-bin A B)]{ -Compacts @racket[bin] by applying @racket[count-samples] to its values and weights. -@examples[#:eval typed-eval - (sample-bin-compact (sample-bin 1 4 '(1 2 3 4 4) #f))] -} - -@defproc[(sample-bin-total [bin (sample-bin A B)]) Nonnegative-Real]{ -If @racket[(sample-bin-weights bin)] is @racket[#f], returns the number of samples in @racket[bin]; -otherwise, returns the sum of their weights. -@examples[#:eval typed-eval - (sample-bin-total (sample-bin 1 4 '(1 2 3 4 4) #f)) - (sample-bin-total (sample-bin-compact (sample-bin 1 4 '(1 2 3 4 4) #f)))] -} - @section[#:tag "stats:expected-values"]{Expected Values} Functions documented in this section that compute higher central moments, such as @racket[variance], @@ -206,7 +124,7 @@ using known mean @racket[mean]. @section[#:tag "stats:running"]{Running Expected Values} The @racket[statistics] object allows computing the sample minimum, maximum, count, mean, variance, -skewness, and excess kurtosis of any number of samples in O(1) space. +skewness, and excess kurtosis of a sequence of samples in O(1) space. To use it, start with @racket[empty-statistics], then use @racket[update-statistics] to obtain a new statistics object with updated values. Use @racket[statistics-min], @racket[statistics-mean], @@ -326,6 +244,148 @@ Like @racket[covariance] and @racket[correlation], but computed using known mean @racket[μx] and @racket[μy]. } + @section{Counting and Binning} + +@defproc*[([(samples->hash [xs (Sequenceof A)]) (HashTable A Positive-Integer)] + [(samples->hash [xs (Sequenceof A)] [ws (U #f (Sequenceof Real))]) + (HashTable A Nonnegative-Real)])]{ +@examples[#:eval typed-eval + (samples->hash '(1 2 3 4 4)) + (samples->hash '(1 1 2 3 4) '(1/2 1/2 1 1 2))] +} + +@defproc*[([(count-samples [xs (Sequenceof A)]) (Values (Listof A) (Listof Positive-Integer))] + [(count-samples [xs (Sequenceof A)] [ws (U #f (Sequenceof Real))]) + (Values (Listof A) (Listof Nonnegative-Real))])]{ +@examples[#:eval typed-eval + (count-samples '(1 2 3 4 4)) + (count-samples '(1 1 2 3 4) '(1/2 1/2 1 1 2))] +} + +@defstruct*[sample-bin ([min B] + [max B] + [values (Listof A)] + [weights (U #f (Listof Nonnegative-Real))])]{ +Represents a @italic{bin}, or a group of samples within an interval in a total order. +The values and bounds have a different type to allow @racket[bin-samples/key] +to group elements based on a function of their values. +} + +@defproc[(bin-samples [bounds (Sequenceof A)] + [lte? (A A -> Any)] + [xs (Sequenceof A)] + [ws (U #f (Sequenceof Real))]) + (Listof (sample-bin A A))]{ +Like @racket[(sort xs lte?)], but additionally groups samples into bins. +The bins' @racket[bounds] are sorted before binning @racket[xs]. + +If @racket[n = (length bounds)], then @racket[bin-samples] returns @italic{at least} @racket[(- n 1)] +bins, one for each pair of adjacent (sorted) bounds. +If some values in @racket[xs] are less than the smallest bound, they are grouped into a single bin in +front. +If some are greater than the largest bound, they are grouped into a single bin at the end. + +@examples[#:eval typed-eval + (bin-samples '() <= '(0 1 2 3 4 5 6)) + (bin-samples '(3) <= '(0 1 2 3 4 5 6)) + (bin-samples '(2 4) <= '(0 1 2 3 4 5 6)) + (bin-samples '(2 4) <= + '(0 1 2 3 4 5 6) + '(10 20 30 40 50 60 70))] + +If @racket[lte?] is a less-than-or-equal relation, the bins represent half-open intervals +(@racket[min], @racket[max]] (except possibly the first, which may be closed). +If @racket[lte?] is a less-than relation, the bins represent half-open intervals +[@racket[min], @racket[max]) (except possibly the last, which may be closed). +In either case, the sorts applied to @racket[bounds] and @racket[xs] are stable. + +Because intervals used in probability measurements are normally open on the left, prefer to use +less-than-or-equal relations for @racket[lte?]. + +If @racket[ws] is @racket[#f], @racket[bin-samples] returns bins with @racket[#f] weights. +} + +@defproc[(bin-samples/key [bounds (Sequenceof B)] + [lte? (B B -> Any)] + [key (A -> B)] + [xs (Sequenceof A)] + [ws (U #f (Sequenceof Real))]) + (Listof (sample-bin A B))]{ +Like @racket[(sort xs lte? #:key key #:cache-keys? #t)], but additionally groups samples into bins. +@examples[#:eval typed-eval + (bin-samples/key '(2 4) <= (inst car Real String) + '((1 . "1") (2 . "2") (3 . "3") (4 . "4") (5 . "5")))] +See @racket[bin-samples] for the simpler, one-type variant. +} + +@defproc[(sample-bin-compact [bin (sample-bin A B)]) (sample-bin A B)]{ +Compacts @racket[bin] by applying @racket[count-samples] to its values and weights. +@examples[#:eval typed-eval + (sample-bin-compact (sample-bin 1 4 '(1 2 3 4 4) #f))] +} + +@defproc[(sample-bin-total [bin (sample-bin A B)]) Nonnegative-Real]{ +If @racket[(sample-bin-weights bin)] is @racket[#f], returns the number of samples in @racket[bin]; +otherwise, returns the sum of their weights. +@examples[#:eval typed-eval + (sample-bin-total (sample-bin 1 4 '(1 2 3 4 4) #f)) + (sample-bin-total (sample-bin-compact (sample-bin 1 4 '(1 2 3 4 4) #f)))] +} + @section{Order Statistics} +@defproc*[([(sort-samples [lt? (A A -> Any)] [xs (Sequenceof A)]) (Listof A)] + [(sort-samples [lt? (A A -> Any)] + [xs (Sequenceof A)] + [ws (U #f (Sequenceof Real))]) + (Values (Listof A) (Listof Nonnegative-Real))])]{ +Sorts possibly weighted samples according to @racket[lt?], which is assumed to define a total +order over the elements in @racket[xs]. +@examples[#:eval typed-eval + (sort-samples < '(5 2 3 1)) + (sort-samples < '(5 2 3 1) '(50 20 30 10)) + (sort-samples < '(5 2 3 1) #f)] +Because @racket[sort-samples] is defined in terms of @racket[sort], the sort is only guaranteed +to be stable if @racket[lt?] is strictly a less-than relation. +} + +@defproc[(median [lt? (A A -> Any)] [xs (Sequenceof A)] [ws (U #f (Sequenceof Real)) #f]) A]{ +Equivalent to @racket[(quantile 1/2 lt? xs ws)]. +} + +@defproc[(quantile [p Real] + [lt? (A A -> Any)] + [xs (Sequenceof A)] + [ws (U #f (Sequenceof Real)) #f]) + A]{ +Computes the inverse of the empirical @tech{cdf} represented by the samples @racket[xs], +which are optionally weighted by @racket[ws]. + +@examples[#:eval typed-eval + (quantile 0 < '(1 3 5)) + (quantile 0.5 < '(1 2 3 4)) + (quantile 0.5 < '(1 2 3 4) '(0.25 0.20 0.20 0.35))] + +If @racket[p = 0], @racket[quantile] returns the smallest element of @racket[xs] under the +ordering relation @racket[lt?]. If @racket[p = 1], it returns the largest element. + +For weighted samples, @racket[quantile] sorts @racket[xs] and @racket[ws] together +(using @racket[sort-samples]), then finds the least @racket[x] for which the proportion of its +cumulative weight is greater than or equal to @racket[p]. + +For unweighted samples, @racket[quantile] uses the quickselect algorithm to find the element that +would be at index @racket[(ceiling (- (* p n) 1))] if @racket[xs] were sorted, where @racket[n] +is the length of @racket[xs]. +} + +@defproc[(absdev [xs (Sequenceof Real)] [ws (U #f (Sequenceof Real)) #f]) Nonnegative-Real]{ +Computes the average absolute difference between the elements in @racket[xs] and +@racket[(median < xs ws)]. If @racket[ws] is not @racket[#f], it is a weighted average. +} + +@defproc[(absdev/median [median Real] [xs (Sequenceof Real)] [ws (U #f (Sequenceof Real)) #f]) + Nonnegative-Real]{ +Like @racket[(absdev xs ws)], but computed using known median @racket[median]. +} + @(close-eval typed-eval) diff --git a/collects/math/tests/statistics-tests.rkt b/collects/math/tests/statistics-tests.rkt index a1a80a390f..0454031199 100644 --- a/collects/math/tests/statistics-tests.rkt +++ b/collects/math/tests/statistics-tests.rkt @@ -92,3 +92,26 @@ (list (sample-bin 1 3 '(1 1 2 2 2 3) #f) (sample-bin 3 8 '(4 5 5 5 5 6 7 8) #f) (sample-bin 8 9 '(9 9) #f))) + +(check-equal? + (bin-samples '(3 8) <= + '(1 1 2 2 2 3 4 5 5 5 5 6 7 8 9 9) + '(1 1.1 2 2.1 2.2 3 4 5 5.1 5.2 5.3 6 7 8 9 9.1)) + (list (sample-bin 1 3 '(1 1 2 2 2 3) '(1 1.1 2 2.1 2.2 3)) + (sample-bin 3 8 '(4 5 5 5 5 6 7 8) '(4 5 5.1 5.2 5.3 6 7 8)) + (sample-bin 8 9 '(9 9) '(9 9.1)))) + +(check-equal? + (bin-samples/key '(2 4) <= (inst car Real Symbol) + '((1 . a) (2 . b) (3 . c) (4 . d) (5 . e))) + (list (sample-bin 1 2 '((1 . a) (2 . b)) #f) + (sample-bin 2 4 '((3 . c) (4 . d)) #f) + (sample-bin 4 5 '((5 . e)) #f))) + +(for: ([p '(0 1/6 2/6 3/6 4/6 5/6 6/6)]) + (check-equal? (quantile p < '(1 2 3) '(1 1 1)) + (quantile p < '(1 2 3)))) + +(for: ([p '(0 1/8 2/8 3/8 4/8 5/8 6/8 7/8 8/8)]) + (check-equal? (quantile p < '(1 2 3 4) '(1 1 1 1)) + (quantile p < '(1 2 3 4))))