From 73395eed943796e5b7eaa7ee96f136802d5bec46 Mon Sep 17 00:00:00 2001 From: Neil Toronto Date: Sat, 8 Dec 2012 22:11:12 -0700 Subject: [PATCH] Finished and documented counting and binning functions in `math/statistics' --- collects/math/private/statistics/counting.rkt | 186 ++++++++++++++---- .../math/scribblings/math-statistics.scrbl | 89 ++++++++- collects/math/statistics.rkt | 2 +- collects/math/tests/statistics-tests.rkt | 94 +++++++++ 4 files changed, 322 insertions(+), 49 deletions(-) create mode 100644 collects/math/tests/statistics-tests.rkt diff --git a/collects/math/private/statistics/counting.rkt b/collects/math/private/statistics/counting.rkt index 9c9fb99caf..51004b9d8f 100644 --- a/collects/math/private/statistics/counting.rkt +++ b/collects/math/private/statistics/counting.rkt @@ -3,54 +3,162 @@ (require racket/list racket/sequence "../../base.rkt" - "../unsafe.rkt") + "../unsafe.rkt" + "statistics-utils.rkt") (provide samples->hash count-samples (struct-out sample-bin) - Real-Bin - bin-real-samples) + sample-bin-compact + sample-bin-total + bin-samples + bin-weighted-samples) -(struct: (A) sample-bin ([min : A] [max : A] [values : (Listof A)]) #:transparent) +;; =================================================================================================== +;; Hashing -(define-type Real-Bin (sample-bin Real)) - -(: samples->hash (All (A) ((Sequenceof A) -> (HashTable A Positive-Integer)))) -(define (samples->hash xs) +(: unweighted-samples->hash (All (A) ((Sequenceof A) -> (HashTable A Positive-Integer)))) +(define (unweighted-samples->hash xs) (define: h : (HashTable A Positive-Integer) (make-hash)) (for: ([x : A xs]) (hash-set! h x (unsafe-fx+ 1 (hash-ref h x (λ () 0))))) h) -(: count-samples (All (A) ((Sequenceof A) -> (Values (Listof A) (Listof Positive-Integer))))) -(define (count-samples xs) - (define h (samples->hash xs)) - (define xws (hash-map h (λ: ([x : A] [c : Positive-Integer]) (cons x c)))) - (values (map (λ: ([xw : (Pair A Positive-Integer)]) (car xw)) xws) - (map (λ: ([xw : (Pair A Positive-Integer)]) (cdr xw)) xws))) +(: weighted-samples->hash (All (A) ((Sequenceof A) (Sequenceof Real) + -> (HashTable A Nonnegative-Real)))) +(define (weighted-samples->hash xs ws) + (let-values ([(xs ws) (sequences->weighted-samples 'samples->hash xs ws)]) + (define: h : (HashTable A Nonnegative-Real) (make-hash)) + (for: ([x : A xs] [w : Nonnegative-Real ws]) + (hash-set! h x (+ w (hash-ref h x (λ () 0))))) + h)) -(: bin-real-samples ((Sequenceof Real) (Sequenceof Real) -> (Listof Real-Bin))) -(define (bin-real-samples bin-bounds xs) - (let* ([bin-bounds (list* -inf.0 +inf.0 (sequence->list bin-bounds))] - [bin-bounds (filter (λ: ([x : Real]) (not (eqv? x +nan.0))) - (remove-duplicates bin-bounds))] - [bin-bounds (sort bin-bounds <)] - [x-min (first bin-bounds)] - [x-max (last bin-bounds)] - [xs (sequence->list xs)] - [xs (filter (λ: ([x : Real]) (<= x-min x x-max)) xs)] - [xs (sort xs <)]) - (define-values (res rest-xs) - (for/fold: ([res : (Listof Real-Bin) empty] - [xs : (Listof Real) xs] - ) ([x1 (in-list bin-bounds)] - [x2 (in-list (rest bin-bounds))]) - (define-values (lst rest-xs) - (let: loop : (Values (Listof Real) (Listof Real)) ([lst : (Listof Real) empty] - [xs : (Listof Real) xs]) - (if (and (not (empty? xs)) (<= x1 (first xs) x2)) - (loop (cons (first xs) lst) (rest xs)) - (values lst xs)))) - (cond [(empty? lst) (values res rest-xs)] - [else (values (cons (sample-bin x1 x2 (reverse lst)) res) rest-xs)]))) - (reverse res))) +(: samples->hash + (All (A) (case-> ((Sequenceof A) -> (HashTable A Positive-Integer)) + ((Sequenceof A) (U #f (Sequenceof Real)) -> (HashTable A Nonnegative-Real))))) +(define samples->hash + (case-lambda + [(xs) (unweighted-samples->hash xs)] + [(xs ws) (cond [ws (weighted-samples->hash xs ws)] + [else (weighted-samples->hash xs (sequence-map (λ (_) 1) xs))])])) + +;; =================================================================================================== +;; Counting + +(: count-unweighted-samples + (All (A) ((Sequenceof A) -> (Values (Listof A) (Listof Positive-Integer))))) +(define (count-unweighted-samples xs) + (let ([xs (sequence->list xs)]) + (define h (samples->hash xs)) + (let ([xs (remove-duplicates xs)]) + (values xs (map (λ: ([x : A]) (hash-ref h x)) xs))))) + +(: count-weighted-samples + (All (A) ((Sequenceof A) (Sequenceof Real) -> (Values (Listof A) (Listof Nonnegative-Real))))) +(define (count-weighted-samples xs ws) + (let-values ([(xs ws) (sequences->weighted-samples 'count-samples xs ws)]) + (define h (weighted-samples->hash xs ws)) + (let ([xs (remove-duplicates xs)]) + (values xs (map (λ: ([x : A]) (hash-ref h x)) xs))))) + +(: count-samples + (All (A) (case-> ((Sequenceof A) -> (Values (Listof A) (Listof Positive-Integer))) + ((Sequenceof A) (U #f (Sequenceof Real)) + -> (Values (Listof A) (Listof Nonnegative-Real)))))) +(define count-samples + (case-lambda + [(xs) (count-unweighted-samples xs)] + [(xs ws) (if ws (count-weighted-samples xs ws) (count-unweighted-samples xs))])) + +;; =================================================================================================== +;; Bins + +(struct: (A B) sample-bin + ([values : (Listof A)] [weights : (U #f (Listof Nonnegative-Real))] [min : B] [max : B]) + #:transparent) + +(: sample-bin-compact (All (A B) ((sample-bin A B) -> (sample-bin A B)))) +(define (sample-bin-compact bin) + (let-values ([(xs ws) (count-samples (sample-bin-values bin) (sample-bin-weights bin))]) + (sample-bin xs ws (sample-bin-min bin) (sample-bin-max bin)))) + +(: sample-bin-total (All (A B) ((sample-bin A B) -> Nonnegative-Real))) +(define (sample-bin-total bin) + (define ws (sample-bin-weights bin)) + (if ws (assert (sum ws) nonnegative?) (length (sample-bin-values bin)))) + +;; =================================================================================================== +;; Binning + +(: list-split-after (All (A) ((Listof A) (A -> Any) -> (Values (Listof A) (Listof A))))) +(define (list-split-after xs pred?) + (let: loop : (Values (Listof A) (Listof A)) ([xs : (Listof A) xs] + [ys : (Listof A) empty]) + (cond [(empty? xs) (values (reverse ys) xs)] + [else + (define x (first xs)) + (cond [(pred? x) (loop (rest xs) (cons x ys))] + [else (values (reverse ys) xs)])]))) + +(: bin-samples + (All (A B) + (case-> ((Sequenceof A) (Sequenceof A) (A A -> Any) -> (Listof (sample-bin A A))) + ((Sequenceof A) (Sequenceof B) (B B -> Any) (A -> B) -> (Listof (sample-bin A B)))))) +(define bin-samples + (case-lambda + [(xs bnds lte?) (bin-samples xs bnds lte? (λ: ([x : A]) x))] + [(xs bnds lte? key) + (let* ([bnds (sort (sequence->list bnds) (λ: ([b1 : B] [b2 : B]) (and (lte? b1 b2) #t)))] + [xs (sequence->list xs)] + [xks (map (λ: ([x : A]) (cons x (key x))) xs)] + [xks (sort xks (λ: ([xk1 : (Pair A B)] [xk2 : (Pair A B)]) + (and (lte? (cdr xk1) (cdr xk2)) #t)))]) + (cond + [(empty? bnds) + (cond [(empty? xks) empty] + [else (define min (cdr (first xks))) + (define max (cdr (last xks))) + (list (sample-bin (map (inst car A B) xks) #f min max))])] + [else + (let: loop : (Listof (sample-bin A B)) ([min : (U #f B) #f] + [max : B (first bnds)] + [bnds : (Listof B) (rest bnds)] + [xks : (Listof (Pair A B)) xks] + [bins : (Listof (sample-bin A B)) empty]) + (let-values ([(yks xks) (list-split-after xks (λ: ([xk : (Pair A B)]) + (lte? (cdr xk) max)))]) + (define maybe-bin + (cond [min (list (sample-bin (map (inst car A B) yks) #f min max))] + [(empty? yks) empty] + [else (list (sample-bin (map (inst car A B) yks) #f (cdr (first yks)) max))])) + (cond [(empty? bnds) + (cond [(empty? xks) (reverse (append maybe-bin bins))] + [else + (define bin2 + (sample-bin (map (inst car A B) xks) #f max (cdr (last xks)))) + (reverse (append (cons bin2 maybe-bin) bins))])] + [else + (loop max (first bnds) (rest bnds) xks (append maybe-bin bins))])))]))])) + +(: bin-weighted-samples + (All (A B) (case-> ((Sequenceof A) (U #f (Sequenceof Real)) (Sequenceof A) (A A -> Any) + -> (Listof (sample-bin A A))) + ((Sequenceof A) (U #f (Sequenceof Real)) (Sequenceof B) (B B -> Any) (A -> B) + -> (Listof (sample-bin A B)))))) +(define bin-weighted-samples + (case-lambda + [(xs ws bnds lte?) + (cond [ws (bin-weighted-samples xs ws bnds lte? (λ: ([x : A]) x))] + [else (bin-samples xs bnds lte?)])] + [(xs ws bnds lte? key) + (cond [ws (let-values ([(xs ws) (sequences->weighted-samples 'bin-samples xs ws)]) + (define xws (map (inst cons A Nonnegative-Real) xs ws)) + (define xw-key (λ: ([xw : (Pair A Nonnegative-Real)]) (key (car xw)))) + (map (λ: ([bin : (sample-bin (Pair A Nonnegative-Real) B)]) + (define xws (sample-bin-values bin)) + (sample-bin (map (inst car A Nonnegative-Real) xws) + (map (inst cdr A Nonnegative-Real) xws) + (sample-bin-min bin) + (sample-bin-max bin))) + (bin-samples xws bnds lte? xw-key)))] + [else (bin-samples xs bnds lte? key)])])) diff --git a/collects/math/scribblings/math-statistics.scrbl b/collects/math/scribblings/math-statistics.scrbl index cd802593cc..96d73b2eee 100644 --- a/collects/math/scribblings/math-statistics.scrbl +++ b/collects/math/scribblings/math-statistics.scrbl @@ -7,7 +7,7 @@ (only-in typed/racket/base Flonum Real Boolean Any Listof Integer case-> -> U Sequenceof Positive-Flonum Nonnegative-Flonum - Nonnegative-Real)) + HashTable Positive-Integer Nonnegative-Real Values)) "utils.rkt") @(define typed-eval (make-math-eval)) @@ -18,12 +18,13 @@ @defmodule[math/statistics] -This module exports functions that compute summary values for collections of data, or +This module exports functions that compute summary values for collections of samples, or @deftech{statistics}, such as means, standard devations, medians, and @italic{k}th order -statistics. It also exports functions for managing collections of sample values. +statistics. +It also exports functions for managing collections of samples. Most of the functions that compute statistics also accept a sequence of nonnegative reals -that correspond one-to-one with the data values. +that correspond one-to-one with samples. These are used as weights; equivalently counts, pseudocounts or proportions. While this makes it easy to work with weighted samples, it introduces some subtleties in bias correction. @@ -32,14 +33,84 @@ See @secref{stats:expected-values} for a discussion. @local-table-of-contents[] -@section{Counting} +@section{Counting and Binning} -@defthing[samples->hash Any]{ -This stub represents forthcoming documentation. +@defproc*[([(samples->hash [xs (Sequenceof A)]) (HashTable A Positive-Integer)] + [(samples->hash [xs (Sequenceof A)] [ws (U #f (Sequenceof Real))]) + (HashTable A Nonnegative-Real)])]{ +@examples[#:eval typed-eval + (samples->hash '(1 2 3 4 4)) + (samples->hash '(1 1 2 3 4) '(1/2 1/2 1 1 2))] } -@defthing[count-samples Any]{ -This stub represents forthcoming documentation. +@defproc*[([(count-samples [xs (Sequenceof A)]) (Values (Listof A) (Listof Positive-Integer))] + [(count-samples [xs (Sequenceof A)] [ws (U #f (Sequenceof Real))]) + (Values (Listof A) (Listof Nonnegative-Real))])]{ +@examples[#:eval typed-eval + (count-samples '(1 2 3 4 4)) + (count-samples '(1 1 2 3 4) '(1/2 1/2 1 1 2))] +} + +@defstruct*[sample-bin ([values (Listof A)] + [weights (U #f (Listof Nonnegative-Real))] + [min B] [max B])]{ +Represents a @italic{bin}, or a group of samples within an interval in a total order. +The values and bounds have a different type to allow @racket[bin-samples] and +@racket[bin-weighted-samples] to group elements based on a function of their values (a @racket[key], +like in @racket[sort]). +} + +@defproc*[([(bin-samples [xs (Sequenceof A)] [bounds (Sequenceof A)] [lte? (A A -> Any)]) + (Listof (sample-bin A A))] + [(bin-samples [xs (Sequenceof A)] + [bounds (Sequenceof B)] + [lte? (B B -> Any)] + [key (A -> B)]) + (Listof (sample-bin A B))])]{ +Like @racket[(sort xs lte? #:key key)], but additionally groups samples into bins. +Keys are always cached, and @racket[bounds] is sorted before binning. + +If @racket[n = (length bounds)], then @racket[bin-samples] returns @italic{at least} @racket[(- n 1)] +bins, one for each pair of adjacent (sorted) bounds. +If some values in @racket[xs] are less than the smallest bound, they are grouped into a single bin in +front. +If some are greater than the largest bound, they are grouped into a single bin at the end. + +@examples[#:eval typed-eval + (bin-samples '(0 1 2 3 4 5 6) '() <=) + (bin-samples '(0 1 2 3 4 5 6) '(3) <=) + (bin-samples '(0 1 2 3 4 5 6) '(2 4) <=)] + +Note that @racket[bin-samples] always returns bins with @racket[#f] weights, or bins containing +unweighted samples. +} + +@defproc*[([(bin-weighted-samples [xs (Sequenceof A)] + [ws (Sequenceof Real)] + [bounds (Sequenceof A)] + [lte? (A A -> Any)]) + (Listof (sample-bin A A))] + [(bin-weighted-samples [xs (Sequenceof A)] + [ws (Sequenceof Real)] + [bounds (Sequenceof B)] + [lte? (B B -> Any)] + [key (A -> B)]) + (Listof (sample-bin A B))])]{ +Like @racket[bin-samples], but for weighted samples. +} + +@defproc[(sample-bin-compact [bin (sample-bin A B)]) (sample-bin A B)]{ +Compacts @racket[bin] by applying @racket[count-samples] to its values and weights. +@examples[#:eval typed-eval + (sample-bin-compact (sample-bin '(1 2 3 4 4) #f 1 4))] +} + +@defproc[(sample-bin-total [bin (sample-bin A B)]) Nonnegative-Real]{ +If @racket[(sample-bin-weights bin)] is @racket[#f], returns the number of samples in @racket[bin]; +otherwise, returns the sum of their weights. +@examples[#:eval typed-eval + (sample-bin-total (sample-bin '(1 2 3 4 4) #f 1 4)) + (sample-bin-total (sample-bin-compact (sample-bin '(1 2 3 4 4) #f 1 4)))] } @section[#:tag "stats:expected-values"]{Expected Values} diff --git a/collects/math/statistics.rkt b/collects/math/statistics.rkt index 895606b834..363fd3aa23 100644 --- a/collects/math/statistics.rkt +++ b/collects/math/statistics.rkt @@ -1,4 +1,4 @@ -#lang typed/racket/base +#lang racket/base (require "private/statistics/statistics-struct.rkt" "private/statistics/expected-values.rkt" diff --git a/collects/math/tests/statistics-tests.rkt b/collects/math/tests/statistics-tests.rkt new file mode 100644 index 0000000000..b0816b2d6d --- /dev/null +++ b/collects/math/tests/statistics-tests.rkt @@ -0,0 +1,94 @@ +#lang typed/racket + +(require math/statistics + typed/rackunit) + +(check-equal? + (bin-samples '() '(5) <=) + empty) + +(check-equal? + (bin-samples '(4) '(5) <=) + (list (sample-bin '(4) #f 4 5))) + +(check-equal? + (bin-samples '(5) '(5) <=) + (list (sample-bin '(5) #f 5 5))) + +(check-equal? + (bin-samples '(6) '(5) <=) + (list (sample-bin '(6) #f 5 6))) + +(check-equal? + (bin-samples '(4 5) '(5) <=) + (list (sample-bin '(4 5) #f 4 5))) + +(check-equal? + (bin-samples '(5 6) '(5) <=) + (list (sample-bin '(5) #f 5 5) + (sample-bin '(6) #f 5 6))) + +(check-equal? + (bin-samples '(4 6) '(5) <=) + (list (sample-bin '(4) #f 4 5) + (sample-bin '(6) #f 5 6))) + +(check-equal? + (bin-samples '(4 5 6) '(5) <=) + (list (sample-bin '(4 5) #f 4 5) + (sample-bin '(6) #f 5 6))) + +(check-equal? + (bin-samples '() '(4 8) <=) + (list (sample-bin '() #f 4 8))) + +(check-equal? + (bin-samples '(2) '(4 8) <=) + (list (sample-bin '(2) #f 2 4) + (sample-bin '() #f 4 8))) + +(check-equal? + (bin-samples '(4) '(4 8) <=) + (list (sample-bin '(4) #f 4 4) + (sample-bin '() #f 4 8))) + +(check-equal? + (bin-samples '(6) '(4 8) <=) + (list (sample-bin '(6) #f 4 8))) + +(check-equal? + (bin-samples '(8) '(4 8) <=) + (list (sample-bin '(8) #f 4 8))) + +(check-equal? + (bin-samples '(10) '(4 8) <=) + (list (sample-bin '() #f 4 8) + (sample-bin '(10) #f 8 10))) + +(check-equal? + (bin-samples '(4 8) '(4 8) <=) + (list (sample-bin '(4) #f 4 4) + (sample-bin '(8) #f 4 8))) + +(check-equal? + (bin-samples '(4 10) '(4 8) <=) + (list (sample-bin '(4) #f 4 4) + (sample-bin '() #f 4 8) + (sample-bin '(10) #f 8 10))) + +(check-equal? + (bin-samples '(8 10) '(4 8) <=) + (list (sample-bin '(8) #f 4 8) + (sample-bin '(10) #f 8 10))) + +(check-equal? + (bin-samples '(4 8 10) '(4 8) <=) + (list (sample-bin '(4) #f 4 4) + (sample-bin '(8) #f 4 8) + (sample-bin '(10) #f 8 10))) + +(check-equal? + (bin-samples '(1 1 2 2 2 3 4 5 5 5 5 6 7 8 9 9) '(3 8) <=) + (list (sample-bin '(1 1 2 2 2 3) #f 1 3) + (sample-bin '(4 5 5 5 5 6 7 8) #f 3 8) + (sample-bin '(9 9) #f 8 9)))