Finished and documented counting and binning functions in `math/statistics'

This commit is contained in:
Neil Toronto 2012-12-08 22:11:12 -07:00
parent 6babc9ec56
commit 73395eed94
4 changed files with 322 additions and 49 deletions

View File

@ -3,54 +3,162 @@
(require racket/list
racket/sequence
"../../base.rkt"
"../unsafe.rkt")
"../unsafe.rkt"
"statistics-utils.rkt")
(provide samples->hash
count-samples
(struct-out sample-bin)
Real-Bin
bin-real-samples)
sample-bin-compact
sample-bin-total
bin-samples
bin-weighted-samples)
(struct: (A) sample-bin ([min : A] [max : A] [values : (Listof A)]) #:transparent)
;; ===================================================================================================
;; Hashing
(define-type Real-Bin (sample-bin Real))
(: samples->hash (All (A) ((Sequenceof A) -> (HashTable A Positive-Integer))))
(define (samples->hash xs)
(: unweighted-samples->hash (All (A) ((Sequenceof A) -> (HashTable A Positive-Integer))))
(define (unweighted-samples->hash xs)
(define: h : (HashTable A Positive-Integer) (make-hash))
(for: ([x : A xs])
(hash-set! h x (unsafe-fx+ 1 (hash-ref h x (λ () 0)))))
h)
(: count-samples (All (A) ((Sequenceof A) -> (Values (Listof A) (Listof Positive-Integer)))))
(define (count-samples xs)
(define h (samples->hash xs))
(define xws (hash-map h (λ: ([x : A] [c : Positive-Integer]) (cons x c))))
(values (map (λ: ([xw : (Pair A Positive-Integer)]) (car xw)) xws)
(map (λ: ([xw : (Pair A Positive-Integer)]) (cdr xw)) xws)))
(: weighted-samples->hash (All (A) ((Sequenceof A) (Sequenceof Real)
-> (HashTable A Nonnegative-Real))))
(define (weighted-samples->hash xs ws)
(let-values ([(xs ws) (sequences->weighted-samples 'samples->hash xs ws)])
(define: h : (HashTable A Nonnegative-Real) (make-hash))
(for: ([x : A xs] [w : Nonnegative-Real ws])
(hash-set! h x (+ w (hash-ref h x (λ () 0)))))
h))
(: bin-real-samples ((Sequenceof Real) (Sequenceof Real) -> (Listof Real-Bin)))
(define (bin-real-samples bin-bounds xs)
(let* ([bin-bounds (list* -inf.0 +inf.0 (sequence->list bin-bounds))]
[bin-bounds (filter (λ: ([x : Real]) (not (eqv? x +nan.0)))
(remove-duplicates bin-bounds))]
[bin-bounds (sort bin-bounds <)]
[x-min (first bin-bounds)]
[x-max (last bin-bounds)]
[xs (sequence->list xs)]
[xs (filter (λ: ([x : Real]) (<= x-min x x-max)) xs)]
[xs (sort xs <)])
(define-values (res rest-xs)
(for/fold: ([res : (Listof Real-Bin) empty]
[xs : (Listof Real) xs]
) ([x1 (in-list bin-bounds)]
[x2 (in-list (rest bin-bounds))])
(define-values (lst rest-xs)
(let: loop : (Values (Listof Real) (Listof Real)) ([lst : (Listof Real) empty]
[xs : (Listof Real) xs])
(if (and (not (empty? xs)) (<= x1 (first xs) x2))
(loop (cons (first xs) lst) (rest xs))
(values lst xs))))
(cond [(empty? lst) (values res rest-xs)]
[else (values (cons (sample-bin x1 x2 (reverse lst)) res) rest-xs)])))
(reverse res)))
(: samples->hash
(All (A) (case-> ((Sequenceof A) -> (HashTable A Positive-Integer))
((Sequenceof A) (U #f (Sequenceof Real)) -> (HashTable A Nonnegative-Real)))))
(define samples->hash
(case-lambda
[(xs) (unweighted-samples->hash xs)]
[(xs ws) (cond [ws (weighted-samples->hash xs ws)]
[else (weighted-samples->hash xs (sequence-map (λ (_) 1) xs))])]))
;; ===================================================================================================
;; Counting
(: count-unweighted-samples
(All (A) ((Sequenceof A) -> (Values (Listof A) (Listof Positive-Integer)))))
(define (count-unweighted-samples xs)
(let ([xs (sequence->list xs)])
(define h (samples->hash xs))
(let ([xs (remove-duplicates xs)])
(values xs (map (λ: ([x : A]) (hash-ref h x)) xs)))))
(: count-weighted-samples
(All (A) ((Sequenceof A) (Sequenceof Real) -> (Values (Listof A) (Listof Nonnegative-Real)))))
(define (count-weighted-samples xs ws)
(let-values ([(xs ws) (sequences->weighted-samples 'count-samples xs ws)])
(define h (weighted-samples->hash xs ws))
(let ([xs (remove-duplicates xs)])
(values xs (map (λ: ([x : A]) (hash-ref h x)) xs)))))
(: count-samples
(All (A) (case-> ((Sequenceof A) -> (Values (Listof A) (Listof Positive-Integer)))
((Sequenceof A) (U #f (Sequenceof Real))
-> (Values (Listof A) (Listof Nonnegative-Real))))))
(define count-samples
(case-lambda
[(xs) (count-unweighted-samples xs)]
[(xs ws) (if ws (count-weighted-samples xs ws) (count-unweighted-samples xs))]))
;; ===================================================================================================
;; Bins
(struct: (A B) sample-bin
([values : (Listof A)] [weights : (U #f (Listof Nonnegative-Real))] [min : B] [max : B])
#:transparent)
(: sample-bin-compact (All (A B) ((sample-bin A B) -> (sample-bin A B))))
(define (sample-bin-compact bin)
(let-values ([(xs ws) (count-samples (sample-bin-values bin) (sample-bin-weights bin))])
(sample-bin xs ws (sample-bin-min bin) (sample-bin-max bin))))
(: sample-bin-total (All (A B) ((sample-bin A B) -> Nonnegative-Real)))
(define (sample-bin-total bin)
(define ws (sample-bin-weights bin))
(if ws (assert (sum ws) nonnegative?) (length (sample-bin-values bin))))
;; ===================================================================================================
;; Binning
(: list-split-after (All (A) ((Listof A) (A -> Any) -> (Values (Listof A) (Listof A)))))
(define (list-split-after xs pred?)
(let: loop : (Values (Listof A) (Listof A)) ([xs : (Listof A) xs]
[ys : (Listof A) empty])
(cond [(empty? xs) (values (reverse ys) xs)]
[else
(define x (first xs))
(cond [(pred? x) (loop (rest xs) (cons x ys))]
[else (values (reverse ys) xs)])])))
(: bin-samples
(All (A B)
(case-> ((Sequenceof A) (Sequenceof A) (A A -> Any) -> (Listof (sample-bin A A)))
((Sequenceof A) (Sequenceof B) (B B -> Any) (A -> B) -> (Listof (sample-bin A B))))))
(define bin-samples
(case-lambda
[(xs bnds lte?) (bin-samples xs bnds lte? (λ: ([x : A]) x))]
[(xs bnds lte? key)
(let* ([bnds (sort (sequence->list bnds) (λ: ([b1 : B] [b2 : B]) (and (lte? b1 b2) #t)))]
[xs (sequence->list xs)]
[xks (map (λ: ([x : A]) (cons x (key x))) xs)]
[xks (sort xks (λ: ([xk1 : (Pair A B)] [xk2 : (Pair A B)])
(and (lte? (cdr xk1) (cdr xk2)) #t)))])
(cond
[(empty? bnds)
(cond [(empty? xks) empty]
[else (define min (cdr (first xks)))
(define max (cdr (last xks)))
(list (sample-bin (map (inst car A B) xks) #f min max))])]
[else
(let: loop : (Listof (sample-bin A B)) ([min : (U #f B) #f]
[max : B (first bnds)]
[bnds : (Listof B) (rest bnds)]
[xks : (Listof (Pair A B)) xks]
[bins : (Listof (sample-bin A B)) empty])
(let-values ([(yks xks) (list-split-after xks (λ: ([xk : (Pair A B)])
(lte? (cdr xk) max)))])
(define maybe-bin
(cond [min (list (sample-bin (map (inst car A B) yks) #f min max))]
[(empty? yks) empty]
[else (list (sample-bin (map (inst car A B) yks) #f (cdr (first yks)) max))]))
(cond [(empty? bnds)
(cond [(empty? xks) (reverse (append maybe-bin bins))]
[else
(define bin2
(sample-bin (map (inst car A B) xks) #f max (cdr (last xks))))
(reverse (append (cons bin2 maybe-bin) bins))])]
[else
(loop max (first bnds) (rest bnds) xks (append maybe-bin bins))])))]))]))
(: bin-weighted-samples
(All (A B) (case-> ((Sequenceof A) (U #f (Sequenceof Real)) (Sequenceof A) (A A -> Any)
-> (Listof (sample-bin A A)))
((Sequenceof A) (U #f (Sequenceof Real)) (Sequenceof B) (B B -> Any) (A -> B)
-> (Listof (sample-bin A B))))))
(define bin-weighted-samples
(case-lambda
[(xs ws bnds lte?)
(cond [ws (bin-weighted-samples xs ws bnds lte? (λ: ([x : A]) x))]
[else (bin-samples xs bnds lte?)])]
[(xs ws bnds lte? key)
(cond [ws (let-values ([(xs ws) (sequences->weighted-samples 'bin-samples xs ws)])
(define xws (map (inst cons A Nonnegative-Real) xs ws))
(define xw-key (λ: ([xw : (Pair A Nonnegative-Real)]) (key (car xw))))
(map (λ: ([bin : (sample-bin (Pair A Nonnegative-Real) B)])
(define xws (sample-bin-values bin))
(sample-bin (map (inst car A Nonnegative-Real) xws)
(map (inst cdr A Nonnegative-Real) xws)
(sample-bin-min bin)
(sample-bin-max bin)))
(bin-samples xws bnds lte? xw-key)))]
[else (bin-samples xs bnds lte? key)])]))

View File

@ -7,7 +7,7 @@
(only-in typed/racket/base
Flonum Real Boolean Any Listof Integer case-> -> U
Sequenceof Positive-Flonum Nonnegative-Flonum
Nonnegative-Real))
HashTable Positive-Integer Nonnegative-Real Values))
"utils.rkt")
@(define typed-eval (make-math-eval))
@ -18,12 +18,13 @@
@defmodule[math/statistics]
This module exports functions that compute summary values for collections of data, or
This module exports functions that compute summary values for collections of samples, or
@deftech{statistics}, such as means, standard devations, medians, and @italic{k}th order
statistics. It also exports functions for managing collections of sample values.
statistics.
It also exports functions for managing collections of samples.
Most of the functions that compute statistics also accept a sequence of nonnegative reals
that correspond one-to-one with the data values.
that correspond one-to-one with samples.
These are used as weights; equivalently counts, pseudocounts or proportions.
While this makes it easy to work with weighted samples, it introduces some subtleties
in bias correction.
@ -32,14 +33,84 @@ See @secref{stats:expected-values} for a discussion.
@local-table-of-contents[]
@section{Counting}
@section{Counting and Binning}
@defthing[samples->hash Any]{
This stub represents forthcoming documentation.
@defproc*[([(samples->hash [xs (Sequenceof A)]) (HashTable A Positive-Integer)]
[(samples->hash [xs (Sequenceof A)] [ws (U #f (Sequenceof Real))])
(HashTable A Nonnegative-Real)])]{
@examples[#:eval typed-eval
(samples->hash '(1 2 3 4 4))
(samples->hash '(1 1 2 3 4) '(1/2 1/2 1 1 2))]
}
@defthing[count-samples Any]{
This stub represents forthcoming documentation.
@defproc*[([(count-samples [xs (Sequenceof A)]) (Values (Listof A) (Listof Positive-Integer))]
[(count-samples [xs (Sequenceof A)] [ws (U #f (Sequenceof Real))])
(Values (Listof A) (Listof Nonnegative-Real))])]{
@examples[#:eval typed-eval
(count-samples '(1 2 3 4 4))
(count-samples '(1 1 2 3 4) '(1/2 1/2 1 1 2))]
}
@defstruct*[sample-bin ([values (Listof A)]
[weights (U #f (Listof Nonnegative-Real))]
[min B] [max B])]{
Represents a @italic{bin}, or a group of samples within an interval in a total order.
The values and bounds have a different type to allow @racket[bin-samples] and
@racket[bin-weighted-samples] to group elements based on a function of their values (a @racket[key],
like in @racket[sort]).
}
@defproc*[([(bin-samples [xs (Sequenceof A)] [bounds (Sequenceof A)] [lte? (A A -> Any)])
(Listof (sample-bin A A))]
[(bin-samples [xs (Sequenceof A)]
[bounds (Sequenceof B)]
[lte? (B B -> Any)]
[key (A -> B)])
(Listof (sample-bin A B))])]{
Like @racket[(sort xs lte? #:key key)], but additionally groups samples into bins.
Keys are always cached, and @racket[bounds] is sorted before binning.
If @racket[n = (length bounds)], then @racket[bin-samples] returns @italic{at least} @racket[(- n 1)]
bins, one for each pair of adjacent (sorted) bounds.
If some values in @racket[xs] are less than the smallest bound, they are grouped into a single bin in
front.
If some are greater than the largest bound, they are grouped into a single bin at the end.
@examples[#:eval typed-eval
(bin-samples '(0 1 2 3 4 5 6) '() <=)
(bin-samples '(0 1 2 3 4 5 6) '(3) <=)
(bin-samples '(0 1 2 3 4 5 6) '(2 4) <=)]
Note that @racket[bin-samples] always returns bins with @racket[#f] weights, or bins containing
unweighted samples.
}
@defproc*[([(bin-weighted-samples [xs (Sequenceof A)]
[ws (Sequenceof Real)]
[bounds (Sequenceof A)]
[lte? (A A -> Any)])
(Listof (sample-bin A A))]
[(bin-weighted-samples [xs (Sequenceof A)]
[ws (Sequenceof Real)]
[bounds (Sequenceof B)]
[lte? (B B -> Any)]
[key (A -> B)])
(Listof (sample-bin A B))])]{
Like @racket[bin-samples], but for weighted samples.
}
@defproc[(sample-bin-compact [bin (sample-bin A B)]) (sample-bin A B)]{
Compacts @racket[bin] by applying @racket[count-samples] to its values and weights.
@examples[#:eval typed-eval
(sample-bin-compact (sample-bin '(1 2 3 4 4) #f 1 4))]
}
@defproc[(sample-bin-total [bin (sample-bin A B)]) Nonnegative-Real]{
If @racket[(sample-bin-weights bin)] is @racket[#f], returns the number of samples in @racket[bin];
otherwise, returns the sum of their weights.
@examples[#:eval typed-eval
(sample-bin-total (sample-bin '(1 2 3 4 4) #f 1 4))
(sample-bin-total (sample-bin-compact (sample-bin '(1 2 3 4 4) #f 1 4)))]
}
@section[#:tag "stats:expected-values"]{Expected Values}

View File

@ -1,4 +1,4 @@
#lang typed/racket/base
#lang racket/base
(require "private/statistics/statistics-struct.rkt"
"private/statistics/expected-values.rkt"

View File

@ -0,0 +1,94 @@
#lang typed/racket
(require math/statistics
typed/rackunit)
(check-equal?
(bin-samples '() '(5) <=)
empty)
(check-equal?
(bin-samples '(4) '(5) <=)
(list (sample-bin '(4) #f 4 5)))
(check-equal?
(bin-samples '(5) '(5) <=)
(list (sample-bin '(5) #f 5 5)))
(check-equal?
(bin-samples '(6) '(5) <=)
(list (sample-bin '(6) #f 5 6)))
(check-equal?
(bin-samples '(4 5) '(5) <=)
(list (sample-bin '(4 5) #f 4 5)))
(check-equal?
(bin-samples '(5 6) '(5) <=)
(list (sample-bin '(5) #f 5 5)
(sample-bin '(6) #f 5 6)))
(check-equal?
(bin-samples '(4 6) '(5) <=)
(list (sample-bin '(4) #f 4 5)
(sample-bin '(6) #f 5 6)))
(check-equal?
(bin-samples '(4 5 6) '(5) <=)
(list (sample-bin '(4 5) #f 4 5)
(sample-bin '(6) #f 5 6)))
(check-equal?
(bin-samples '() '(4 8) <=)
(list (sample-bin '() #f 4 8)))
(check-equal?
(bin-samples '(2) '(4 8) <=)
(list (sample-bin '(2) #f 2 4)
(sample-bin '() #f 4 8)))
(check-equal?
(bin-samples '(4) '(4 8) <=)
(list (sample-bin '(4) #f 4 4)
(sample-bin '() #f 4 8)))
(check-equal?
(bin-samples '(6) '(4 8) <=)
(list (sample-bin '(6) #f 4 8)))
(check-equal?
(bin-samples '(8) '(4 8) <=)
(list (sample-bin '(8) #f 4 8)))
(check-equal?
(bin-samples '(10) '(4 8) <=)
(list (sample-bin '() #f 4 8)
(sample-bin '(10) #f 8 10)))
(check-equal?
(bin-samples '(4 8) '(4 8) <=)
(list (sample-bin '(4) #f 4 4)
(sample-bin '(8) #f 4 8)))
(check-equal?
(bin-samples '(4 10) '(4 8) <=)
(list (sample-bin '(4) #f 4 4)
(sample-bin '() #f 4 8)
(sample-bin '(10) #f 8 10)))
(check-equal?
(bin-samples '(8 10) '(4 8) <=)
(list (sample-bin '(8) #f 4 8)
(sample-bin '(10) #f 8 10)))
(check-equal?
(bin-samples '(4 8 10) '(4 8) <=)
(list (sample-bin '(4) #f 4 4)
(sample-bin '(8) #f 4 8)
(sample-bin '(10) #f 8 10)))
(check-equal?
(bin-samples '(1 1 2 2 2 3 4 5 5 5 5 6 7 8 9 9) '(3 8) <=)
(list (sample-bin '(1 1 2 2 2 3) #f 1 3)
(sample-bin '(4 5 5 5 5 6 7 8) #f 3 8)
(sample-bin '(9 9) #f 8 9)))