Finished and documented counting and binning functions in `math/statistics'
This commit is contained in:
parent
6babc9ec56
commit
73395eed94
|
@ -3,54 +3,162 @@
|
|||
(require racket/list
|
||||
racket/sequence
|
||||
"../../base.rkt"
|
||||
"../unsafe.rkt")
|
||||
"../unsafe.rkt"
|
||||
"statistics-utils.rkt")
|
||||
|
||||
(provide samples->hash
|
||||
count-samples
|
||||
(struct-out sample-bin)
|
||||
Real-Bin
|
||||
bin-real-samples)
|
||||
sample-bin-compact
|
||||
sample-bin-total
|
||||
bin-samples
|
||||
bin-weighted-samples)
|
||||
|
||||
(struct: (A) sample-bin ([min : A] [max : A] [values : (Listof A)]) #:transparent)
|
||||
;; ===================================================================================================
|
||||
;; Hashing
|
||||
|
||||
(define-type Real-Bin (sample-bin Real))
|
||||
|
||||
(: samples->hash (All (A) ((Sequenceof A) -> (HashTable A Positive-Integer))))
|
||||
(define (samples->hash xs)
|
||||
(: unweighted-samples->hash (All (A) ((Sequenceof A) -> (HashTable A Positive-Integer))))
|
||||
(define (unweighted-samples->hash xs)
|
||||
(define: h : (HashTable A Positive-Integer) (make-hash))
|
||||
(for: ([x : A xs])
|
||||
(hash-set! h x (unsafe-fx+ 1 (hash-ref h x (λ () 0)))))
|
||||
h)
|
||||
|
||||
(: count-samples (All (A) ((Sequenceof A) -> (Values (Listof A) (Listof Positive-Integer)))))
|
||||
(define (count-samples xs)
|
||||
(define h (samples->hash xs))
|
||||
(define xws (hash-map h (λ: ([x : A] [c : Positive-Integer]) (cons x c))))
|
||||
(values (map (λ: ([xw : (Pair A Positive-Integer)]) (car xw)) xws)
|
||||
(map (λ: ([xw : (Pair A Positive-Integer)]) (cdr xw)) xws)))
|
||||
(: weighted-samples->hash (All (A) ((Sequenceof A) (Sequenceof Real)
|
||||
-> (HashTable A Nonnegative-Real))))
|
||||
(define (weighted-samples->hash xs ws)
|
||||
(let-values ([(xs ws) (sequences->weighted-samples 'samples->hash xs ws)])
|
||||
(define: h : (HashTable A Nonnegative-Real) (make-hash))
|
||||
(for: ([x : A xs] [w : Nonnegative-Real ws])
|
||||
(hash-set! h x (+ w (hash-ref h x (λ () 0)))))
|
||||
h))
|
||||
|
||||
(: bin-real-samples ((Sequenceof Real) (Sequenceof Real) -> (Listof Real-Bin)))
|
||||
(define (bin-real-samples bin-bounds xs)
|
||||
(let* ([bin-bounds (list* -inf.0 +inf.0 (sequence->list bin-bounds))]
|
||||
[bin-bounds (filter (λ: ([x : Real]) (not (eqv? x +nan.0)))
|
||||
(remove-duplicates bin-bounds))]
|
||||
[bin-bounds (sort bin-bounds <)]
|
||||
[x-min (first bin-bounds)]
|
||||
[x-max (last bin-bounds)]
|
||||
[xs (sequence->list xs)]
|
||||
[xs (filter (λ: ([x : Real]) (<= x-min x x-max)) xs)]
|
||||
[xs (sort xs <)])
|
||||
(define-values (res rest-xs)
|
||||
(for/fold: ([res : (Listof Real-Bin) empty]
|
||||
[xs : (Listof Real) xs]
|
||||
) ([x1 (in-list bin-bounds)]
|
||||
[x2 (in-list (rest bin-bounds))])
|
||||
(define-values (lst rest-xs)
|
||||
(let: loop : (Values (Listof Real) (Listof Real)) ([lst : (Listof Real) empty]
|
||||
[xs : (Listof Real) xs])
|
||||
(if (and (not (empty? xs)) (<= x1 (first xs) x2))
|
||||
(loop (cons (first xs) lst) (rest xs))
|
||||
(values lst xs))))
|
||||
(cond [(empty? lst) (values res rest-xs)]
|
||||
[else (values (cons (sample-bin x1 x2 (reverse lst)) res) rest-xs)])))
|
||||
(reverse res)))
|
||||
(: samples->hash
|
||||
(All (A) (case-> ((Sequenceof A) -> (HashTable A Positive-Integer))
|
||||
((Sequenceof A) (U #f (Sequenceof Real)) -> (HashTable A Nonnegative-Real)))))
|
||||
(define samples->hash
|
||||
(case-lambda
|
||||
[(xs) (unweighted-samples->hash xs)]
|
||||
[(xs ws) (cond [ws (weighted-samples->hash xs ws)]
|
||||
[else (weighted-samples->hash xs (sequence-map (λ (_) 1) xs))])]))
|
||||
|
||||
;; ===================================================================================================
|
||||
;; Counting
|
||||
|
||||
(: count-unweighted-samples
|
||||
(All (A) ((Sequenceof A) -> (Values (Listof A) (Listof Positive-Integer)))))
|
||||
(define (count-unweighted-samples xs)
|
||||
(let ([xs (sequence->list xs)])
|
||||
(define h (samples->hash xs))
|
||||
(let ([xs (remove-duplicates xs)])
|
||||
(values xs (map (λ: ([x : A]) (hash-ref h x)) xs)))))
|
||||
|
||||
(: count-weighted-samples
|
||||
(All (A) ((Sequenceof A) (Sequenceof Real) -> (Values (Listof A) (Listof Nonnegative-Real)))))
|
||||
(define (count-weighted-samples xs ws)
|
||||
(let-values ([(xs ws) (sequences->weighted-samples 'count-samples xs ws)])
|
||||
(define h (weighted-samples->hash xs ws))
|
||||
(let ([xs (remove-duplicates xs)])
|
||||
(values xs (map (λ: ([x : A]) (hash-ref h x)) xs)))))
|
||||
|
||||
(: count-samples
|
||||
(All (A) (case-> ((Sequenceof A) -> (Values (Listof A) (Listof Positive-Integer)))
|
||||
((Sequenceof A) (U #f (Sequenceof Real))
|
||||
-> (Values (Listof A) (Listof Nonnegative-Real))))))
|
||||
(define count-samples
|
||||
(case-lambda
|
||||
[(xs) (count-unweighted-samples xs)]
|
||||
[(xs ws) (if ws (count-weighted-samples xs ws) (count-unweighted-samples xs))]))
|
||||
|
||||
;; ===================================================================================================
|
||||
;; Bins
|
||||
|
||||
(struct: (A B) sample-bin
|
||||
([values : (Listof A)] [weights : (U #f (Listof Nonnegative-Real))] [min : B] [max : B])
|
||||
#:transparent)
|
||||
|
||||
(: sample-bin-compact (All (A B) ((sample-bin A B) -> (sample-bin A B))))
|
||||
(define (sample-bin-compact bin)
|
||||
(let-values ([(xs ws) (count-samples (sample-bin-values bin) (sample-bin-weights bin))])
|
||||
(sample-bin xs ws (sample-bin-min bin) (sample-bin-max bin))))
|
||||
|
||||
(: sample-bin-total (All (A B) ((sample-bin A B) -> Nonnegative-Real)))
|
||||
(define (sample-bin-total bin)
|
||||
(define ws (sample-bin-weights bin))
|
||||
(if ws (assert (sum ws) nonnegative?) (length (sample-bin-values bin))))
|
||||
|
||||
;; ===================================================================================================
|
||||
;; Binning
|
||||
|
||||
(: list-split-after (All (A) ((Listof A) (A -> Any) -> (Values (Listof A) (Listof A)))))
|
||||
(define (list-split-after xs pred?)
|
||||
(let: loop : (Values (Listof A) (Listof A)) ([xs : (Listof A) xs]
|
||||
[ys : (Listof A) empty])
|
||||
(cond [(empty? xs) (values (reverse ys) xs)]
|
||||
[else
|
||||
(define x (first xs))
|
||||
(cond [(pred? x) (loop (rest xs) (cons x ys))]
|
||||
[else (values (reverse ys) xs)])])))
|
||||
|
||||
(: bin-samples
|
||||
(All (A B)
|
||||
(case-> ((Sequenceof A) (Sequenceof A) (A A -> Any) -> (Listof (sample-bin A A)))
|
||||
((Sequenceof A) (Sequenceof B) (B B -> Any) (A -> B) -> (Listof (sample-bin A B))))))
|
||||
(define bin-samples
|
||||
(case-lambda
|
||||
[(xs bnds lte?) (bin-samples xs bnds lte? (λ: ([x : A]) x))]
|
||||
[(xs bnds lte? key)
|
||||
(let* ([bnds (sort (sequence->list bnds) (λ: ([b1 : B] [b2 : B]) (and (lte? b1 b2) #t)))]
|
||||
[xs (sequence->list xs)]
|
||||
[xks (map (λ: ([x : A]) (cons x (key x))) xs)]
|
||||
[xks (sort xks (λ: ([xk1 : (Pair A B)] [xk2 : (Pair A B)])
|
||||
(and (lte? (cdr xk1) (cdr xk2)) #t)))])
|
||||
(cond
|
||||
[(empty? bnds)
|
||||
(cond [(empty? xks) empty]
|
||||
[else (define min (cdr (first xks)))
|
||||
(define max (cdr (last xks)))
|
||||
(list (sample-bin (map (inst car A B) xks) #f min max))])]
|
||||
[else
|
||||
(let: loop : (Listof (sample-bin A B)) ([min : (U #f B) #f]
|
||||
[max : B (first bnds)]
|
||||
[bnds : (Listof B) (rest bnds)]
|
||||
[xks : (Listof (Pair A B)) xks]
|
||||
[bins : (Listof (sample-bin A B)) empty])
|
||||
(let-values ([(yks xks) (list-split-after xks (λ: ([xk : (Pair A B)])
|
||||
(lte? (cdr xk) max)))])
|
||||
(define maybe-bin
|
||||
(cond [min (list (sample-bin (map (inst car A B) yks) #f min max))]
|
||||
[(empty? yks) empty]
|
||||
[else (list (sample-bin (map (inst car A B) yks) #f (cdr (first yks)) max))]))
|
||||
(cond [(empty? bnds)
|
||||
(cond [(empty? xks) (reverse (append maybe-bin bins))]
|
||||
[else
|
||||
(define bin2
|
||||
(sample-bin (map (inst car A B) xks) #f max (cdr (last xks))))
|
||||
(reverse (append (cons bin2 maybe-bin) bins))])]
|
||||
[else
|
||||
(loop max (first bnds) (rest bnds) xks (append maybe-bin bins))])))]))]))
|
||||
|
||||
(: bin-weighted-samples
|
||||
(All (A B) (case-> ((Sequenceof A) (U #f (Sequenceof Real)) (Sequenceof A) (A A -> Any)
|
||||
-> (Listof (sample-bin A A)))
|
||||
((Sequenceof A) (U #f (Sequenceof Real)) (Sequenceof B) (B B -> Any) (A -> B)
|
||||
-> (Listof (sample-bin A B))))))
|
||||
(define bin-weighted-samples
|
||||
(case-lambda
|
||||
[(xs ws bnds lte?)
|
||||
(cond [ws (bin-weighted-samples xs ws bnds lte? (λ: ([x : A]) x))]
|
||||
[else (bin-samples xs bnds lte?)])]
|
||||
[(xs ws bnds lte? key)
|
||||
(cond [ws (let-values ([(xs ws) (sequences->weighted-samples 'bin-samples xs ws)])
|
||||
(define xws (map (inst cons A Nonnegative-Real) xs ws))
|
||||
(define xw-key (λ: ([xw : (Pair A Nonnegative-Real)]) (key (car xw))))
|
||||
(map (λ: ([bin : (sample-bin (Pair A Nonnegative-Real) B)])
|
||||
(define xws (sample-bin-values bin))
|
||||
(sample-bin (map (inst car A Nonnegative-Real) xws)
|
||||
(map (inst cdr A Nonnegative-Real) xws)
|
||||
(sample-bin-min bin)
|
||||
(sample-bin-max bin)))
|
||||
(bin-samples xws bnds lte? xw-key)))]
|
||||
[else (bin-samples xs bnds lte? key)])]))
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
(only-in typed/racket/base
|
||||
Flonum Real Boolean Any Listof Integer case-> -> U
|
||||
Sequenceof Positive-Flonum Nonnegative-Flonum
|
||||
Nonnegative-Real))
|
||||
HashTable Positive-Integer Nonnegative-Real Values))
|
||||
"utils.rkt")
|
||||
|
||||
@(define typed-eval (make-math-eval))
|
||||
|
@ -18,12 +18,13 @@
|
|||
|
||||
@defmodule[math/statistics]
|
||||
|
||||
This module exports functions that compute summary values for collections of data, or
|
||||
This module exports functions that compute summary values for collections of samples, or
|
||||
@deftech{statistics}, such as means, standard devations, medians, and @italic{k}th order
|
||||
statistics. It also exports functions for managing collections of sample values.
|
||||
statistics.
|
||||
It also exports functions for managing collections of samples.
|
||||
|
||||
Most of the functions that compute statistics also accept a sequence of nonnegative reals
|
||||
that correspond one-to-one with the data values.
|
||||
that correspond one-to-one with samples.
|
||||
These are used as weights; equivalently counts, pseudocounts or proportions.
|
||||
While this makes it easy to work with weighted samples, it introduces some subtleties
|
||||
in bias correction.
|
||||
|
@ -32,14 +33,84 @@ See @secref{stats:expected-values} for a discussion.
|
|||
|
||||
@local-table-of-contents[]
|
||||
|
||||
@section{Counting}
|
||||
@section{Counting and Binning}
|
||||
|
||||
@defthing[samples->hash Any]{
|
||||
This stub represents forthcoming documentation.
|
||||
@defproc*[([(samples->hash [xs (Sequenceof A)]) (HashTable A Positive-Integer)]
|
||||
[(samples->hash [xs (Sequenceof A)] [ws (U #f (Sequenceof Real))])
|
||||
(HashTable A Nonnegative-Real)])]{
|
||||
@examples[#:eval typed-eval
|
||||
(samples->hash '(1 2 3 4 4))
|
||||
(samples->hash '(1 1 2 3 4) '(1/2 1/2 1 1 2))]
|
||||
}
|
||||
|
||||
@defthing[count-samples Any]{
|
||||
This stub represents forthcoming documentation.
|
||||
@defproc*[([(count-samples [xs (Sequenceof A)]) (Values (Listof A) (Listof Positive-Integer))]
|
||||
[(count-samples [xs (Sequenceof A)] [ws (U #f (Sequenceof Real))])
|
||||
(Values (Listof A) (Listof Nonnegative-Real))])]{
|
||||
@examples[#:eval typed-eval
|
||||
(count-samples '(1 2 3 4 4))
|
||||
(count-samples '(1 1 2 3 4) '(1/2 1/2 1 1 2))]
|
||||
}
|
||||
|
||||
@defstruct*[sample-bin ([values (Listof A)]
|
||||
[weights (U #f (Listof Nonnegative-Real))]
|
||||
[min B] [max B])]{
|
||||
Represents a @italic{bin}, or a group of samples within an interval in a total order.
|
||||
The values and bounds have a different type to allow @racket[bin-samples] and
|
||||
@racket[bin-weighted-samples] to group elements based on a function of their values (a @racket[key],
|
||||
like in @racket[sort]).
|
||||
}
|
||||
|
||||
@defproc*[([(bin-samples [xs (Sequenceof A)] [bounds (Sequenceof A)] [lte? (A A -> Any)])
|
||||
(Listof (sample-bin A A))]
|
||||
[(bin-samples [xs (Sequenceof A)]
|
||||
[bounds (Sequenceof B)]
|
||||
[lte? (B B -> Any)]
|
||||
[key (A -> B)])
|
||||
(Listof (sample-bin A B))])]{
|
||||
Like @racket[(sort xs lte? #:key key)], but additionally groups samples into bins.
|
||||
Keys are always cached, and @racket[bounds] is sorted before binning.
|
||||
|
||||
If @racket[n = (length bounds)], then @racket[bin-samples] returns @italic{at least} @racket[(- n 1)]
|
||||
bins, one for each pair of adjacent (sorted) bounds.
|
||||
If some values in @racket[xs] are less than the smallest bound, they are grouped into a single bin in
|
||||
front.
|
||||
If some are greater than the largest bound, they are grouped into a single bin at the end.
|
||||
|
||||
@examples[#:eval typed-eval
|
||||
(bin-samples '(0 1 2 3 4 5 6) '() <=)
|
||||
(bin-samples '(0 1 2 3 4 5 6) '(3) <=)
|
||||
(bin-samples '(0 1 2 3 4 5 6) '(2 4) <=)]
|
||||
|
||||
Note that @racket[bin-samples] always returns bins with @racket[#f] weights, or bins containing
|
||||
unweighted samples.
|
||||
}
|
||||
|
||||
@defproc*[([(bin-weighted-samples [xs (Sequenceof A)]
|
||||
[ws (Sequenceof Real)]
|
||||
[bounds (Sequenceof A)]
|
||||
[lte? (A A -> Any)])
|
||||
(Listof (sample-bin A A))]
|
||||
[(bin-weighted-samples [xs (Sequenceof A)]
|
||||
[ws (Sequenceof Real)]
|
||||
[bounds (Sequenceof B)]
|
||||
[lte? (B B -> Any)]
|
||||
[key (A -> B)])
|
||||
(Listof (sample-bin A B))])]{
|
||||
Like @racket[bin-samples], but for weighted samples.
|
||||
}
|
||||
|
||||
@defproc[(sample-bin-compact [bin (sample-bin A B)]) (sample-bin A B)]{
|
||||
Compacts @racket[bin] by applying @racket[count-samples] to its values and weights.
|
||||
@examples[#:eval typed-eval
|
||||
(sample-bin-compact (sample-bin '(1 2 3 4 4) #f 1 4))]
|
||||
}
|
||||
|
||||
@defproc[(sample-bin-total [bin (sample-bin A B)]) Nonnegative-Real]{
|
||||
If @racket[(sample-bin-weights bin)] is @racket[#f], returns the number of samples in @racket[bin];
|
||||
otherwise, returns the sum of their weights.
|
||||
@examples[#:eval typed-eval
|
||||
(sample-bin-total (sample-bin '(1 2 3 4 4) #f 1 4))
|
||||
(sample-bin-total (sample-bin-compact (sample-bin '(1 2 3 4 4) #f 1 4)))]
|
||||
}
|
||||
|
||||
@section[#:tag "stats:expected-values"]{Expected Values}
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
#lang typed/racket/base
|
||||
#lang racket/base
|
||||
|
||||
(require "private/statistics/statistics-struct.rkt"
|
||||
"private/statistics/expected-values.rkt"
|
||||
|
|
94
collects/math/tests/statistics-tests.rkt
Normal file
94
collects/math/tests/statistics-tests.rkt
Normal file
|
@ -0,0 +1,94 @@
|
|||
#lang typed/racket
|
||||
|
||||
(require math/statistics
|
||||
typed/rackunit)
|
||||
|
||||
(check-equal?
|
||||
(bin-samples '() '(5) <=)
|
||||
empty)
|
||||
|
||||
(check-equal?
|
||||
(bin-samples '(4) '(5) <=)
|
||||
(list (sample-bin '(4) #f 4 5)))
|
||||
|
||||
(check-equal?
|
||||
(bin-samples '(5) '(5) <=)
|
||||
(list (sample-bin '(5) #f 5 5)))
|
||||
|
||||
(check-equal?
|
||||
(bin-samples '(6) '(5) <=)
|
||||
(list (sample-bin '(6) #f 5 6)))
|
||||
|
||||
(check-equal?
|
||||
(bin-samples '(4 5) '(5) <=)
|
||||
(list (sample-bin '(4 5) #f 4 5)))
|
||||
|
||||
(check-equal?
|
||||
(bin-samples '(5 6) '(5) <=)
|
||||
(list (sample-bin '(5) #f 5 5)
|
||||
(sample-bin '(6) #f 5 6)))
|
||||
|
||||
(check-equal?
|
||||
(bin-samples '(4 6) '(5) <=)
|
||||
(list (sample-bin '(4) #f 4 5)
|
||||
(sample-bin '(6) #f 5 6)))
|
||||
|
||||
(check-equal?
|
||||
(bin-samples '(4 5 6) '(5) <=)
|
||||
(list (sample-bin '(4 5) #f 4 5)
|
||||
(sample-bin '(6) #f 5 6)))
|
||||
|
||||
(check-equal?
|
||||
(bin-samples '() '(4 8) <=)
|
||||
(list (sample-bin '() #f 4 8)))
|
||||
|
||||
(check-equal?
|
||||
(bin-samples '(2) '(4 8) <=)
|
||||
(list (sample-bin '(2) #f 2 4)
|
||||
(sample-bin '() #f 4 8)))
|
||||
|
||||
(check-equal?
|
||||
(bin-samples '(4) '(4 8) <=)
|
||||
(list (sample-bin '(4) #f 4 4)
|
||||
(sample-bin '() #f 4 8)))
|
||||
|
||||
(check-equal?
|
||||
(bin-samples '(6) '(4 8) <=)
|
||||
(list (sample-bin '(6) #f 4 8)))
|
||||
|
||||
(check-equal?
|
||||
(bin-samples '(8) '(4 8) <=)
|
||||
(list (sample-bin '(8) #f 4 8)))
|
||||
|
||||
(check-equal?
|
||||
(bin-samples '(10) '(4 8) <=)
|
||||
(list (sample-bin '() #f 4 8)
|
||||
(sample-bin '(10) #f 8 10)))
|
||||
|
||||
(check-equal?
|
||||
(bin-samples '(4 8) '(4 8) <=)
|
||||
(list (sample-bin '(4) #f 4 4)
|
||||
(sample-bin '(8) #f 4 8)))
|
||||
|
||||
(check-equal?
|
||||
(bin-samples '(4 10) '(4 8) <=)
|
||||
(list (sample-bin '(4) #f 4 4)
|
||||
(sample-bin '() #f 4 8)
|
||||
(sample-bin '(10) #f 8 10)))
|
||||
|
||||
(check-equal?
|
||||
(bin-samples '(8 10) '(4 8) <=)
|
||||
(list (sample-bin '(8) #f 4 8)
|
||||
(sample-bin '(10) #f 8 10)))
|
||||
|
||||
(check-equal?
|
||||
(bin-samples '(4 8 10) '(4 8) <=)
|
||||
(list (sample-bin '(4) #f 4 4)
|
||||
(sample-bin '(8) #f 4 8)
|
||||
(sample-bin '(10) #f 8 10)))
|
||||
|
||||
(check-equal?
|
||||
(bin-samples '(1 1 2 2 2 3 4 5 5 5 5 6 7 8 9 9) '(3 8) <=)
|
||||
(list (sample-bin '(1 1 2 2 2 3) #f 1 3)
|
||||
(sample-bin '(4 5 5 5 5 6 7 8) #f 3 8)
|
||||
(sample-bin '(9 9) #f 8 9)))
|
Loading…
Reference in New Issue
Block a user