scribble-math/statistics/chi-squared-test.rkt
Jens Axel Søgaard 0718e558d8 Added comments
2012-06-20 20:10:15 +02:00

107 lines
4.2 KiB
Racket

#lang racket
;;;
;;; This module implements Chi-squared tests for goodness of fit and
;;; independence of categories.
;;;
(require racket/flonum
(planet williams/science:4:5/science)
(only-in (planet williams/science/unsafe-ops-utils)
real->float)
plot)
(provide
(contract-out
(chi^2-goodness-of-fit-test
(sequence? sequence? natural-number/c . -> . (values real? real?)))))
; (chi^2-goodness-of-fit-test observed expected df)
; Given: observed, a sequence of observed frequencies
; expected, a sequence of expected frequencies
; df, the degrees of freedom
; Result: X^2 = sum ((observed - expected)^2 / expected), known as the Pearson cumulative test statistic
; P-value = 1-chi^2cdf(X^2,df) , the p-value obtained by comparing the test statistic to a chi-squared distribution
(define (chi^2-goodness-of-fit-test observed expected df)
(define (flsqr x) (fl* x x))
(define X^2 (for/sum ([o observed] [e expected])
(let ([o (real->float o)]
[e (real->float e)])
(fl/ (flsqr (fl- o e))
e))))
(define P-value (fl- 1.0 (chi-squared-cdf X^2 df)))
(values X^2 P-value))
(define (graphical-chi^2-goodness-of-fit-test observed expected df)
(let-values ([(X^2 P-value) (chi^2-goodness-of-fit-test observed expected df)])
(let ([x-max (* 1.50 X^2)])
(plot (list (function (λ (x) (chi-squared-pdf x df)) 0.0 X^2 #:color 'red )
(function-interval (λ (x) (chi-squared-pdf x df)) #:line1-color 'red
(λ (x) 0.0)
X^2 x-max)
(points '(#(0.0 0.4)) #:size 0 #:label (format "X^2 = ~a" X^2) )
(points '(#(0.0 0.35)) #:size 0 #:label (format "P-value = ~a" P-value) )
(lines `(#(,X^2 0.0) #(,X^2 ,(chi-squared-pdf X^2 df))) #:color 'black)
)
#:x-min 0.0
#:x-max x-max
#:y-min 0.0
#:y-max 0.5))))
(define (chi^2-independence-test observed-matrix)
; observed-matrix is a sequence of rows.
; a row is a sequnce of natural numbers.
(let* ([nrows (sequence-length observed-matrix)]
[ncols (sequence-length (sequence-ref observed-matrix 0))]
[rows observed-matrix]
[row-sums (for/vector ([row rows])
(for/sum ([o row])
o))]
[col-sums (for/vector ([j ncols])
(for/sum ([row rows])
(sequence-ref row j)))]
[N (real->float (for/sum ([s (in-vector row-sums)]) s))]
[expected (for*/vector #:length (* nrows ncols)
([i nrows] [j ncols])
(fl/ (fl* (real->float (vector-ref row-sums i))
(real->float (vector-ref col-sums j)))
N))]
[expected-matrix (for/vector #:length nrows ([i nrows])
(for/vector #:length ncols ([j ncols])
(vector-ref expected (+ (* i ncols) j))))]
[observed (for*/vector #:length (* nrows ncols)
([row rows] [o row])
o)]
[df (* (- nrows 1) (- ncols 1))])
(let-values ([(X^2 P-value) (chi^2-goodness-of-fit-test observed expected df)])
(values X^2 P-value df expected-matrix))))
(module* test #f
; From http://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test
; X^2 = 1.22
; P-value ~ 0.23
(displayln "Test 1")
(define observed '(44 56))
(define expected '(50 50))
(display (graphical-chi^2-goodness-of-fit-test observed expected 1))
(newline)
(chi^2-goodness-of-fit-test observed expected 1)
; - - - - - -
; Expected outcome
; X^2 = 10.527145672037388
; P-value = 0.005176775866863048
; df = 2
; Expected =
; '#(#(33.536842105263155 20.46315789473684)
; #(9.936842105263159 6.063157894736842)
; #(15.526315789473685 9.473684210526315))
(displayln "\n\nTest 2")
(chi^2-independence-test #( #(40 14)
#(10 6)
#(9 16))))