107 lines
4.2 KiB
Racket
107 lines
4.2 KiB
Racket
#lang racket
|
|
;;;
|
|
;;; This module implements Chi-squared tests for goodness of fit and
|
|
;;; independence of categories.
|
|
;;;
|
|
|
|
(require racket/flonum
|
|
(planet williams/science:4:5/science)
|
|
(only-in (planet williams/science/unsafe-ops-utils)
|
|
real->float)
|
|
plot)
|
|
|
|
(provide
|
|
(contract-out
|
|
(chi^2-goodness-of-fit-test
|
|
(sequence? sequence? natural-number/c . -> . (values real? real?)))))
|
|
|
|
; (chi^2-goodness-of-fit-test observed expected df)
|
|
; Given: observed, a sequence of observed frequencies
|
|
; expected, a sequence of expected frequencies
|
|
; df, the degrees of freedom
|
|
; Result: X^2 = sum ((observed - expected)^2 / expected), known as the Pearson cumulative test statistic
|
|
; P-value = 1-chi^2cdf(X^2,df) , the p-value obtained by comparing the test statistic to a chi-squared distribution
|
|
(define (chi^2-goodness-of-fit-test observed expected df)
|
|
(define (flsqr x) (fl* x x))
|
|
(define X^2 (for/sum ([o observed] [e expected])
|
|
(let ([o (real->float o)]
|
|
[e (real->float e)])
|
|
(fl/ (flsqr (fl- o e))
|
|
e))))
|
|
(define P-value (fl- 1.0 (chi-squared-cdf X^2 df)))
|
|
(values X^2 P-value))
|
|
|
|
(define (graphical-chi^2-goodness-of-fit-test observed expected df)
|
|
(let-values ([(X^2 P-value) (chi^2-goodness-of-fit-test observed expected df)])
|
|
(let ([x-max (* 1.50 X^2)])
|
|
(plot (list (function (λ (x) (chi-squared-pdf x df)) 0.0 X^2 #:color 'red )
|
|
(function-interval (λ (x) (chi-squared-pdf x df)) #:line1-color 'red
|
|
(λ (x) 0.0)
|
|
X^2 x-max)
|
|
(points '(#(0.0 0.4)) #:size 0 #:label (format "X^2 = ~a" X^2) )
|
|
(points '(#(0.0 0.35)) #:size 0 #:label (format "P-value = ~a" P-value) )
|
|
(lines `(#(,X^2 0.0) #(,X^2 ,(chi-squared-pdf X^2 df))) #:color 'black)
|
|
)
|
|
#:x-min 0.0
|
|
#:x-max x-max
|
|
#:y-min 0.0
|
|
#:y-max 0.5))))
|
|
|
|
(define (chi^2-independence-test observed-matrix)
|
|
; observed-matrix is a sequence of rows.
|
|
; a row is a sequnce of natural numbers.
|
|
(let* ([nrows (sequence-length observed-matrix)]
|
|
[ncols (sequence-length (sequence-ref observed-matrix 0))]
|
|
[rows observed-matrix]
|
|
[row-sums (for/vector ([row rows])
|
|
(for/sum ([o row])
|
|
o))]
|
|
[col-sums (for/vector ([j ncols])
|
|
(for/sum ([row rows])
|
|
(sequence-ref row j)))]
|
|
[N (real->float (for/sum ([s (in-vector row-sums)]) s))]
|
|
[expected (for*/vector #:length (* nrows ncols)
|
|
([i nrows] [j ncols])
|
|
(fl/ (fl* (real->float (vector-ref row-sums i))
|
|
(real->float (vector-ref col-sums j)))
|
|
N))]
|
|
[expected-matrix (for/vector #:length nrows ([i nrows])
|
|
(for/vector #:length ncols ([j ncols])
|
|
(vector-ref expected (+ (* i ncols) j))))]
|
|
[observed (for*/vector #:length (* nrows ncols)
|
|
([row rows] [o row])
|
|
o)]
|
|
[df (* (- nrows 1) (- ncols 1))])
|
|
(let-values ([(X^2 P-value) (chi^2-goodness-of-fit-test observed expected df)])
|
|
(values X^2 P-value df expected-matrix))))
|
|
|
|
|
|
|
|
(module* test #f
|
|
; From http://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test
|
|
; X^2 = 1.22
|
|
; P-value ~ 0.23
|
|
(displayln "Test 1")
|
|
(define observed '(44 56))
|
|
(define expected '(50 50))
|
|
(display (graphical-chi^2-goodness-of-fit-test observed expected 1))
|
|
(newline)
|
|
(chi^2-goodness-of-fit-test observed expected 1)
|
|
|
|
; - - - - - -
|
|
|
|
; Expected outcome
|
|
; X^2 = 10.527145672037388
|
|
; P-value = 0.005176775866863048
|
|
; df = 2
|
|
; Expected =
|
|
; '#(#(33.536842105263155 20.46315789473684)
|
|
; #(9.936842105263159 6.063157894736842)
|
|
; #(15.526315789473685 9.473684210526315))
|
|
(displayln "\n\nTest 2")
|
|
(chi^2-independence-test #( #(40 14)
|
|
#(10 6)
|
|
#(9 16))))
|
|
|
|
|