random-sample: use reservoir sampling to do a single pass over sequences.

Suggested by Eli.
This commit is contained in:
Vincent St-Amour 2016-01-05 14:30:34 -06:00
parent 31a9414983
commit 656044b8fc
2 changed files with 49 additions and 23 deletions

View File

@ -3,7 +3,7 @@
(Section 'numbers)
(require racket/extflonum racket/random)
(require racket/extflonum racket/random racket/list)
(test #f number? 'a)
(test #f complex? 'a)
@ -2514,15 +2514,21 @@
(parameterize ([current-pseudo-random-generator
(vector->pseudo-random-generator
#(3620087466 1904163406 3177592043 1406334318 257151704 3090455638))])
(test 3 random-ref '(1 2 3 4 5))
(test 10 random-ref '#(7 6 8 9 10))
(test #\e random-ref "abcde"))
(test 1 random-ref '(1 2 3 4 5))
(test 6 random-ref '#(7 6 8 9 10))
(test #\a random-ref "abcde"))
(parameterize ([current-pseudo-random-generator
(vector->pseudo-random-generator
#(3620087466 1904163406 3177592043 1406334318 257151704 3090455638))])
(test '(3) random-sample '(1 2 3 4 5) 1)
(test '(1) random-sample '(1 2 3 4 5) 1)
(test '(5 5 5) random-sample '(1 2 3 4 5) 3)
(test '(2 4 5) random-sample '(1 2 3 4 5) 3 #:replacement? #f))
(test '(1 4 3) random-sample '(1 2 3 4 5) 3 #:replacement? #f)
;; distribution is uniform
(test '(100077 100479 100375 99943 99869 100055 100482 99979 99405 99336)
values ; to avoid the whole pre-`length` list being printed if test fails
(map length (group-by values
(apply append (for/list ([i 10000])
(random-sample (range 10) 100)))))))
(test #t = 0 0)

View File

@ -20,23 +20,43 @@
(current-continuation-marks)))]))
(define (random-ref seq [prng (current-pseudo-random-generator)])
(sequence-ref seq (random (sequence-length seq))))
(car (random-sample seq 1 prng)))
(define (random-sample seq n [prng (current-pseudo-random-generator)]
#:replacement? [replacement? #t])
(cond [replacement?
(for/list ([i (in-range n)])
(random-ref seq prng))]
[else
(unless (>= (sequence-length seq) n)
(raise-argument-error 'random-sample
"integer less than sequence length"
n))
(define l (sequence-length seq))
;; sequences don't necessarily support removal, so instead sample
;; indices without replacement, then index into the sequence
(let loop ([res-idx (set)])
(cond [(= (set-count res-idx) n) ; we have all we need, we're done
(for/list ([i (in-set res-idx)]) (sequence-ref seq i))]
[else
(loop (set-add res-idx (random l)))]))]))
;; doing reservoir sampling, to do a single pass over the sequence
;; (some sequences may not like multiple passes)
(cond
[(not replacement?)
;; Based on: http://rosettacode.org/wiki/Knuth's_algorithm_S#Racket
(define not-there (gensym))
(define samples (make-vector n not-there))
(for ([elt seq]
[i (in-naturals)])
(cond [(< i n) ; we're not full, sample for sure
(vector-set! samples i elt)]
[(< (random (add1 i)) n) ; we've already seen n items; replace one?
(vector-set! samples (random n) elt)]))
;; did we get enough?
(unless (for/and ([s (in-vector samples)])
(not (eq? s not-there)))
(raise-argument-error 'random-sample
"integer less than sequence length"
n))
(vector->list samples)]
[else
;; similar to above, except each sample is independent
(define samples #f)
(for ([elt seq]
[i (in-naturals)])
(cond [(= i 0) ; initialize samples
(set! samples (make-vector n elt))]
[else ; independently, maybe replace
(for ([j (in-range n)])
(when (zero? (random (add1 i)))
(vector-set! samples j elt)))]))
(unless samples
(raise-argument-error 'random-sample
"non-empty sequence"
seq))
(vector->list samples)]))