Propagated Eli's changes to reversecomplement and fasta to the typed

versions.
This commit is contained in:
Vincent St-Amour 2010-06-21 17:54:24 -04:00
parent 4f501726fb
commit c9383cffc2
2 changed files with 203 additions and 145 deletions

View File

@ -3,113 +3,139 @@
;;
;; fasta - benchmark
;;
;; Derived from the Chicken variant, which was
;; Contributed by Anthony Borla
(require racket/cmdline)
(define-type CumulativeTable (Listof (Pair Natural Float)))
;; Very loosely based on the Chicken variant by Anthony Borla, some
;; optimizations taken from the GCC version by Petr Prokhorenkov, and
;; additional heavy optimizations by Eli Barzilay (not really related to
;; the above two now).
;;
;; If you use some of these optimizations in other solutions, please
;; include a proper attribution to this Racket code.
(define +alu+
(bytes-append
#"GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG"
#"GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA"
#"CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT"
#"ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA"
#"GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG"
#"AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC"
#"AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA"))
(bytes-append #"GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG"
#"GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA"
#"CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT"
#"ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA"
#"GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG"
#"AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC"
#"AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA"))
(define +iub+
(list
'(#\a . 0.27) '(#\c . 0.12) '(#\g . 0.12) '(#\t . 0.27) '(#\B . 0.02)
'(#\D . 0.02) '(#\H . 0.02) '(#\K . 0.02) '(#\M . 0.02) '(#\N . 0.02)
'(#\R . 0.02) '(#\S . 0.02) '(#\V . 0.02) '(#\W . 0.02) '(#\Y . 0.02)))
(define-type Table (Listof (List Char Float)))
(define +homosapien+
(list
'(#\a . 0.3029549426680) '(#\c . 0.1979883004921)
'(#\g . 0.1975473066391) '(#\t . 0.3015094502008)))
(define IUB
'([#\a 0.27] [#\c 0.12] [#\g 0.12] [#\t 0.27] [#\B 0.02]
[#\D 0.02] [#\H 0.02] [#\K 0.02] [#\M 0.02] [#\N 0.02]
[#\R 0.02] [#\S 0.02] [#\V 0.02] [#\W 0.02] [#\Y 0.02]))
;; -------------
(define HOMOSAPIEN
'([#\a 0.3029549426680] [#\c 0.1979883004921]
[#\g 0.1975473066391] [#\t 0.3015094502008]))
(define +line-size+ 60)
(define line-length 60)
;; -------------------------------
;; ----------------------------------------
(: make-random (Integer -> (Real -> Real)))
(define (make-random seed)
(let* ((ia 3877) (ic 29573) (im 139968) (last seed))
(lambda: ((max : Real))
(set! last (modulo (+ ic (* last ia)) im))
(/ (* max last) im))))
(require racket/cmdline racket/require (for-syntax racket/base)
(filtered-in (lambda (name) (regexp-replace #rx"unsafe-" name ""))
racket/unsafe/ops))
;; -------------------------------
;; ----------------------------------------
(: make-cumulative-table ((Listof (Pair Char Float)) -> CumulativeTable))
(define (make-cumulative-table frequency-table)
(let ([cumulative 0.0])
(for/list: : CumulativeTable
([x : (Pair Char Float) frequency-table])
(set! cumulative (+ cumulative (cdr x)))
(cons (char->integer (car x)) cumulative))))
(: repeat-fasta (String Integer Bytes -> Void))
(define (repeat-fasta header N sequence)
(define out (current-output-port))
(define len (bytes-length sequence))
(define buf (make-bytes (+ len line-length)))
(bytes-copy! buf 0 sequence)
(bytes-copy! buf len sequence 0 line-length)
(display header out)
(let: loop : Void ([n : Integer N] [start : Integer 0])
(when (fx> n 0)
(let ([end (fx+ start (fxmin n line-length))])
(write-bytes buf out start end)
(newline)
(loop (fx- n line-length) (if (fx> end len) (fx- end len) end))))))
;; -------------
;; ----------------------------------------
(define random-next (make-random 42))
(define +segmarker+ ">")
(define IA 3877)
(define IC 29573)
(define IM 139968)
(define IM.0 (fx->fl IM))
;; -------------
(define-syntax-rule (define/IM (name id) E)
(begin (define V
(let: ([v : (Vectorof Integer) (make-vector IM)])
(for ([id (in-range IM)]) (vector-set! v id E))
v))
(define-syntax-rule (name id) (vector-ref V id))))
(: select-random (CumulativeTable -> Natural))
(define (select-random cumulative-table)
(let ((rvalue (random-next 1.0)))
(let select-over-threshold ([table cumulative-table])
(if (<= rvalue (cdar table))
(caar table)
(select-over-threshold (cdr table))))))
(define/IM (random-next cur) (fxmodulo (fx+ IC (fx* cur IA)) IM))
;; -------------
(: make-lookup-table (Table -> Bytes))
(define (make-lookup-table frequency-table)
(define v (make-bytes IM))
(let: loop : Void ([t : Table frequency-table] [c : Integer 0] [c. : Float 0.0])
(unless (null? t)
(let* ([c1. (fl+ c. (fl* IM.0 (cadar t)))]
[c1 (assert (inexact->exact (flceiling c1.)) exact-integer?)]
[b (char->integer (caar t))])
(for ([i (in-range c c1)]) (bytes-set! v i b))
(loop (cdr t) c1 c1.))))
v)
(: repeat-fasta (String String Integer Bytes Integer -> Void))
(define (repeat-fasta id desc n_ sequence line-length)
(let ((seqlen (bytes-length sequence))
(out (current-output-port)))
(display (string-append +segmarker+ id " " desc "\n") out)
(let: loop-o : Void
((n : Integer n_) (k : Integer 0))
(unless (<= n 0)
(let ((m (min n line-length)))
(let loop-i ((i 0) (k k))
(if (>= i m)
(begin
(newline out)
(loop-o (- n line-length) k))
(let ([k (if (= k seqlen) 0 k)])
(write-byte (bytes-ref sequence k) out)
(loop-i (add1 i) (add1 k))))))))))
(: random-fasta (String Integer Table Integer -> Integer))
(define (random-fasta header N table R)
(define out (current-output-port))
(define lookup-byte (make-lookup-table table))
(: n-randoms (Integer Integer -> Integer))
(define (n-randoms to R)
(let loop ([n 0] [R R])
(if (fx< n to)
(let ([R (random-next R)])
(bytes-set! buf n (bytes-ref lookup-byte R))
(loop (fx+ n 1) R))
(begin (write-bytes buf out 0 (fx+ to 1)) R))))
(: make-line! (Bytes Integer Integer -> Integer))
(define (make-line! buf start R)
(let ([end (fx+ start line-length)])
(bytes-set! buf end LF)
(let loop ([n start] [R R])
(if (fx< n end)
(let ([R (random-next R)])
(bytes-set! buf n (bytes-ref lookup-byte R))
(loop (fx+ n 1) R))
R))))
(: LF Integer)
(define LF (char->integer #\newline))
(: buf Bytes)
(define buf (make-bytes (fx+ line-length 1)))
(: full-lines Integer)
(: last Integer)
(define-values (full-lines last) (quotient/remainder N line-length))
(: C Bytes)
(define C
(let* ([len+1 (fx+ line-length 1)]
[buflen (fx* len+1 IM)]
[buf (make-bytes buflen)])
(let loop ([R R] [i 0])
(if (fx< i buflen)
(loop (make-line! buf i R) (fx+ i len+1))
buf))))
(bytes-set! buf line-length LF)
(display header out)
(let loop ([i full-lines] [R R])
(if (fx> i IM)
(begin (display C out) (loop (fx- i IM) R))
(let loop ([i i] [R R])
(cond [(fx> i 0) (loop (fx- i 1) (n-randoms line-length R))]
[(fx> last 0) (bytes-set! buf last LF) (n-randoms last R)]
[else R])))))
;; -------------
;; ----------------------------------------
(: random-fasta (String String Integer CumulativeTable Integer -> Void))
(define (random-fasta id desc n_ cumulative-table line-length)
(let ((out (current-output-port)))
(display (string-append +segmarker+ id " " desc "\n") out)
(let: loop-o : Void ((n : Integer n_))
(unless (<= n 0)
(for ([i (in-range (min n line-length))])
(write-byte (select-random cumulative-table) out))
(newline out)
(loop-o (- n line-length))))))
;; -------------------------------
(let ((n (command-line #:args (n) (assert (string->number (assert n string?)) exact-integer?))))
(repeat-fasta "ONE" "Homo sapiens alu" (* n 2) +alu+ +line-size+)
(random-fasta "TWO" "IUB ambiguity codes" (* n 3)
(make-cumulative-table +iub+) +line-size+)
(random-fasta "THREE" "Homo sapiens frequency" (* n 5)
(make-cumulative-table +homosapien+) +line-size+))
(let ([n (command-line #:args (n) (assert (string->number (assert n string?)) exact-integer?))])
(repeat-fasta ">ONE Homo sapiens alu\n" (* n 2) +alu+)
(random-fasta ">THREE Homo sapiens frequency\n" (* n 5) HOMOSAPIEN
(random-fasta ">TWO IUB ambiguity codes\n" (* n 3) IUB 42))
(void))

View File

@ -1,60 +1,92 @@
;; The Computer Language Benchmarks Game
;; http://shootout.alioth.debian.org/
(require scheme/cmdline)
(define translation (make-vector 128))
(for: : Void
([from-to : (List Symbol Symbol)
'([a t]
[c g]
[g c]
[t a]
[u a]
[m k]
[r y]
[w w]
[s s]
[y R]
[k M]
[v b]
[h d]
[d h]
[b v]
[n n])])
(let ([char (lambda: ((sym : Symbol))
(string-ref (symbol->string sym) 0))])
(let ([from (char (car from-to))]
[to (char->integer (char-upcase (char (cadr from-to))))])
(vector-set! translation (char->integer from) to)
(vector-set! translation (char->integer (char-upcase from)) to))))
(for: ([from : Char (in-string "ACGTUMRWSYKVHDBN")]
[to : Char (in-string "TGCAAKYWSRMBDHVN")])
(let ([to (char->integer to)])
(vector-set! translation (char->integer from) to)
(vector-set! translation (char->integer (char-downcase from)) to)))
(: output ((Listof Bytes) -> Void))
(define (output lines)
(let*: ([str : Bytes (apply bytes-append lines)]
[o : Output-Port (current-output-port)]
[len : Natural (bytes-length str)])
(for: : Void
([offset : Natural (in-range 0 len 60)])
(write-bytes str o offset (min len (+ offset 60)))
(newline o))))
(define I (current-input-port))
(define O (current-output-port))
(let ([in (current-input-port)])
(let: loop : Void ([accum : (Listof Bytes) null])
(let ([l (read-bytes-line in)])
(if (eof-object? l)
(output accum)
(cond
[(regexp-match? #rx#"^>" l)
(output accum)
(printf "~a\n" l)
(loop null)]
[else
(let* ([len (bytes-length l)]
[dest (make-bytes len)])
(for ([i (in-range len)])
(bytes-set! dest
(- (- len i) 1)
(vector-ref translation (bytes-ref l i))))
(loop (cons dest accum)))])))))
(define marker (char->integer #\>))
(require racket/require (for-syntax racket/base)
(filtered-in (lambda (name) (regexp-replace #rx"unsafe-" name ""))
racket/unsafe/ops))
(define line-length 60)
(define buf-size (* 64 1024))
(define out-buf ; so there's always enough room for newlines
(make-bytes (+ buf-size 1 (quotient buf-size line-length))))
(define LF (char->integer #\newline))
#|
The basic idea is to read the input in chunks, and keep pointers to
them, then on output process each chunk to translate and reverse it
before dumping it out.
|#
(define-type Chunk (Vector Integer Integer Bytes))
(: output ((Listof Chunk) -> Void))
(define (output chunks)
(let: loop : Void
([chunks : (Listof Chunk) chunks] [col : Integer line-length])
(when (pair? chunks)
(let ([chunk (car chunks)])
(let: ([start : Integer (vector-ref chunk 0)]
[end : Integer (vector-ref chunk 1)]
[in-buf : Bytes (vector-ref chunk 2)])
(let: chunk-loop : Void
([i : Integer end] [j : Integer 0] [col : Integer col])
(if (fx> i start)
(let* ([i (fx- i 1)] [b (bytes-ref in-buf i)])
(if (fx= b LF)
(chunk-loop i j col)
(let ([b (vector-ref translation b)])
(if (fx= 0 col)
(begin (bytes-set! out-buf j LF)
(bytes-set! out-buf (fx+ j 1) b)
(chunk-loop i (fx+ j 2) (fx- line-length 1)))
(begin (bytes-set! out-buf j b)
(chunk-loop i (fx+ j 1) (fx- col 1)))))))
(begin (write-bytes out-buf O 0 j)
(loop (cdr chunks) col))))))))
(newline O))
(define-syntax case-regexp-posns
(syntax-rules (=> else)
[(_ rx buf start [id B1 ...] [else B2 ...])
(let ([m (regexp-match-positions rx buf start)])
(if m (let ([id (assert (car m))]) B1 ...) (begin B2 ...)))]))
(let ([m (regexp-match #rx"^([^\n]+)\n" I)]) (display (car (assert m))))
(let: loop : Void
([buf : (U Bytes EOF) (read-bytes buf-size I)]
[start : Integer 0]
[chunks : (Listof Chunk) '()])
(if (eof-object? buf)
(begin (output chunks) (void))
(case-regexp-posns #rx">" buf start
[p1 (output (cons (ann (vector start (car p1) buf)
(Vector Integer Integer Bytes))
chunks))
(case-regexp-posns #rx"\n" buf (cdr p1)
[p2 (write-bytes buf O (car p1) (cdr p2))
(loop buf (cdr p2) '())]
[else (write-bytes buf O (car p1))
(let header-loop ()
(let ([buf (assert (read-bytes buf-size I) bytes?)])
(case-regexp-posns #rx"\n" buf 0
[p2 (write-bytes buf O 0 (cdr p2))
(loop buf (cdr p2) '())]
[else (write-bytes buf O) (header-loop)])))])]
[else (loop (read-bytes buf-size I) 0
(cons (ann (vector start (bytes-length buf) buf)
(Vector Integer Integer Bytes))
chunks))])))