90 lines
2.7 KiB
Scheme
90 lines
2.7 KiB
Scheme
;; ---------------------------------------------------------------------
|
|
;; The Great Computer Language Shootout
|
|
;; http://shootout.alioth.debian.org/
|
|
;;
|
|
;; Tested with PCRE [compiler must be built with PCRE already installed
|
|
;; else other regex routines (with different behaviours) will be used].
|
|
;; Regex performance appears reasonable, but file loading [of 'large'
|
|
;; files] performance requires tweaking to effect a significant improvement.
|
|
;;
|
|
;; Contributed by Anthony Borla
|
|
;; ---------------------------------------------------------------------
|
|
|
|
(module regexpdna mzscheme
|
|
|
|
(require mzlib/port)
|
|
|
|
;; -------------------------------
|
|
|
|
(define VARIANTS
|
|
'(#"agggtaaa|tttaccct" #"[cgt]gggtaaa|tttaccc[acg]" #"a[act]ggtaaa|tttacc[agt]t"
|
|
#"ag[act]gtaaa|tttac[agt]ct" #"agg[act]taaa|ttta[agt]cct" #"aggg[acg]aaa|ttt[cgt]ccct"
|
|
#"agggt[cgt]aa|tt[acg]accct" #"agggta[cgt]a|t[acg]taccct" #"agggtaa[cgt]|[acg]ttaccct"))
|
|
|
|
|
|
(define IUBS
|
|
'((#"B" #"(c|g|t)") (#"D" #"(a|g|t)") (#"H" #"(a|c|t)")
|
|
(#"K" #"(g|t)") (#"M" #"(a|c)") (#"N" #"(a|c|g|t)")
|
|
(#"R" #"(a|g)") (#"S" #"(c|g)") (#"V" #"(a|c|g)")
|
|
(#"W" #"(a|t)") (#"Y" #"(c|t)")))
|
|
|
|
;; -------------------------------
|
|
|
|
(define (ci-byte-regexp s)
|
|
(byte-regexp (bytes-append #"(?i:" s #")")))
|
|
|
|
;; -------------------------------
|
|
|
|
(define (match-count str rx offset cnt)
|
|
(let ([m (regexp-match-positions rx str offset)])
|
|
(if m
|
|
(match-count str rx (cdar m) (add1 cnt))
|
|
cnt)))
|
|
|
|
;; --------------
|
|
|
|
(define (replace-all rx str new)
|
|
(let ([out (open-output-bytes)])
|
|
(let loop ([pos 0])
|
|
(let ([m (regexp-match-positions rx str pos)])
|
|
(if m
|
|
(begin
|
|
(write-bytes str out pos (caar m))
|
|
(write-bytes new out)
|
|
(loop (cdar m)))
|
|
(write-bytes str out pos))))
|
|
(get-output-bytes out)))
|
|
|
|
;; -------------------------------
|
|
|
|
(define (input->bytes)
|
|
(let ([b (open-output-bytes)])
|
|
(copy-port (current-input-port) b)
|
|
(get-output-bytes b)))
|
|
|
|
;; -------------------------------
|
|
|
|
;; Load sequence and record its length
|
|
(let* ([orig (input->bytes)]
|
|
[filtered (replace-all #rx#"(>.*?\n)|\n" orig #"")])
|
|
|
|
;; Perform regexp counts
|
|
(for-each
|
|
(lambda (i)
|
|
(printf "~a ~a\n" i (match-count filtered (ci-byte-regexp i) 0 0)))
|
|
VARIANTS)
|
|
|
|
;; Perform regexp replacements, and record sequence length
|
|
(let ([replaced
|
|
(let loop ([sequence filtered]
|
|
[IUBS IUBS])
|
|
(if (null? IUBS)
|
|
sequence
|
|
(loop (replace-all (byte-regexp (caar IUBS)) sequence (cadar IUBS))
|
|
(cdr IUBS))))])
|
|
;; Print statistics
|
|
(printf "~%~A~%~A~%~A~%"
|
|
(bytes-length orig)
|
|
(bytes-length filtered)
|
|
(bytes-length replaced)))))
|