added make-regexp-tweaker (not used yet), and some code reorganization

svn: r12573
This commit is contained in:
Eli Barzilay 2008-11-23 00:17:39 +00:00
parent 62a8873198
commit ecb39eedee

View File

@ -32,21 +32,7 @@
s)))))))
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Regexp helpers
(define (bstring-length s)
(if (bytes? s) (bytes-length s) (string-length s)))
(define (bstring->regexp name pattern)
(cond [(regexp? pattern) pattern]
[(byte-regexp? pattern) pattern]
[(string? pattern) (regexp pattern)]
[(bytes? pattern) (byte-regexp pattern)]
[else (raise-type-error
name "regexp, byte regexp, string, or byte string" pattern)]))
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Regexp helpers
;; Regexp utilities
(define regexp-quote-chars:s #rx"[][.*?+|(){}\\$^]")
(define regexp-quote-chars:b #rx#"[][.*?+|(){}\\$^]")
@ -69,6 +55,33 @@
[else (raise-type-error 'regexp-replace-quote
"string or byte string" s)]))
(define (make-regexp-tweaker tweaker)
(let ([t (make-weak-hasheq)])
(lambda (rx)
(define-syntax-rule (->str x) (if (bytes? x) (bytes->string/utf-8 x) x))
(define-syntax-rule (->bts x) (if (bytes? x) x (string->bytes/utf-8 x)))
(define-syntax-rule (tweak unwrap wrap convert)
(let ([tweaked (tweaker (unwrap rx))])
;; the tweaker is allowed to return a regexp
(if (or (regexp? tweaked) (byte-regexp? tweaked))
tweaked
(wrap (convert tweaked)))))
(define (run-tweak)
(cond [(pregexp? rx) (tweak object-name pregexp ->str)]
[(regexp? rx) (tweak object-name regexp ->str)]
[(byte-pregexp? rx) (tweak object-name byte-pregexp ->bts)]
[(byte-regexp? rx) (tweak object-name byte-regexp ->bts)]
;; allow getting a string, so if someone needs to go
;; from a string to a regexp, there's no penalty
;; because of the intermediate regexp being recreated
[(string? rx) (tweak (lambda (x) x) regexp ->str)]
[(bytes? rx) (tweak (lambda (x) x) byte-regexp ->bts)]
[else (raise-type-error 'regexp-tweaker
"regexp of any kind, string, or bytes"
rx)]))
(or (hash-ref t rx #f)
(let ([rx* (run-tweak)]) (hash-set! t rx rx*) rx*)))))
(define (regexp-try-match pattern input-port [start-k 0] [end-k #f] [out #f])
(unless (input-port? input-port)
(raise-type-error 'regexp-try-match
@ -91,156 +104,164 @@
(and p (subbytes s (- (car p) drop) (- (cdr p) drop))))
(cdr m))))))))
;; Helper macro for the regexp functions below.
(define-syntax regexp-loop
(syntax-rules ()
[(regexp-loop name loop start end rx string
success-choose failure-k
port-success-k port-success-choose port-failure-k
need-leftover? peek?)
(let ([len (cond [(string? string) (string-length string)]
[(bytes? string) (bytes-length string)]
[else #f])])
(if peek?
(unless (input-port? string)
(raise-type-error 'name "input port" string))
(unless (or len (input-port? string))
(raise-type-error
'name "string, byte string or input port" string)))
(unless (and (number? start) (exact? start) (integer? start)
(start . >= . 0))
(raise-type-error 'name "non-negative exact integer" start))
(unless (or (not end)
(and (number? end) (exact? end) (integer? end)
(end . >= . 0)))
(raise-type-error 'name "non-negative exact integer or false" end))
(unless (or (input-port? string) (and len (start . <= . len)))
(raise-mismatch-error
'name
(format "starting offset index out of range [0,~a]: " len)
start))
(unless (or (not end)
(and (start . <= . end)
(or (input-port? string)
(and len (end . <= . len)))))
(raise-mismatch-error
'name
(format "ending offset index out of range [~a,~a]: " start len)
end))
(reverse
(let loop ([acc '()] [start start] [end end])
;; Helper macro for the regexp functions below, with some utilities.
(define (bstring-length s)
(if (bytes? s) (bytes-length s) (string-length s)))
(define (bstring->regexp name pattern)
(cond [(regexp? pattern) pattern]
[(byte-regexp? pattern) pattern]
[(string? pattern) (regexp pattern)]
[(bytes? pattern) (byte-regexp pattern)]
[else (raise-type-error
name "regexp, byte regexp, string, or byte string" pattern)]))
(define-syntax-rule (regexp-loop
name loop start end rx string
success-choose failure-k
port-success-k port-success-choose port-failure-k
need-leftover? peek?)
(let ([len (cond [(string? string) (string-length string)]
[(bytes? string) (bytes-length string)]
[else #f])])
(if peek?
(unless (input-port? string)
(raise-type-error 'name "input port" string))
(unless (or len (input-port? string))
(raise-type-error
'name "string, byte string or input port" string)))
(unless (and (number? start) (exact? start) (integer? start)
(start . >= . 0))
(raise-type-error 'name "non-negative exact integer" start))
(unless (or (not end)
(and (number? end) (exact? end) (integer? end)
(end . >= . 0)))
(raise-type-error 'name "non-negative exact integer or false" end))
(unless (or (input-port? string) (and len (start . <= . len)))
(raise-mismatch-error
'name
(format "starting offset index out of range [0,~a]: " len)
start))
(unless (or (not end)
(and (start . <= . end)
(or (input-port? string) (and len (end . <= . len)))))
(raise-mismatch-error
'name
(format "ending offset index out of range [~a,~a]: " start len)
end))
(reverse
(let loop ([acc '()] [start start] [end end])
(if (and port-success-choose (input-port? string))
(if (and port-success-choose (input-port? string))
;; Input port match, get string
(let* ([_ (when (positive? start)
;; Skip start chars:
(let ([s (make-bytes 4096)])
(let loop ([n 0])
(unless (= n start)
(let ([m (read-bytes-avail!
s string 0 (min (- start n) 4096))])
(unless (eof-object? m) (loop (+ n m))))))))]
[discarded/leftovers (if need-leftover? #f 0)]
[spitout (if need-leftover?
(open-output-bytes)
(make-output-port
'counter always-evt
(lambda (s start end flush? breakable?)
(let ([c (- end start)])
(set! discarded/leftovers
(+ c discarded/leftovers))
c))
void))]
[end (and end (- end start))]
[m (regexp-match rx string 0 end spitout)]
;; re-match if we get a zero-length match at the
;; beginning
[m (if (and m ; we have a match
;; and it's an empty one
(zero? (bstring-length (car m)))
;; and it's at the beginning
(zero? (if need-leftover?
(file-position spitout)
discarded/leftovers))
;; and we still have stuff to match
(if end
(< 0 end)
(not (eof-object? (peek-byte string)))))
(regexp-match rx string 1 end spitout)
m)]
[m (and m (car m))]
[discarded/leftovers (if need-leftover?
(get-output-bytes spitout)
discarded/leftovers)]
[end (and end m
(- end (if need-leftover?
(bstring-length discarded/leftovers)
discarded/leftovers)
(bstring-length m)))])
;; drop matches that are both empty and at the end
(if (and m (or (< 0 (bstring-length m))
(if end
(< 0 end)
(not (eof-object? (peek-byte string))))))
(loop (cons (port-success-choose m discarded/leftovers) acc)
0 end)
(port-failure-k acc discarded/leftovers)))
;; Input port match, get string
(let* ([_ (when (positive? start)
;; Skip start chars:
(let ([s (make-bytes 4096)])
(let loop ([n 0])
(unless (= n start)
(let ([m (read-bytes-avail!
s string 0 (min (- start n) 4096))])
(unless (eof-object? m) (loop (+ n m))))))))]
[discarded/leftovers (if need-leftover? #f 0)]
[spitout (if need-leftover?
(open-output-bytes)
(make-output-port
'counter always-evt
(lambda (s start end flush? breakable?)
(let ([c (- end start)])
(set! discarded/leftovers
(+ c discarded/leftovers))
c))
void))]
[end (and end (- end start))]
[m (regexp-match rx string 0 end spitout)]
;; re-match if we get a zero-length match at the
;; beginning
[m (if (and m ; we have a match
;; and it's an empty one
(zero? (bstring-length (car m)))
;; and it's at the beginning
(zero? (if need-leftover?
(file-position spitout)
discarded/leftovers))
;; and we still have stuff to match
(if end
(< 0 end)
(not (eof-object? (peek-byte string)))))
(regexp-match rx string 1 end spitout)
m)]
[m (and m (car m))]
[discarded/leftovers (if need-leftover?
(get-output-bytes spitout)
discarded/leftovers)]
[end (and end m
(- end (if need-leftover?
(bstring-length discarded/leftovers)
discarded/leftovers)
(bstring-length m)))])
;; drop matches that are both empty and at the end
(if (and m (or (< 0 (bstring-length m))
(if end
(< 0 end)
(not (eof-object? (peek-byte string))))))
(loop (cons (port-success-choose m discarded/leftovers) acc)
0 end)
(port-failure-k acc discarded/leftovers)))
;; String/port match, get positions
(let* ([match (if peek?
regexp-match-peek-positions
regexp-match-positions)]
[m (match rx string start end)])
(if (not m)
(failure-k acc start end)
(let* ([mstart (caar m)]
[mend (cdar m)]
;; re-match if we get a zero-length match at the
;; beginning, and we can continue
[m (if (and (= mstart mend start)
(cond
[end (< start end)]
[len (< start len)]
[(input-port? string)
(not (eof-object? (peek-byte string)))]
[else (error "internal error (str)")]))
(if (or peek? (not (input-port? string)))
(match rx string (add1 start) end)
;; rematching on a port requires adding `start'
;; offsets
(let ([m (match rx string 1 end)])
(if (and m (positive? start))
(list (cons (+ start (caar m))
(+ start (cdar m))))
m)))
m)])
;; fail if rematch failed
(if (not m)
(failure-k acc start end)
(let ([mstart (caar m)]
[mend (cdar m)])
;; or if we have a zero-length match at the end
(if (and (= mstart mend)
(cond [end (= mend end)]
[len (= mend len)]
[(input-port? string)
(eof-object?
(peek-byte string (if peek? mend 0)))]
[else (error "internal error (str)")]))
(failure-k acc start end)
(if port-success-k
(port-success-k
(lambda (acc new-start new-end)
(loop acc new-start new-end))
acc start end mstart mend)
(loop (cons (success-choose start mstart mend) acc)
mend end))))))))))))]))
;; String/port match, get positions
(let* ([match (if peek?
regexp-match-peek-positions
regexp-match-positions)]
[m (match rx string start end)])
(if (not m)
(failure-k acc start end)
(let* ([mstart (caar m)]
[mend (cdar m)]
;; re-match if we get a zero-length match at the
;; beginning, and we can continue
[m (if (and (= mstart mend start)
(cond
[end (< start end)]
[len (< start len)]
[(input-port? string)
(not (eof-object? (peek-byte string)))]
[else (error "internal error (str)")]))
(if (or peek? (not (input-port? string)))
(match rx string (add1 start) end)
;; rematching on a port requires adding `start'
;; offsets
(let ([m (match rx string 1 end)])
(if (and m (positive? start))
(list (cons (+ start (caar m))
(+ start (cdar m))))
m)))
m)])
;; fail if rematch failed
(if (not m)
(failure-k acc start end)
(let ([mstart (caar m)]
[mend (cdar m)])
;; or if we have a zero-length match at the end
(if (and (= mstart mend)
(cond [end (= mend end)]
[len (= mend len)]
[(input-port? string)
(eof-object?
(peek-byte string (if peek? mend 0)))]
[else (error "internal error (str)")]))
(failure-k acc start end)
(if port-success-k
(port-success-k
(lambda (acc new-start new-end)
(loop acc new-start new-end))
acc start end mstart mend)
(loop (cons (success-choose start mstart mend) acc)
mend end)))))))))))))
;; Returns all the positions at which the pattern matched.
(define (regexp-match-positions* pattern string [start 0] [end #f])
(define rx (bstring->regexp 'regexp-match-positions* pattern))
(regexp-loop regexp-match-positions* loop start end rx string
(regexp-loop
regexp-match-positions* loop start end
(bstring->regexp 'regexp-match-positions* pattern) string
;; success-choose:
(lambda (start mstart mend) (cons mstart mend))
;; failure-k:
@ -262,8 +283,9 @@
;; Returns all the positions at which the pattern matched.
(define (regexp-match-peek-positions* pattern string [start 0] [end #f])
(define rx (bstring->regexp 'regexp-match-peek-positions* pattern))
(regexp-loop regexp-match-peek-positions* loop start end rx string
(regexp-loop
regexp-match-peek-positions* loop start end
(bstring->regexp 'regexp-match-peek-positions* pattern) string
;; success-choose:
(lambda (start mstart mend) (cons mstart mend))
;; failure-k: