added make-regexp-tweaker (not used yet), and some code reorganization
svn: r12573
This commit is contained in:
parent
62a8873198
commit
ecb39eedee
|
@ -32,21 +32,7 @@
|
|||
s)))))))
|
||||
|
||||
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; Regexp helpers
|
||||
|
||||
(define (bstring-length s)
|
||||
(if (bytes? s) (bytes-length s) (string-length s)))
|
||||
|
||||
(define (bstring->regexp name pattern)
|
||||
(cond [(regexp? pattern) pattern]
|
||||
[(byte-regexp? pattern) pattern]
|
||||
[(string? pattern) (regexp pattern)]
|
||||
[(bytes? pattern) (byte-regexp pattern)]
|
||||
[else (raise-type-error
|
||||
name "regexp, byte regexp, string, or byte string" pattern)]))
|
||||
|
||||
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; Regexp helpers
|
||||
;; Regexp utilities
|
||||
|
||||
(define regexp-quote-chars:s #rx"[][.*?+|(){}\\$^]")
|
||||
(define regexp-quote-chars:b #rx#"[][.*?+|(){}\\$^]")
|
||||
|
@ -69,6 +55,33 @@
|
|||
[else (raise-type-error 'regexp-replace-quote
|
||||
"string or byte string" s)]))
|
||||
|
||||
(define (make-regexp-tweaker tweaker)
|
||||
(let ([t (make-weak-hasheq)])
|
||||
(lambda (rx)
|
||||
(define-syntax-rule (->str x) (if (bytes? x) (bytes->string/utf-8 x) x))
|
||||
(define-syntax-rule (->bts x) (if (bytes? x) x (string->bytes/utf-8 x)))
|
||||
(define-syntax-rule (tweak unwrap wrap convert)
|
||||
(let ([tweaked (tweaker (unwrap rx))])
|
||||
;; the tweaker is allowed to return a regexp
|
||||
(if (or (regexp? tweaked) (byte-regexp? tweaked))
|
||||
tweaked
|
||||
(wrap (convert tweaked)))))
|
||||
(define (run-tweak)
|
||||
(cond [(pregexp? rx) (tweak object-name pregexp ->str)]
|
||||
[(regexp? rx) (tweak object-name regexp ->str)]
|
||||
[(byte-pregexp? rx) (tweak object-name byte-pregexp ->bts)]
|
||||
[(byte-regexp? rx) (tweak object-name byte-regexp ->bts)]
|
||||
;; allow getting a string, so if someone needs to go
|
||||
;; from a string to a regexp, there's no penalty
|
||||
;; because of the intermediate regexp being recreated
|
||||
[(string? rx) (tweak (lambda (x) x) regexp ->str)]
|
||||
[(bytes? rx) (tweak (lambda (x) x) byte-regexp ->bts)]
|
||||
[else (raise-type-error 'regexp-tweaker
|
||||
"regexp of any kind, string, or bytes"
|
||||
rx)]))
|
||||
(or (hash-ref t rx #f)
|
||||
(let ([rx* (run-tweak)]) (hash-set! t rx rx*) rx*)))))
|
||||
|
||||
(define (regexp-try-match pattern input-port [start-k 0] [end-k #f] [out #f])
|
||||
(unless (input-port? input-port)
|
||||
(raise-type-error 'regexp-try-match
|
||||
|
@ -91,156 +104,164 @@
|
|||
(and p (subbytes s (- (car p) drop) (- (cdr p) drop))))
|
||||
(cdr m))))))))
|
||||
|
||||
;; Helper macro for the regexp functions below.
|
||||
(define-syntax regexp-loop
|
||||
(syntax-rules ()
|
||||
[(regexp-loop name loop start end rx string
|
||||
success-choose failure-k
|
||||
port-success-k port-success-choose port-failure-k
|
||||
need-leftover? peek?)
|
||||
(let ([len (cond [(string? string) (string-length string)]
|
||||
[(bytes? string) (bytes-length string)]
|
||||
[else #f])])
|
||||
(if peek?
|
||||
(unless (input-port? string)
|
||||
(raise-type-error 'name "input port" string))
|
||||
(unless (or len (input-port? string))
|
||||
(raise-type-error
|
||||
'name "string, byte string or input port" string)))
|
||||
(unless (and (number? start) (exact? start) (integer? start)
|
||||
(start . >= . 0))
|
||||
(raise-type-error 'name "non-negative exact integer" start))
|
||||
(unless (or (not end)
|
||||
(and (number? end) (exact? end) (integer? end)
|
||||
(end . >= . 0)))
|
||||
(raise-type-error 'name "non-negative exact integer or false" end))
|
||||
(unless (or (input-port? string) (and len (start . <= . len)))
|
||||
(raise-mismatch-error
|
||||
'name
|
||||
(format "starting offset index out of range [0,~a]: " len)
|
||||
start))
|
||||
(unless (or (not end)
|
||||
(and (start . <= . end)
|
||||
(or (input-port? string)
|
||||
(and len (end . <= . len)))))
|
||||
(raise-mismatch-error
|
||||
'name
|
||||
(format "ending offset index out of range [~a,~a]: " start len)
|
||||
end))
|
||||
(reverse
|
||||
(let loop ([acc '()] [start start] [end end])
|
||||
;; Helper macro for the regexp functions below, with some utilities.
|
||||
(define (bstring-length s)
|
||||
(if (bytes? s) (bytes-length s) (string-length s)))
|
||||
(define (bstring->regexp name pattern)
|
||||
(cond [(regexp? pattern) pattern]
|
||||
[(byte-regexp? pattern) pattern]
|
||||
[(string? pattern) (regexp pattern)]
|
||||
[(bytes? pattern) (byte-regexp pattern)]
|
||||
[else (raise-type-error
|
||||
name "regexp, byte regexp, string, or byte string" pattern)]))
|
||||
(define-syntax-rule (regexp-loop
|
||||
name loop start end rx string
|
||||
success-choose failure-k
|
||||
port-success-k port-success-choose port-failure-k
|
||||
need-leftover? peek?)
|
||||
(let ([len (cond [(string? string) (string-length string)]
|
||||
[(bytes? string) (bytes-length string)]
|
||||
[else #f])])
|
||||
(if peek?
|
||||
(unless (input-port? string)
|
||||
(raise-type-error 'name "input port" string))
|
||||
(unless (or len (input-port? string))
|
||||
(raise-type-error
|
||||
'name "string, byte string or input port" string)))
|
||||
(unless (and (number? start) (exact? start) (integer? start)
|
||||
(start . >= . 0))
|
||||
(raise-type-error 'name "non-negative exact integer" start))
|
||||
(unless (or (not end)
|
||||
(and (number? end) (exact? end) (integer? end)
|
||||
(end . >= . 0)))
|
||||
(raise-type-error 'name "non-negative exact integer or false" end))
|
||||
(unless (or (input-port? string) (and len (start . <= . len)))
|
||||
(raise-mismatch-error
|
||||
'name
|
||||
(format "starting offset index out of range [0,~a]: " len)
|
||||
start))
|
||||
(unless (or (not end)
|
||||
(and (start . <= . end)
|
||||
(or (input-port? string) (and len (end . <= . len)))))
|
||||
(raise-mismatch-error
|
||||
'name
|
||||
(format "ending offset index out of range [~a,~a]: " start len)
|
||||
end))
|
||||
(reverse
|
||||
(let loop ([acc '()] [start start] [end end])
|
||||
|
||||
(if (and port-success-choose (input-port? string))
|
||||
(if (and port-success-choose (input-port? string))
|
||||
|
||||
;; Input port match, get string
|
||||
(let* ([_ (when (positive? start)
|
||||
;; Skip start chars:
|
||||
(let ([s (make-bytes 4096)])
|
||||
(let loop ([n 0])
|
||||
(unless (= n start)
|
||||
(let ([m (read-bytes-avail!
|
||||
s string 0 (min (- start n) 4096))])
|
||||
(unless (eof-object? m) (loop (+ n m))))))))]
|
||||
[discarded/leftovers (if need-leftover? #f 0)]
|
||||
[spitout (if need-leftover?
|
||||
(open-output-bytes)
|
||||
(make-output-port
|
||||
'counter always-evt
|
||||
(lambda (s start end flush? breakable?)
|
||||
(let ([c (- end start)])
|
||||
(set! discarded/leftovers
|
||||
(+ c discarded/leftovers))
|
||||
c))
|
||||
void))]
|
||||
[end (and end (- end start))]
|
||||
[m (regexp-match rx string 0 end spitout)]
|
||||
;; re-match if we get a zero-length match at the
|
||||
;; beginning
|
||||
[m (if (and m ; we have a match
|
||||
;; and it's an empty one
|
||||
(zero? (bstring-length (car m)))
|
||||
;; and it's at the beginning
|
||||
(zero? (if need-leftover?
|
||||
(file-position spitout)
|
||||
discarded/leftovers))
|
||||
;; and we still have stuff to match
|
||||
(if end
|
||||
(< 0 end)
|
||||
(not (eof-object? (peek-byte string)))))
|
||||
(regexp-match rx string 1 end spitout)
|
||||
m)]
|
||||
[m (and m (car m))]
|
||||
[discarded/leftovers (if need-leftover?
|
||||
(get-output-bytes spitout)
|
||||
discarded/leftovers)]
|
||||
[end (and end m
|
||||
(- end (if need-leftover?
|
||||
(bstring-length discarded/leftovers)
|
||||
discarded/leftovers)
|
||||
(bstring-length m)))])
|
||||
;; drop matches that are both empty and at the end
|
||||
(if (and m (or (< 0 (bstring-length m))
|
||||
(if end
|
||||
(< 0 end)
|
||||
(not (eof-object? (peek-byte string))))))
|
||||
(loop (cons (port-success-choose m discarded/leftovers) acc)
|
||||
0 end)
|
||||
(port-failure-k acc discarded/leftovers)))
|
||||
;; Input port match, get string
|
||||
(let* ([_ (when (positive? start)
|
||||
;; Skip start chars:
|
||||
(let ([s (make-bytes 4096)])
|
||||
(let loop ([n 0])
|
||||
(unless (= n start)
|
||||
(let ([m (read-bytes-avail!
|
||||
s string 0 (min (- start n) 4096))])
|
||||
(unless (eof-object? m) (loop (+ n m))))))))]
|
||||
[discarded/leftovers (if need-leftover? #f 0)]
|
||||
[spitout (if need-leftover?
|
||||
(open-output-bytes)
|
||||
(make-output-port
|
||||
'counter always-evt
|
||||
(lambda (s start end flush? breakable?)
|
||||
(let ([c (- end start)])
|
||||
(set! discarded/leftovers
|
||||
(+ c discarded/leftovers))
|
||||
c))
|
||||
void))]
|
||||
[end (and end (- end start))]
|
||||
[m (regexp-match rx string 0 end spitout)]
|
||||
;; re-match if we get a zero-length match at the
|
||||
;; beginning
|
||||
[m (if (and m ; we have a match
|
||||
;; and it's an empty one
|
||||
(zero? (bstring-length (car m)))
|
||||
;; and it's at the beginning
|
||||
(zero? (if need-leftover?
|
||||
(file-position spitout)
|
||||
discarded/leftovers))
|
||||
;; and we still have stuff to match
|
||||
(if end
|
||||
(< 0 end)
|
||||
(not (eof-object? (peek-byte string)))))
|
||||
(regexp-match rx string 1 end spitout)
|
||||
m)]
|
||||
[m (and m (car m))]
|
||||
[discarded/leftovers (if need-leftover?
|
||||
(get-output-bytes spitout)
|
||||
discarded/leftovers)]
|
||||
[end (and end m
|
||||
(- end (if need-leftover?
|
||||
(bstring-length discarded/leftovers)
|
||||
discarded/leftovers)
|
||||
(bstring-length m)))])
|
||||
;; drop matches that are both empty and at the end
|
||||
(if (and m (or (< 0 (bstring-length m))
|
||||
(if end
|
||||
(< 0 end)
|
||||
(not (eof-object? (peek-byte string))))))
|
||||
(loop (cons (port-success-choose m discarded/leftovers) acc)
|
||||
0 end)
|
||||
(port-failure-k acc discarded/leftovers)))
|
||||
|
||||
;; String/port match, get positions
|
||||
(let* ([match (if peek?
|
||||
regexp-match-peek-positions
|
||||
regexp-match-positions)]
|
||||
[m (match rx string start end)])
|
||||
(if (not m)
|
||||
(failure-k acc start end)
|
||||
(let* ([mstart (caar m)]
|
||||
[mend (cdar m)]
|
||||
;; re-match if we get a zero-length match at the
|
||||
;; beginning, and we can continue
|
||||
[m (if (and (= mstart mend start)
|
||||
(cond
|
||||
[end (< start end)]
|
||||
[len (< start len)]
|
||||
[(input-port? string)
|
||||
(not (eof-object? (peek-byte string)))]
|
||||
[else (error "internal error (str)")]))
|
||||
(if (or peek? (not (input-port? string)))
|
||||
(match rx string (add1 start) end)
|
||||
;; rematching on a port requires adding `start'
|
||||
;; offsets
|
||||
(let ([m (match rx string 1 end)])
|
||||
(if (and m (positive? start))
|
||||
(list (cons (+ start (caar m))
|
||||
(+ start (cdar m))))
|
||||
m)))
|
||||
m)])
|
||||
;; fail if rematch failed
|
||||
(if (not m)
|
||||
(failure-k acc start end)
|
||||
(let ([mstart (caar m)]
|
||||
[mend (cdar m)])
|
||||
;; or if we have a zero-length match at the end
|
||||
(if (and (= mstart mend)
|
||||
(cond [end (= mend end)]
|
||||
[len (= mend len)]
|
||||
[(input-port? string)
|
||||
(eof-object?
|
||||
(peek-byte string (if peek? mend 0)))]
|
||||
[else (error "internal error (str)")]))
|
||||
(failure-k acc start end)
|
||||
(if port-success-k
|
||||
(port-success-k
|
||||
(lambda (acc new-start new-end)
|
||||
(loop acc new-start new-end))
|
||||
acc start end mstart mend)
|
||||
(loop (cons (success-choose start mstart mend) acc)
|
||||
mend end))))))))))))]))
|
||||
;; String/port match, get positions
|
||||
(let* ([match (if peek?
|
||||
regexp-match-peek-positions
|
||||
regexp-match-positions)]
|
||||
[m (match rx string start end)])
|
||||
(if (not m)
|
||||
(failure-k acc start end)
|
||||
(let* ([mstart (caar m)]
|
||||
[mend (cdar m)]
|
||||
;; re-match if we get a zero-length match at the
|
||||
;; beginning, and we can continue
|
||||
[m (if (and (= mstart mend start)
|
||||
(cond
|
||||
[end (< start end)]
|
||||
[len (< start len)]
|
||||
[(input-port? string)
|
||||
(not (eof-object? (peek-byte string)))]
|
||||
[else (error "internal error (str)")]))
|
||||
(if (or peek? (not (input-port? string)))
|
||||
(match rx string (add1 start) end)
|
||||
;; rematching on a port requires adding `start'
|
||||
;; offsets
|
||||
(let ([m (match rx string 1 end)])
|
||||
(if (and m (positive? start))
|
||||
(list (cons (+ start (caar m))
|
||||
(+ start (cdar m))))
|
||||
m)))
|
||||
m)])
|
||||
;; fail if rematch failed
|
||||
(if (not m)
|
||||
(failure-k acc start end)
|
||||
(let ([mstart (caar m)]
|
||||
[mend (cdar m)])
|
||||
;; or if we have a zero-length match at the end
|
||||
(if (and (= mstart mend)
|
||||
(cond [end (= mend end)]
|
||||
[len (= mend len)]
|
||||
[(input-port? string)
|
||||
(eof-object?
|
||||
(peek-byte string (if peek? mend 0)))]
|
||||
[else (error "internal error (str)")]))
|
||||
(failure-k acc start end)
|
||||
(if port-success-k
|
||||
(port-success-k
|
||||
(lambda (acc new-start new-end)
|
||||
(loop acc new-start new-end))
|
||||
acc start end mstart mend)
|
||||
(loop (cons (success-choose start mstart mend) acc)
|
||||
mend end)))))))))))))
|
||||
|
||||
;; Returns all the positions at which the pattern matched.
|
||||
(define (regexp-match-positions* pattern string [start 0] [end #f])
|
||||
(define rx (bstring->regexp 'regexp-match-positions* pattern))
|
||||
(regexp-loop regexp-match-positions* loop start end rx string
|
||||
(regexp-loop
|
||||
regexp-match-positions* loop start end
|
||||
(bstring->regexp 'regexp-match-positions* pattern) string
|
||||
;; success-choose:
|
||||
(lambda (start mstart mend) (cons mstart mend))
|
||||
;; failure-k:
|
||||
|
@ -262,8 +283,9 @@
|
|||
|
||||
;; Returns all the positions at which the pattern matched.
|
||||
(define (regexp-match-peek-positions* pattern string [start 0] [end #f])
|
||||
(define rx (bstring->regexp 'regexp-match-peek-positions* pattern))
|
||||
(regexp-loop regexp-match-peek-positions* loop start end rx string
|
||||
(regexp-loop
|
||||
regexp-match-peek-positions* loop start end
|
||||
(bstring->regexp 'regexp-match-peek-positions* pattern) string
|
||||
;; success-choose:
|
||||
(lambda (start mstart mend) (cons mstart mend))
|
||||
;; failure-k:
|
||||
|
|
Loading…
Reference in New Issue
Block a user