added make-regexp-tweaker (not used yet), and some code reorganization
svn: r12573
This commit is contained in:
parent
62a8873198
commit
ecb39eedee
|
@ -32,21 +32,7 @@
|
||||||
s)))))))
|
s)))))))
|
||||||
|
|
||||||
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; Regexp helpers
|
;; Regexp utilities
|
||||||
|
|
||||||
(define (bstring-length s)
|
|
||||||
(if (bytes? s) (bytes-length s) (string-length s)))
|
|
||||||
|
|
||||||
(define (bstring->regexp name pattern)
|
|
||||||
(cond [(regexp? pattern) pattern]
|
|
||||||
[(byte-regexp? pattern) pattern]
|
|
||||||
[(string? pattern) (regexp pattern)]
|
|
||||||
[(bytes? pattern) (byte-regexp pattern)]
|
|
||||||
[else (raise-type-error
|
|
||||||
name "regexp, byte regexp, string, or byte string" pattern)]))
|
|
||||||
|
|
||||||
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
|
||||||
;; Regexp helpers
|
|
||||||
|
|
||||||
(define regexp-quote-chars:s #rx"[][.*?+|(){}\\$^]")
|
(define regexp-quote-chars:s #rx"[][.*?+|(){}\\$^]")
|
||||||
(define regexp-quote-chars:b #rx#"[][.*?+|(){}\\$^]")
|
(define regexp-quote-chars:b #rx#"[][.*?+|(){}\\$^]")
|
||||||
|
@ -69,6 +55,33 @@
|
||||||
[else (raise-type-error 'regexp-replace-quote
|
[else (raise-type-error 'regexp-replace-quote
|
||||||
"string or byte string" s)]))
|
"string or byte string" s)]))
|
||||||
|
|
||||||
|
(define (make-regexp-tweaker tweaker)
|
||||||
|
(let ([t (make-weak-hasheq)])
|
||||||
|
(lambda (rx)
|
||||||
|
(define-syntax-rule (->str x) (if (bytes? x) (bytes->string/utf-8 x) x))
|
||||||
|
(define-syntax-rule (->bts x) (if (bytes? x) x (string->bytes/utf-8 x)))
|
||||||
|
(define-syntax-rule (tweak unwrap wrap convert)
|
||||||
|
(let ([tweaked (tweaker (unwrap rx))])
|
||||||
|
;; the tweaker is allowed to return a regexp
|
||||||
|
(if (or (regexp? tweaked) (byte-regexp? tweaked))
|
||||||
|
tweaked
|
||||||
|
(wrap (convert tweaked)))))
|
||||||
|
(define (run-tweak)
|
||||||
|
(cond [(pregexp? rx) (tweak object-name pregexp ->str)]
|
||||||
|
[(regexp? rx) (tweak object-name regexp ->str)]
|
||||||
|
[(byte-pregexp? rx) (tweak object-name byte-pregexp ->bts)]
|
||||||
|
[(byte-regexp? rx) (tweak object-name byte-regexp ->bts)]
|
||||||
|
;; allow getting a string, so if someone needs to go
|
||||||
|
;; from a string to a regexp, there's no penalty
|
||||||
|
;; because of the intermediate regexp being recreated
|
||||||
|
[(string? rx) (tweak (lambda (x) x) regexp ->str)]
|
||||||
|
[(bytes? rx) (tweak (lambda (x) x) byte-regexp ->bts)]
|
||||||
|
[else (raise-type-error 'regexp-tweaker
|
||||||
|
"regexp of any kind, string, or bytes"
|
||||||
|
rx)]))
|
||||||
|
(or (hash-ref t rx #f)
|
||||||
|
(let ([rx* (run-tweak)]) (hash-set! t rx rx*) rx*)))))
|
||||||
|
|
||||||
(define (regexp-try-match pattern input-port [start-k 0] [end-k #f] [out #f])
|
(define (regexp-try-match pattern input-port [start-k 0] [end-k #f] [out #f])
|
||||||
(unless (input-port? input-port)
|
(unless (input-port? input-port)
|
||||||
(raise-type-error 'regexp-try-match
|
(raise-type-error 'regexp-try-match
|
||||||
|
@ -91,156 +104,164 @@
|
||||||
(and p (subbytes s (- (car p) drop) (- (cdr p) drop))))
|
(and p (subbytes s (- (car p) drop) (- (cdr p) drop))))
|
||||||
(cdr m))))))))
|
(cdr m))))))))
|
||||||
|
|
||||||
;; Helper macro for the regexp functions below.
|
;; Helper macro for the regexp functions below, with some utilities.
|
||||||
(define-syntax regexp-loop
|
(define (bstring-length s)
|
||||||
(syntax-rules ()
|
(if (bytes? s) (bytes-length s) (string-length s)))
|
||||||
[(regexp-loop name loop start end rx string
|
(define (bstring->regexp name pattern)
|
||||||
success-choose failure-k
|
(cond [(regexp? pattern) pattern]
|
||||||
port-success-k port-success-choose port-failure-k
|
[(byte-regexp? pattern) pattern]
|
||||||
need-leftover? peek?)
|
[(string? pattern) (regexp pattern)]
|
||||||
(let ([len (cond [(string? string) (string-length string)]
|
[(bytes? pattern) (byte-regexp pattern)]
|
||||||
[(bytes? string) (bytes-length string)]
|
[else (raise-type-error
|
||||||
[else #f])])
|
name "regexp, byte regexp, string, or byte string" pattern)]))
|
||||||
(if peek?
|
(define-syntax-rule (regexp-loop
|
||||||
(unless (input-port? string)
|
name loop start end rx string
|
||||||
(raise-type-error 'name "input port" string))
|
success-choose failure-k
|
||||||
(unless (or len (input-port? string))
|
port-success-k port-success-choose port-failure-k
|
||||||
(raise-type-error
|
need-leftover? peek?)
|
||||||
'name "string, byte string or input port" string)))
|
(let ([len (cond [(string? string) (string-length string)]
|
||||||
(unless (and (number? start) (exact? start) (integer? start)
|
[(bytes? string) (bytes-length string)]
|
||||||
(start . >= . 0))
|
[else #f])])
|
||||||
(raise-type-error 'name "non-negative exact integer" start))
|
(if peek?
|
||||||
(unless (or (not end)
|
(unless (input-port? string)
|
||||||
(and (number? end) (exact? end) (integer? end)
|
(raise-type-error 'name "input port" string))
|
||||||
(end . >= . 0)))
|
(unless (or len (input-port? string))
|
||||||
(raise-type-error 'name "non-negative exact integer or false" end))
|
(raise-type-error
|
||||||
(unless (or (input-port? string) (and len (start . <= . len)))
|
'name "string, byte string or input port" string)))
|
||||||
(raise-mismatch-error
|
(unless (and (number? start) (exact? start) (integer? start)
|
||||||
'name
|
(start . >= . 0))
|
||||||
(format "starting offset index out of range [0,~a]: " len)
|
(raise-type-error 'name "non-negative exact integer" start))
|
||||||
start))
|
(unless (or (not end)
|
||||||
(unless (or (not end)
|
(and (number? end) (exact? end) (integer? end)
|
||||||
(and (start . <= . end)
|
(end . >= . 0)))
|
||||||
(or (input-port? string)
|
(raise-type-error 'name "non-negative exact integer or false" end))
|
||||||
(and len (end . <= . len)))))
|
(unless (or (input-port? string) (and len (start . <= . len)))
|
||||||
(raise-mismatch-error
|
(raise-mismatch-error
|
||||||
'name
|
'name
|
||||||
(format "ending offset index out of range [~a,~a]: " start len)
|
(format "starting offset index out of range [0,~a]: " len)
|
||||||
end))
|
start))
|
||||||
(reverse
|
(unless (or (not end)
|
||||||
(let loop ([acc '()] [start start] [end end])
|
(and (start . <= . end)
|
||||||
|
(or (input-port? string) (and len (end . <= . len)))))
|
||||||
|
(raise-mismatch-error
|
||||||
|
'name
|
||||||
|
(format "ending offset index out of range [~a,~a]: " start len)
|
||||||
|
end))
|
||||||
|
(reverse
|
||||||
|
(let loop ([acc '()] [start start] [end end])
|
||||||
|
|
||||||
(if (and port-success-choose (input-port? string))
|
(if (and port-success-choose (input-port? string))
|
||||||
|
|
||||||
;; Input port match, get string
|
;; Input port match, get string
|
||||||
(let* ([_ (when (positive? start)
|
(let* ([_ (when (positive? start)
|
||||||
;; Skip start chars:
|
;; Skip start chars:
|
||||||
(let ([s (make-bytes 4096)])
|
(let ([s (make-bytes 4096)])
|
||||||
(let loop ([n 0])
|
(let loop ([n 0])
|
||||||
(unless (= n start)
|
(unless (= n start)
|
||||||
(let ([m (read-bytes-avail!
|
(let ([m (read-bytes-avail!
|
||||||
s string 0 (min (- start n) 4096))])
|
s string 0 (min (- start n) 4096))])
|
||||||
(unless (eof-object? m) (loop (+ n m))))))))]
|
(unless (eof-object? m) (loop (+ n m))))))))]
|
||||||
[discarded/leftovers (if need-leftover? #f 0)]
|
[discarded/leftovers (if need-leftover? #f 0)]
|
||||||
[spitout (if need-leftover?
|
[spitout (if need-leftover?
|
||||||
(open-output-bytes)
|
(open-output-bytes)
|
||||||
(make-output-port
|
(make-output-port
|
||||||
'counter always-evt
|
'counter always-evt
|
||||||
(lambda (s start end flush? breakable?)
|
(lambda (s start end flush? breakable?)
|
||||||
(let ([c (- end start)])
|
(let ([c (- end start)])
|
||||||
(set! discarded/leftovers
|
(set! discarded/leftovers
|
||||||
(+ c discarded/leftovers))
|
(+ c discarded/leftovers))
|
||||||
c))
|
c))
|
||||||
void))]
|
void))]
|
||||||
[end (and end (- end start))]
|
[end (and end (- end start))]
|
||||||
[m (regexp-match rx string 0 end spitout)]
|
[m (regexp-match rx string 0 end spitout)]
|
||||||
;; re-match if we get a zero-length match at the
|
;; re-match if we get a zero-length match at the
|
||||||
;; beginning
|
;; beginning
|
||||||
[m (if (and m ; we have a match
|
[m (if (and m ; we have a match
|
||||||
;; and it's an empty one
|
;; and it's an empty one
|
||||||
(zero? (bstring-length (car m)))
|
(zero? (bstring-length (car m)))
|
||||||
;; and it's at the beginning
|
;; and it's at the beginning
|
||||||
(zero? (if need-leftover?
|
(zero? (if need-leftover?
|
||||||
(file-position spitout)
|
(file-position spitout)
|
||||||
discarded/leftovers))
|
discarded/leftovers))
|
||||||
;; and we still have stuff to match
|
;; and we still have stuff to match
|
||||||
(if end
|
(if end
|
||||||
(< 0 end)
|
(< 0 end)
|
||||||
(not (eof-object? (peek-byte string)))))
|
(not (eof-object? (peek-byte string)))))
|
||||||
(regexp-match rx string 1 end spitout)
|
(regexp-match rx string 1 end spitout)
|
||||||
m)]
|
m)]
|
||||||
[m (and m (car m))]
|
[m (and m (car m))]
|
||||||
[discarded/leftovers (if need-leftover?
|
[discarded/leftovers (if need-leftover?
|
||||||
(get-output-bytes spitout)
|
(get-output-bytes spitout)
|
||||||
discarded/leftovers)]
|
discarded/leftovers)]
|
||||||
[end (and end m
|
[end (and end m
|
||||||
(- end (if need-leftover?
|
(- end (if need-leftover?
|
||||||
(bstring-length discarded/leftovers)
|
(bstring-length discarded/leftovers)
|
||||||
discarded/leftovers)
|
discarded/leftovers)
|
||||||
(bstring-length m)))])
|
(bstring-length m)))])
|
||||||
;; drop matches that are both empty and at the end
|
;; drop matches that are both empty and at the end
|
||||||
(if (and m (or (< 0 (bstring-length m))
|
(if (and m (or (< 0 (bstring-length m))
|
||||||
(if end
|
(if end
|
||||||
(< 0 end)
|
(< 0 end)
|
||||||
(not (eof-object? (peek-byte string))))))
|
(not (eof-object? (peek-byte string))))))
|
||||||
(loop (cons (port-success-choose m discarded/leftovers) acc)
|
(loop (cons (port-success-choose m discarded/leftovers) acc)
|
||||||
0 end)
|
0 end)
|
||||||
(port-failure-k acc discarded/leftovers)))
|
(port-failure-k acc discarded/leftovers)))
|
||||||
|
|
||||||
;; String/port match, get positions
|
;; String/port match, get positions
|
||||||
(let* ([match (if peek?
|
(let* ([match (if peek?
|
||||||
regexp-match-peek-positions
|
regexp-match-peek-positions
|
||||||
regexp-match-positions)]
|
regexp-match-positions)]
|
||||||
[m (match rx string start end)])
|
[m (match rx string start end)])
|
||||||
(if (not m)
|
(if (not m)
|
||||||
(failure-k acc start end)
|
(failure-k acc start end)
|
||||||
(let* ([mstart (caar m)]
|
(let* ([mstart (caar m)]
|
||||||
[mend (cdar m)]
|
[mend (cdar m)]
|
||||||
;; re-match if we get a zero-length match at the
|
;; re-match if we get a zero-length match at the
|
||||||
;; beginning, and we can continue
|
;; beginning, and we can continue
|
||||||
[m (if (and (= mstart mend start)
|
[m (if (and (= mstart mend start)
|
||||||
(cond
|
(cond
|
||||||
[end (< start end)]
|
[end (< start end)]
|
||||||
[len (< start len)]
|
[len (< start len)]
|
||||||
[(input-port? string)
|
[(input-port? string)
|
||||||
(not (eof-object? (peek-byte string)))]
|
(not (eof-object? (peek-byte string)))]
|
||||||
[else (error "internal error (str)")]))
|
[else (error "internal error (str)")]))
|
||||||
(if (or peek? (not (input-port? string)))
|
(if (or peek? (not (input-port? string)))
|
||||||
(match rx string (add1 start) end)
|
(match rx string (add1 start) end)
|
||||||
;; rematching on a port requires adding `start'
|
;; rematching on a port requires adding `start'
|
||||||
;; offsets
|
;; offsets
|
||||||
(let ([m (match rx string 1 end)])
|
(let ([m (match rx string 1 end)])
|
||||||
(if (and m (positive? start))
|
(if (and m (positive? start))
|
||||||
(list (cons (+ start (caar m))
|
(list (cons (+ start (caar m))
|
||||||
(+ start (cdar m))))
|
(+ start (cdar m))))
|
||||||
m)))
|
m)))
|
||||||
m)])
|
m)])
|
||||||
;; fail if rematch failed
|
;; fail if rematch failed
|
||||||
(if (not m)
|
(if (not m)
|
||||||
(failure-k acc start end)
|
(failure-k acc start end)
|
||||||
(let ([mstart (caar m)]
|
(let ([mstart (caar m)]
|
||||||
[mend (cdar m)])
|
[mend (cdar m)])
|
||||||
;; or if we have a zero-length match at the end
|
;; or if we have a zero-length match at the end
|
||||||
(if (and (= mstart mend)
|
(if (and (= mstart mend)
|
||||||
(cond [end (= mend end)]
|
(cond [end (= mend end)]
|
||||||
[len (= mend len)]
|
[len (= mend len)]
|
||||||
[(input-port? string)
|
[(input-port? string)
|
||||||
(eof-object?
|
(eof-object?
|
||||||
(peek-byte string (if peek? mend 0)))]
|
(peek-byte string (if peek? mend 0)))]
|
||||||
[else (error "internal error (str)")]))
|
[else (error "internal error (str)")]))
|
||||||
(failure-k acc start end)
|
(failure-k acc start end)
|
||||||
(if port-success-k
|
(if port-success-k
|
||||||
(port-success-k
|
(port-success-k
|
||||||
(lambda (acc new-start new-end)
|
(lambda (acc new-start new-end)
|
||||||
(loop acc new-start new-end))
|
(loop acc new-start new-end))
|
||||||
acc start end mstart mend)
|
acc start end mstart mend)
|
||||||
(loop (cons (success-choose start mstart mend) acc)
|
(loop (cons (success-choose start mstart mend) acc)
|
||||||
mend end))))))))))))]))
|
mend end)))))))))))))
|
||||||
|
|
||||||
;; Returns all the positions at which the pattern matched.
|
;; Returns all the positions at which the pattern matched.
|
||||||
(define (regexp-match-positions* pattern string [start 0] [end #f])
|
(define (regexp-match-positions* pattern string [start 0] [end #f])
|
||||||
(define rx (bstring->regexp 'regexp-match-positions* pattern))
|
(regexp-loop
|
||||||
(regexp-loop regexp-match-positions* loop start end rx string
|
regexp-match-positions* loop start end
|
||||||
|
(bstring->regexp 'regexp-match-positions* pattern) string
|
||||||
;; success-choose:
|
;; success-choose:
|
||||||
(lambda (start mstart mend) (cons mstart mend))
|
(lambda (start mstart mend) (cons mstart mend))
|
||||||
;; failure-k:
|
;; failure-k:
|
||||||
|
@ -262,8 +283,9 @@
|
||||||
|
|
||||||
;; Returns all the positions at which the pattern matched.
|
;; Returns all the positions at which the pattern matched.
|
||||||
(define (regexp-match-peek-positions* pattern string [start 0] [end #f])
|
(define (regexp-match-peek-positions* pattern string [start 0] [end #f])
|
||||||
(define rx (bstring->regexp 'regexp-match-peek-positions* pattern))
|
(regexp-loop
|
||||||
(regexp-loop regexp-match-peek-positions* loop start end rx string
|
regexp-match-peek-positions* loop start end
|
||||||
|
(bstring->regexp 'regexp-match-peek-positions* pattern) string
|
||||||
;; success-choose:
|
;; success-choose:
|
||||||
(lambda (start mstart mend) (cons mstart mend))
|
(lambda (start mstart mend) (cons mstart mend))
|
||||||
;; failure-k:
|
;; failure-k:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user