diff --git a/collects/scheme/private/string.ss b/collects/scheme/private/string.ss index d50236b433..2ec85b8de4 100644 --- a/collects/scheme/private/string.ss +++ b/collects/scheme/private/string.ss @@ -32,21 +32,7 @@ s))))))) ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ;; Regexp helpers - - (define (bstring-length s) - (if (bytes? s) (bytes-length s) (string-length s))) - - (define (bstring->regexp name pattern) - (cond [(regexp? pattern) pattern] - [(byte-regexp? pattern) pattern] - [(string? pattern) (regexp pattern)] - [(bytes? pattern) (byte-regexp pattern)] - [else (raise-type-error - name "regexp, byte regexp, string, or byte string" pattern)])) - - ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ;; Regexp helpers + ;; Regexp utilities (define regexp-quote-chars:s #rx"[][.*?+|(){}\\$^]") (define regexp-quote-chars:b #rx#"[][.*?+|(){}\\$^]") @@ -69,6 +55,33 @@ [else (raise-type-error 'regexp-replace-quote "string or byte string" s)])) + (define (make-regexp-tweaker tweaker) + (let ([t (make-weak-hasheq)]) + (lambda (rx) + (define-syntax-rule (->str x) (if (bytes? x) (bytes->string/utf-8 x) x)) + (define-syntax-rule (->bts x) (if (bytes? x) x (string->bytes/utf-8 x))) + (define-syntax-rule (tweak unwrap wrap convert) + (let ([tweaked (tweaker (unwrap rx))]) + ;; the tweaker is allowed to return a regexp + (if (or (regexp? tweaked) (byte-regexp? tweaked)) + tweaked + (wrap (convert tweaked))))) + (define (run-tweak) + (cond [(pregexp? rx) (tweak object-name pregexp ->str)] + [(regexp? rx) (tweak object-name regexp ->str)] + [(byte-pregexp? rx) (tweak object-name byte-pregexp ->bts)] + [(byte-regexp? rx) (tweak object-name byte-regexp ->bts)] + ;; allow getting a string, so if someone needs to go + ;; from a string to a regexp, there's no penalty + ;; because of the intermediate regexp being recreated + [(string? rx) (tweak (lambda (x) x) regexp ->str)] + [(bytes? rx) (tweak (lambda (x) x) byte-regexp ->bts)] + [else (raise-type-error 'regexp-tweaker + "regexp of any kind, string, or bytes" + rx)])) + (or (hash-ref t rx #f) + (let ([rx* (run-tweak)]) (hash-set! t rx rx*) rx*))))) + (define (regexp-try-match pattern input-port [start-k 0] [end-k #f] [out #f]) (unless (input-port? input-port) (raise-type-error 'regexp-try-match @@ -91,156 +104,164 @@ (and p (subbytes s (- (car p) drop) (- (cdr p) drop)))) (cdr m)))))))) - ;; Helper macro for the regexp functions below. - (define-syntax regexp-loop - (syntax-rules () - [(regexp-loop name loop start end rx string - success-choose failure-k - port-success-k port-success-choose port-failure-k - need-leftover? peek?) - (let ([len (cond [(string? string) (string-length string)] - [(bytes? string) (bytes-length string)] - [else #f])]) - (if peek? - (unless (input-port? string) - (raise-type-error 'name "input port" string)) - (unless (or len (input-port? string)) - (raise-type-error - 'name "string, byte string or input port" string))) - (unless (and (number? start) (exact? start) (integer? start) - (start . >= . 0)) - (raise-type-error 'name "non-negative exact integer" start)) - (unless (or (not end) - (and (number? end) (exact? end) (integer? end) - (end . >= . 0))) - (raise-type-error 'name "non-negative exact integer or false" end)) - (unless (or (input-port? string) (and len (start . <= . len))) - (raise-mismatch-error - 'name - (format "starting offset index out of range [0,~a]: " len) - start)) - (unless (or (not end) - (and (start . <= . end) - (or (input-port? string) - (and len (end . <= . len))))) - (raise-mismatch-error - 'name - (format "ending offset index out of range [~a,~a]: " start len) - end)) - (reverse - (let loop ([acc '()] [start start] [end end]) + ;; Helper macro for the regexp functions below, with some utilities. + (define (bstring-length s) + (if (bytes? s) (bytes-length s) (string-length s))) + (define (bstring->regexp name pattern) + (cond [(regexp? pattern) pattern] + [(byte-regexp? pattern) pattern] + [(string? pattern) (regexp pattern)] + [(bytes? pattern) (byte-regexp pattern)] + [else (raise-type-error + name "regexp, byte regexp, string, or byte string" pattern)])) + (define-syntax-rule (regexp-loop + name loop start end rx string + success-choose failure-k + port-success-k port-success-choose port-failure-k + need-leftover? peek?) + (let ([len (cond [(string? string) (string-length string)] + [(bytes? string) (bytes-length string)] + [else #f])]) + (if peek? + (unless (input-port? string) + (raise-type-error 'name "input port" string)) + (unless (or len (input-port? string)) + (raise-type-error + 'name "string, byte string or input port" string))) + (unless (and (number? start) (exact? start) (integer? start) + (start . >= . 0)) + (raise-type-error 'name "non-negative exact integer" start)) + (unless (or (not end) + (and (number? end) (exact? end) (integer? end) + (end . >= . 0))) + (raise-type-error 'name "non-negative exact integer or false" end)) + (unless (or (input-port? string) (and len (start . <= . len))) + (raise-mismatch-error + 'name + (format "starting offset index out of range [0,~a]: " len) + start)) + (unless (or (not end) + (and (start . <= . end) + (or (input-port? string) (and len (end . <= . len))))) + (raise-mismatch-error + 'name + (format "ending offset index out of range [~a,~a]: " start len) + end)) + (reverse + (let loop ([acc '()] [start start] [end end]) - (if (and port-success-choose (input-port? string)) + (if (and port-success-choose (input-port? string)) - ;; Input port match, get string - (let* ([_ (when (positive? start) - ;; Skip start chars: - (let ([s (make-bytes 4096)]) - (let loop ([n 0]) - (unless (= n start) - (let ([m (read-bytes-avail! - s string 0 (min (- start n) 4096))]) - (unless (eof-object? m) (loop (+ n m))))))))] - [discarded/leftovers (if need-leftover? #f 0)] - [spitout (if need-leftover? - (open-output-bytes) - (make-output-port - 'counter always-evt - (lambda (s start end flush? breakable?) - (let ([c (- end start)]) - (set! discarded/leftovers - (+ c discarded/leftovers)) - c)) - void))] - [end (and end (- end start))] - [m (regexp-match rx string 0 end spitout)] - ;; re-match if we get a zero-length match at the - ;; beginning - [m (if (and m ; we have a match - ;; and it's an empty one - (zero? (bstring-length (car m))) - ;; and it's at the beginning - (zero? (if need-leftover? - (file-position spitout) - discarded/leftovers)) - ;; and we still have stuff to match - (if end - (< 0 end) - (not (eof-object? (peek-byte string))))) - (regexp-match rx string 1 end spitout) - m)] - [m (and m (car m))] - [discarded/leftovers (if need-leftover? - (get-output-bytes spitout) - discarded/leftovers)] - [end (and end m - (- end (if need-leftover? - (bstring-length discarded/leftovers) - discarded/leftovers) - (bstring-length m)))]) - ;; drop matches that are both empty and at the end - (if (and m (or (< 0 (bstring-length m)) - (if end - (< 0 end) - (not (eof-object? (peek-byte string)))))) - (loop (cons (port-success-choose m discarded/leftovers) acc) - 0 end) - (port-failure-k acc discarded/leftovers))) + ;; Input port match, get string + (let* ([_ (when (positive? start) + ;; Skip start chars: + (let ([s (make-bytes 4096)]) + (let loop ([n 0]) + (unless (= n start) + (let ([m (read-bytes-avail! + s string 0 (min (- start n) 4096))]) + (unless (eof-object? m) (loop (+ n m))))))))] + [discarded/leftovers (if need-leftover? #f 0)] + [spitout (if need-leftover? + (open-output-bytes) + (make-output-port + 'counter always-evt + (lambda (s start end flush? breakable?) + (let ([c (- end start)]) + (set! discarded/leftovers + (+ c discarded/leftovers)) + c)) + void))] + [end (and end (- end start))] + [m (regexp-match rx string 0 end spitout)] + ;; re-match if we get a zero-length match at the + ;; beginning + [m (if (and m ; we have a match + ;; and it's an empty one + (zero? (bstring-length (car m))) + ;; and it's at the beginning + (zero? (if need-leftover? + (file-position spitout) + discarded/leftovers)) + ;; and we still have stuff to match + (if end + (< 0 end) + (not (eof-object? (peek-byte string))))) + (regexp-match rx string 1 end spitout) + m)] + [m (and m (car m))] + [discarded/leftovers (if need-leftover? + (get-output-bytes spitout) + discarded/leftovers)] + [end (and end m + (- end (if need-leftover? + (bstring-length discarded/leftovers) + discarded/leftovers) + (bstring-length m)))]) + ;; drop matches that are both empty and at the end + (if (and m (or (< 0 (bstring-length m)) + (if end + (< 0 end) + (not (eof-object? (peek-byte string)))))) + (loop (cons (port-success-choose m discarded/leftovers) acc) + 0 end) + (port-failure-k acc discarded/leftovers))) - ;; String/port match, get positions - (let* ([match (if peek? - regexp-match-peek-positions - regexp-match-positions)] - [m (match rx string start end)]) - (if (not m) - (failure-k acc start end) - (let* ([mstart (caar m)] - [mend (cdar m)] - ;; re-match if we get a zero-length match at the - ;; beginning, and we can continue - [m (if (and (= mstart mend start) - (cond - [end (< start end)] - [len (< start len)] - [(input-port? string) - (not (eof-object? (peek-byte string)))] - [else (error "internal error (str)")])) - (if (or peek? (not (input-port? string))) - (match rx string (add1 start) end) - ;; rematching on a port requires adding `start' - ;; offsets - (let ([m (match rx string 1 end)]) - (if (and m (positive? start)) - (list (cons (+ start (caar m)) - (+ start (cdar m)))) - m))) - m)]) - ;; fail if rematch failed - (if (not m) - (failure-k acc start end) - (let ([mstart (caar m)] - [mend (cdar m)]) - ;; or if we have a zero-length match at the end - (if (and (= mstart mend) - (cond [end (= mend end)] - [len (= mend len)] - [(input-port? string) - (eof-object? - (peek-byte string (if peek? mend 0)))] - [else (error "internal error (str)")])) - (failure-k acc start end) - (if port-success-k - (port-success-k - (lambda (acc new-start new-end) - (loop acc new-start new-end)) - acc start end mstart mend) - (loop (cons (success-choose start mstart mend) acc) - mend end))))))))))))])) + ;; String/port match, get positions + (let* ([match (if peek? + regexp-match-peek-positions + regexp-match-positions)] + [m (match rx string start end)]) + (if (not m) + (failure-k acc start end) + (let* ([mstart (caar m)] + [mend (cdar m)] + ;; re-match if we get a zero-length match at the + ;; beginning, and we can continue + [m (if (and (= mstart mend start) + (cond + [end (< start end)] + [len (< start len)] + [(input-port? string) + (not (eof-object? (peek-byte string)))] + [else (error "internal error (str)")])) + (if (or peek? (not (input-port? string))) + (match rx string (add1 start) end) + ;; rematching on a port requires adding `start' + ;; offsets + (let ([m (match rx string 1 end)]) + (if (and m (positive? start)) + (list (cons (+ start (caar m)) + (+ start (cdar m)))) + m))) + m)]) + ;; fail if rematch failed + (if (not m) + (failure-k acc start end) + (let ([mstart (caar m)] + [mend (cdar m)]) + ;; or if we have a zero-length match at the end + (if (and (= mstart mend) + (cond [end (= mend end)] + [len (= mend len)] + [(input-port? string) + (eof-object? + (peek-byte string (if peek? mend 0)))] + [else (error "internal error (str)")])) + (failure-k acc start end) + (if port-success-k + (port-success-k + (lambda (acc new-start new-end) + (loop acc new-start new-end)) + acc start end mstart mend) + (loop (cons (success-choose start mstart mend) acc) + mend end))))))))))))) ;; Returns all the positions at which the pattern matched. (define (regexp-match-positions* pattern string [start 0] [end #f]) - (define rx (bstring->regexp 'regexp-match-positions* pattern)) - (regexp-loop regexp-match-positions* loop start end rx string + (regexp-loop + regexp-match-positions* loop start end + (bstring->regexp 'regexp-match-positions* pattern) string ;; success-choose: (lambda (start mstart mend) (cons mstart mend)) ;; failure-k: @@ -262,8 +283,9 @@ ;; Returns all the positions at which the pattern matched. (define (regexp-match-peek-positions* pattern string [start 0] [end #f]) - (define rx (bstring->regexp 'regexp-match-peek-positions* pattern)) - (regexp-loop regexp-match-peek-positions* loop start end rx string + (regexp-loop + regexp-match-peek-positions* loop start end + (bstring->regexp 'regexp-match-peek-positions* pattern) string ;; success-choose: (lambda (start mstart mend) (cons mstart mend)) ;; failure-k: