From af91af353416ac0dd0a1e7b2105d500d2d3de5fe Mon Sep 17 00:00:00 2001 From: Eli Barzilay Date: Wed, 31 May 2006 18:41:18 +0000 Subject: [PATCH] rewrite glob->regexp to deal with char ranges properly svn: r3151 original commit: 8607cacc089786209ce2f7214b1b80294205fcef --- collects/mzlib/string.ss | 179 +++++++++++++++++++++++++++------------ 1 file changed, 126 insertions(+), 53 deletions(-) diff --git a/collects/mzlib/string.ss b/collects/mzlib/string.ss index 1d9db01..35e71e1 100644 --- a/collects/mzlib/string.ss +++ b/collects/mzlib/string.ss @@ -405,60 +405,133 @@ (string-utf-8-length s))) (= (cdar m) (string-length s))))))) - (define default-glob-case-sens - (not (memq (system-type) '(windows macos macosx)))) (define glob->regexp - (opt-lambda (glob [case-sens? default-glob-case-sens]) - (define len (string-length glob)) - (define range #f) - (define init-*? #f) - (define case-sens - (if case-sens? - values - (lambda (c) - (if (char? c) ; do this only for char inputs -- see below + (let* ([def-case-sens (not (memq (system-type) '(windows macos macosx)))] + [quoted-chars (string->list ".|+^$()[]")] + [error + (lambda xs (apply error 'glob->regexp xs))] + [->case-insens + (lambda (c) (let ([c1 (char-upcase c)] [c2 (char-downcase c)]) - (if (char=? c1 c2) c (list #\[ c1 c2 #\]))) - c)))) - (let loop ([res '()] [i 0]) - (define (next x) (loop (cons x res) (add1 i))) - (if (= i len) - (begin - (when range - (error 'glob->regexp "unterminated range in glob: ~e" glob)) - (let loop ([left res] [res (if init-*? '() '(#\$))]) - (if (null? left) - (regexp (list->string - (cons #\^ (if init-*? - `(#\( #\? #\: #\[ #\^ #\. #\] #\. #\* - ,@res #\| ,@res #\) #\$) - res)))) - ;; doing this only for single chars, which means that - ;; backslash-quoted char is left alone - (let ([c (case-sens (car left))]) - (loop (cdr left) - (if (char? c) - (cons (case-sens c) res) - (append c res))))))) - (let ([c (string-ref glob i)]) - (if range - (begin (set! range - (case range - [(0) (case c ((#\^) 1) (else 2))] - [(1) 2] - [else (case c ((#\]) #f) (else 2))])) - (next c)) - (case c - [(#\\) (set! i (add1 i)) - (if (< i len) - (next (list #\\ (string-ref glob i))) - (error 'glob->regexp "glob ends in backslash: ~e" glob))] - [(#\*) (if (eq? 0 i) - (begin (set! init-*? #t) (next '())) - (next '(#\. #\*)))] - [(#\?) (next (if (eq? 0 i) '(#\[ #\^ #\. #\]) #\.))] - [(#\[) (set! range 0) (next #\[)] - [(#\. #\| #\+ #\^ #\$ #\( #\) #\]) (next (list #\\ c))] - [else (next c)]))))))) + (if (char=? c1 c2) c (list #\[ c1 c2 #\]))))] + [no-dot* (string->list "(?:[^.].*|)")] + [no-dot+ (string->list "(?:[^.].*)")] + [no-dot1 (string->list "[^.]")] + [ndot (char->integer #\.)] + [dot+1 (integer->char (add1 ndot))] + ;; don't use dot-1, because it's "-" + [dot-2 (integer->char (- ndot 2))]) + (define (range:add-dot l) ; add it to the end, before a possible last "-" + (if (member l '(() (#\-))) + (cons #\. l) + (cons (car l) (range:add-dot (cdr l))))) + (define (range:del-dot l) ; remove "."s, take care of ranges too + ;; problem: when we remove a "." from the middle of a range, we need to + ;; add "-" at the end + (let loop ([l l] [-? #f]) + (cond [(and -? (null? l)) '(#\-)] ; add a "-" + [(and -? (null? (cdr l))) + (if (char=? #\- (car l)) l (list (car l) #\-))] + [(null? l) '()] + [(and (pair? l) (pair? (cdr l)) (pair? (cddr l)) + (char=? #\- (cadr l))) + (let* ([lo (car l)] [nlo (char->integer lo)] + [hi (caddr l)] [nhi (char->integer hi)] + [rest (lambda (-?) (loop (cdddr l) -?))]) + (cond [(= nlo ndot nhi) (rest -?)] + [(= nlo ndot) (list* dot+1 #\- hi (rest -?))] + [(= nhi ndot) (list* lo #\- dot-2 (rest #t))] + [(<= nlo ndot nhi) + (list* lo #\- dot-2 dot+1 #\- hi (rest #t))] + [else (list* lo #\- hi (rest -?))]))] + [(char=? #\. (car l)) (loop (cdr l) -?)] + [else (cons (car l) (loop (cdr l) -?))]))) + (define (hide-dots l) + (let* ([1st (and (pair? l) (car l))] + [2nd (and 1st (pair? (cdr l)) (cadr l))]) + (cond [(or (not 1st) (char? 1st)) l] + ;; "?---" --> "[^.]---" + [(eq? 'char 1st) (cons no-dot1 (cdr l))] + [(eq? 'any 1st) + (cond ;; "*" --> "(?:[^.].*)" + [(not 2nd) no-dot+] + ;; "*.---" --> "(?:[^.].*).---" + [(or (eq? #\. 2nd) (equal? '(quote #\.) 2nd)) + (cons no-dot+ (cdr l))] + ;; "*---" --> "(?:[^.].*|)---" + [(char? 2nd) (cons no-dot* (cdr l))] + ;; "*?---" --> "?*---" + [(eq? 'char 2nd) (hide-dots (list* 'char 'any (cddr l)))] + ;; "**---" --> "*---" + [(eq? 'any 2nd) (hide-dots (cons 'any (cddr l)))] + ;; same as non-.-char + [(eq? 'quote (car 2nd)) (cons no-dot* (cdr l))] + ;; "*[---]---" --> "(?:[^.].*|)[--*--]---" + [(eq? 'range (car 2nd)) + (list* no-dot* + (list* (car 2nd) (cadr 2nd) + ((if (cadr 2nd) + range:add-dot range:del-dot) + (cddr 2nd))) + (cddr l))])] + [(eq? 'quote (caar l)) l] + [(eq? 'range (caar l)) + (list* (list* (car 1st) (cadr 1st) + ((if (cadr 1st) range:add-dot range:del-dot) + (cddr 1st))) + (cdr l))] + [else (error "internal error")]))) + (opt-lambda (glob [hide-dots? #t] [case-sens? def-case-sens]) + (let ([len (string-length glob)] + [->case (if case-sens? values ->case-insens)]) + (let loop ([r '()] [i 0]) + (if (< i len) + (let ([c (string-ref glob i)]) + (case c + [(#\\) + (let ([i (add1 i)]) + (if (< i len) + (loop (cons `(quote ,(string-ref glob i)) r) (add1 i)) + (error "glob ends in backslash: ~e" glob)))] + [(#\*) (loop (cons 'any r) (add1 i))] + [(#\?) (loop (cons 'char r) (add1 i))] + [(#\[) + (let* ([i (add1 i)] + [not? (and (< i len) + (char=? #\^ (string-ref glob i)))]) + (let loop2 ([chars '()] [i (if not? (add1 i) i)]) + (if (= i len) + (error "unterminated range in glob: ~e" glob) + (let ([c (string-ref glob i)]) + (if (and (char=? #\] c) (pair? chars)) + (loop (cons `(range ,not? ,@(reverse! chars)) r) + (add1 i)) + (loop2 (cons c chars) (add1 i)))))))] + [else (loop (cons c r) (add1 i))])) + (let* ([r (reverse! r)] + [r (if hide-dots? (hide-dots r) r)]) + (regexp + (list->string + (apply append + (map (lambda (c) + (cond [(char? c) + (if (memq c quoted-chars) + ;; doing this only for single chars, + ;; which means that backslash-quoted + ;; chars are left alone + `(#\\ ,c) (list (->case c)))] + [(eq? 'any c) '(#\. #\*)] + [(eq? 'char c) '(#\.)] + ;; results of dot tweaking: no alphabetics + [(char? (car c)) c] + ;; note: no ->case here + [(eq? 'quote (car c)) `(#\\ ,(cadr c))] + [(eq? 'range (car c)) + (append '(#\[) + (if (cadr c) '(#\^) '()) + (cddr c) + '(#\]))] + [else (error "internal error")])) + r))))))))))) )