improved glob->regexp + tests

svn: r4933

original commit: 6fbc2dc27f888bdb223aa028e927657d2fccb2d5
This commit is contained in:
Eli Barzilay 2006-11-23 09:53:58 +00:00
parent 7ae43e2d72
commit 28a09920e8

View File

@ -147,8 +147,8 @@
(opt-lambda (s [case-sens? #t]) (opt-lambda (s [case-sens? #t])
(let* ([b? (cond [(bytes? s) #t] (let* ([b? (cond [(bytes? s) #t]
[(string? s) #f] [(string? s) #f]
(raise-type-error 'regexp-quote [else (raise-type-error 'regexp-quote
"string or byte string" s))] "string or byte string" s)])]
[s (if b? [s (if b?
(regexp-replace* regexp-quote-chars:b s #"\\\\&") (regexp-replace* regexp-quote-chars:b s #"\\\\&")
(regexp-replace* regexp-quote-chars:s s "\\\\&"))]) (regexp-replace* regexp-quote-chars:s s "\\\\&"))])
@ -159,8 +159,8 @@
(define (regexp-replace-quote s) (define (regexp-replace-quote s)
(let ([b? (cond [(bytes? s) #t] (let ([b? (cond [(bytes? s) #t]
[(string? s) #f] [(string? s) #f]
(raise-type-error 'regexp-replace-quote [else (raise-type-error 'regexp-replace-quote
"string or byte string" s))]) "string or byte string" s)])])
(if b? (if b?
(regexp-replace* #rx#"[&\\]" s #"\\\\&") (regexp-replace* #rx#"[&\\]" s #"\\\\&")
(regexp-replace* #rx"[&\\]" s "\\\\&")))) (regexp-replace* #rx"[&\\]" s "\\\\&"))))
@ -418,132 +418,53 @@
(= (cdar m) (string-length s))))))) (= (cdar m) (string-length s)))))))
(define glob->regexp (define glob->regexp
(let* ([def-case-sens (not (memq (system-type) '(windows macos macosx)))] (let-values
[quoted-chars (string->list ".|+^$()[]")] ([(def-case-sens) (not (memq (system-type) '(windows macos macosx)))]
[error [(item:s item:b)
(lambda xs (apply error 'glob->regexp xs))] (let ([rx (string-append
[->case-insens "(?:"
(lambda (c) "[\\]." ; escaped item
(let ([c1 (char-upcase c)] [c2 (char-downcase c)]) "|"
(if (char=? c1 c2) c (list #\[ c1 c2 #\]))))] "[*?]" ; wildcards -- the only 1-character match
[no-dot* (string->list "(?:[^.].*|)")] "|"
[no-dot+ (string->list "(?:[^.].*)")] "\\[(?:\\^?\\]|\\^?[^]^])[^]]*\\]" ; range
[no-dot1 (string->list "[^.]")] ")"
[ndot (char->integer #\.)] )])
[dot+1 (integer->char (add1 ndot))] (values (regexp rx) (byte-regexp (string->bytes/utf-8 rx))))])
;; don't use dot-1, because it's "-"
[dot-2 (integer->char (- ndot 2))])
(define (range:add-dot l) ; add it to the end, before a possible last "-"
(if (member l '(() (#\-)))
(cons #\. l)
(cons (car l) (range:add-dot (cdr l)))))
(define (range:del-dot l) ; remove "."s, take care of ranges too
;; problem: when we remove a "." from the middle of a range, we need to
;; add "-" at the end
(let loop ([l l] [-? #f])
(cond [(and -? (null? l)) '(#\-)] ; add a "-"
[(and -? (null? (cdr l)))
(if (char=? #\- (car l)) l (list (car l) #\-))]
[(null? l) '()]
[(and (pair? l) (pair? (cdr l)) (pair? (cddr l))
(char=? #\- (cadr l)))
(let* ([lo (car l)] [nlo (char->integer lo)]
[hi (caddr l)] [nhi (char->integer hi)]
[rest (lambda (-?) (loop (cdddr l) -?))])
(cond [(= nlo ndot nhi) (rest -?)]
[(= nlo ndot) (list* dot+1 #\- hi (rest -?))]
[(= nhi ndot) (list* lo #\- dot-2 (rest #t))]
[(<= nlo ndot nhi)
(list* lo #\- dot-2 dot+1 #\- hi (rest #t))]
[else (list* lo #\- hi (rest -?))]))]
[(char=? #\. (car l)) (loop (cdr l) -?)]
[else (cons (car l) (loop (cdr l) -?))])))
(define (hide-dots l)
(let* ([1st (and (pair? l) (car l))]
[2nd (and 1st (pair? (cdr l)) (cadr l))])
(cond [(or (not 1st) (char? 1st)) l]
;; "?---" --> "[^.]---"
[(eq? 'char 1st) (cons no-dot1 (cdr l))]
[(eq? 'any 1st)
(cond ;; "*" --> "(?:[^.].*)"
[(not 2nd) (list no-dot+)]
;; "*.---" --> "(?:[^.].*).---"
[(or (eq? #\. 2nd) (equal? '(quote #\.) 2nd))
(cons no-dot+ (cdr l))]
;; "*---" --> "(?:[^.].*|)---"
[(char? 2nd) (cons no-dot* (cdr l))]
;; "*?---" --> "?*---"
[(eq? 'char 2nd) (hide-dots (list* 'char 'any (cddr l)))]
;; "**---" --> "*---"
[(eq? 'any 2nd) (hide-dots (cons 'any (cddr l)))]
;; same as non-.-char
[(eq? 'quote (car 2nd)) (cons no-dot* (cdr l))]
;; "*[---]---" --> "(?:[^.].*|)[--*--]---"
[(eq? 'range (car 2nd))
(list* no-dot*
(list* (car 2nd) (cadr 2nd)
((if (cadr 2nd)
range:add-dot range:del-dot)
(cddr 2nd)))
(cddr l))])]
[(eq? 'quote (caar l)) l]
[(eq? 'range (caar l))
(list* (list* (car 1st) (cadr 1st)
((if (cadr 1st) range:add-dot range:del-dot)
(cddr 1st)))
(cdr l))]
[else (error "internal error")])))
(opt-lambda (glob [hide-dots? #t] [case-sens? def-case-sens]) (opt-lambda (glob [hide-dots? #t] [case-sens? def-case-sens])
(let ([len (string-length glob)] (let*-values ([(b?) (cond [(bytes? glob) #t]
[->case (if case-sens? list ->case-insens)]) [(string? glob) #f]
(let loop ([r '()] [i 0]) [else (raise-type-error
(if (< i len) 'glob->regexp
(let ([c (string-ref glob i)]) "string or byte string" glob)])]
(case c [(app sub ref rx item star any one)
[(#\\) (if b?
(let ([i (add1 i)]) (values bytes-append subbytes bytes-ref byte-regexp
(if (< i len) item:b (char->integer #\*) #".*" #".")
(loop (cons `(quote ,(string-ref glob i)) r) (add1 i)) (values string-append substring string-ref regexp
(error "glob ends in backslash: ~e" glob)))] item:s #\* ".*" "."))]
[(#\*) (loop (cons 'any r) (add1 i))] [(pfx sfx) (if case-sens?
[(#\?) (loop (cons 'char r) (add1 i))] (if b? (values #"^" #"^")
[(#\[) (values "^" "$"))
(let* ([i (add1 i)] (if b? (values #"^(?i:" #")$")
[not? (and (< i len) (values "^(?i:" ")$")))]
(char=? #\^ (string-ref glob i)))]) [(pfx) (if hide-dots?
(let loop2 ([chars '()] [i (if not? (add1 i) i)]) (app pfx (if b? #"(?![.])" "(?![.])"))
(if (= i len) pfx)])
(error "unterminated range in glob: ~e" glob) (let loop ([i 0] [ps (regexp-match-positions* item glob)] [r '()])
(let ([c (string-ref glob i)]) (cond [(null? ps)
(if (and (char=? #\] c) (pair? chars)) (rx (app pfx
(loop (cons `(range ,not? ,@(reverse! chars)) r) (apply app (reverse! (cons (sub glob i) r)))
(add1 i)) sfx))]
(loop2 (cons c chars) (add1 i)))))))] [(= 1 (- (cdar ps) (caar ps)))
[else (loop (cons c r) (add1 i))])) (loop (cdar ps)
(let* ([r (reverse! r)] (cdr ps)
[r `((#\^) ,@(if hide-dots? (hide-dots r) r) (#\$))]) (cons (if (equal? star (ref glob (caar ps))) any one)
(regexp (if (= i (caar ps))
(list->string r
(apply append (cons (sub glob i (caar ps)) r))))]
(map (lambda (c) [else (loop (cdar ps)
(cond [(char? c) (cdr ps)
(if (memq c quoted-chars) (cons (sub glob i (cdar ps)) r))]))))))
;; doing this only for single chars,
;; which means that backslash-quoted
;; chars are left alone
`(#\\ ,c) (->case c))]
[(eq? 'any c) '(#\. #\*)]
[(eq? 'char c) '(#\.)]
;; results of dot tweaking: no alphabetics
[(char? (car c)) c]
;; note: no ->case here
[(eq? 'quote (car c)) `(#\\ ,(cadr c))]
[(eq? 'range (car c))
(append '(#\[)
(if (cadr c) '(#\^) '())
(cddr c)
'(#\]))]
[else (error "internal error")]))
r)))))))))))
) )