[regexp] follow spec for starred groups and alternates
This commit is contained in:
parent
4faad17905
commit
5787e27d60
|
@ -265,4 +265,65 @@
|
||||||
(regexp-match: #rx"hello" #"world")
|
(regexp-match: #rx"hello" #"world")
|
||||||
;; Would be a type error if we annotated wrong
|
;; Would be a type error if we annotated wrong
|
||||||
#f)
|
#f)
|
||||||
|
|
||||||
|
;; -- starred group => 1 group
|
||||||
|
(check-equal?
|
||||||
|
(ann
|
||||||
|
(regexp-match: "(poo )*" "poo poo platter")
|
||||||
|
(U #f (List String (U #f String))))
|
||||||
|
'("poo poo " "poo "))
|
||||||
|
|
||||||
|
(check-equal?
|
||||||
|
(ann
|
||||||
|
(regexp-match: "([a-z ]+;)*" "lather; rinse; repeat;")
|
||||||
|
(U #f (List String (U #f String))))
|
||||||
|
'("lather; rinse; repeat;" " repeat;"))
|
||||||
|
|
||||||
|
;; -- ? = 1 group
|
||||||
|
(let-regexp: ([date-re (pregexp: "([a-z]+) +([0-9]+,)? *([0-9]+)")])
|
||||||
|
(check-equal?
|
||||||
|
(ann
|
||||||
|
(regexp-match: date-re "jan 1, 1970")
|
||||||
|
(U #f (List String String (U #f String) String)))
|
||||||
|
'("jan 1, 1970" "jan" "1," "1970"))
|
||||||
|
|
||||||
|
(check-equal?
|
||||||
|
(ann
|
||||||
|
(regexp-match: date-re "jan 1970")
|
||||||
|
(U #f (List String String (U #f String) String)))
|
||||||
|
'("jan 1970" "jan" #f "1970"))
|
||||||
|
)
|
||||||
|
|
||||||
|
;; -- (? = 0 groups
|
||||||
|
(check-equal?
|
||||||
|
(ann
|
||||||
|
(regexp-match: "^(?:[a-z]*/)*([a-z]+)$" "/usr/local/bin/mzscheme")
|
||||||
|
(U #f (List String String)))
|
||||||
|
'("/usr/local/bin/mzscheme" "mzscheme"))
|
||||||
|
|
||||||
|
(check-equal?
|
||||||
|
(ann
|
||||||
|
(regexp-match: #px"(?i:AloT)" "alot")
|
||||||
|
(U #f (List String)))
|
||||||
|
'("alot"))
|
||||||
|
|
||||||
|
;; -- pipes = take min groups
|
||||||
|
(check-equal?
|
||||||
|
(ann
|
||||||
|
(regexp-match: "^(a*)|(b*)$" "aaa")
|
||||||
|
(U #f (List String (U #f String) (U #f String))))
|
||||||
|
'("aaa" "aaa" #f))
|
||||||
|
|
||||||
|
(check-equal?
|
||||||
|
(ann
|
||||||
|
(regexp-match: "^(aa*)(c*)|(b*)$" "b")
|
||||||
|
(U #f (List String (U #f String) (U #f String) (U #f String))))
|
||||||
|
'("b" #f #f "b"))
|
||||||
|
|
||||||
|
;; -- nested gropus
|
||||||
|
(check-equal?
|
||||||
|
(ann
|
||||||
|
(regexp-match: "((a)b)" "abc")
|
||||||
|
(U #f (List String String String)))
|
||||||
|
'("ab" "ab" "a"))
|
||||||
)
|
)
|
||||||
|
|
|
@ -32,6 +32,7 @@
|
||||||
(for-syntax
|
(for-syntax
|
||||||
(only-in racket/syntax format-id)
|
(only-in racket/syntax format-id)
|
||||||
typed/racket/base
|
typed/racket/base
|
||||||
|
(only-in racket/list range)
|
||||||
(only-in racket/format ~a)
|
(only-in racket/format ~a)
|
||||||
syntax/parse
|
syntax/parse
|
||||||
trivial/private/common))
|
trivial/private/common))
|
||||||
|
@ -59,69 +60,86 @@
|
||||||
[(byte-pregexp? v) (parse-groups/byte-pregexp v #:src v-stx)]
|
[(byte-pregexp? v) (parse-groups/byte-pregexp v #:src v-stx)]
|
||||||
[else #f]))
|
[else #f]))
|
||||||
|
|
||||||
|
;; Handle pipes
|
||||||
|
;; If there is a pipe, everything is nullable, but we know the number of groups
|
||||||
|
(define (parse-groups/untyped str #:src stx)
|
||||||
|
(define alt* (string->alt* str))
|
||||||
|
(cond
|
||||||
|
[(null? alt*)
|
||||||
|
(list 0 '())]
|
||||||
|
[(null? (cdr alt*))
|
||||||
|
(parse-groups-for-alt (car alt*) #:src stx)]
|
||||||
|
[else
|
||||||
|
(define num-groups
|
||||||
|
(for/fold ([num-groups 0])
|
||||||
|
([alt (in-list alt*)])
|
||||||
|
(define ng+null* (parse-groups-for-alt alt #:src stx))
|
||||||
|
(+ num-groups (car ng+null*))))
|
||||||
|
(list num-groups (range num-groups))]))
|
||||||
|
|
||||||
;; Count the number of matched parentheses in a regexp pattern.
|
;; Count the number of matched parentheses in a regexp pattern.
|
||||||
;; Raise an exception if there are unmatched parens.
|
;; Raise an exception if there are unmatched parens.
|
||||||
(define (parse-groups/untyped str #:src stx)
|
(define (parse-groups-for-alt str #:src stx)
|
||||||
(define last-index (- (string-length str) 1))
|
(define last-index (- (string-length str) 1))
|
||||||
(define ignore? (box #f))
|
(define in-square? (box #f))
|
||||||
(let loop ([i 0] [in-paren '()] [num-groups 0])
|
(let loop ([i 0] [in-paren '()] [num-groups 0] [null-idx* '()])
|
||||||
(if (> i last-index)
|
(if (> i last-index)
|
||||||
(cond
|
(cond
|
||||||
[(not (null? in-paren))
|
[(not (null? in-paren))
|
||||||
(group-error str (format "'(' at index ~a" (car in-paren)))]
|
(group-error str (format "'(' at index ~a" (car in-paren)))]
|
||||||
[(unbox ignore?)
|
[(unbox in-square?)
|
||||||
(group-error str (format "'[' at index ~a" (car in-paren)))]
|
(group-error str (format "'[' at index ~a" (car in-paren)))]
|
||||||
[else
|
[else
|
||||||
num-groups])
|
(list num-groups null-idx*)])
|
||||||
(if (unbox ignore?)
|
(if (unbox in-square?)
|
||||||
(if (eq? #\] (string-ref str i))
|
(if (eq? #\] (string-ref str i))
|
||||||
(begin (set-box! ignore? #f)
|
(begin (set-box! in-square? #f)
|
||||||
(loop (+ i 1) (cdr in-paren) num-groups))
|
(loop (+ i 1) (cdr in-paren) num-groups null-idx*))
|
||||||
(loop (+ i 1) in-paren num-groups))
|
(loop (+ i 1) in-paren num-groups null-idx*))
|
||||||
(case (string-ref str i)
|
(case (string-ref str i)
|
||||||
[(#\[)
|
[(#\[)
|
||||||
;; Ignore things between [ ... ]
|
;; Ignore things between [ ... ]
|
||||||
(set-box! ignore? #t)
|
(set-box! in-square? #t)
|
||||||
(loop (+ i 1) (cons i in-paren) num-groups)]
|
(loop (+ i 1) (cons i in-paren) num-groups null-idx*)]
|
||||||
[(#\()
|
[(#\()
|
||||||
;; Watch for (? patterns
|
;; Watch for (? patterns
|
||||||
(if (and (< i last-index)
|
(if (and (< i last-index)
|
||||||
(eq? #\? (string-ref str (+ i 1))))
|
(eq? #\? (string-ref str (+ i 1))))
|
||||||
(loop (+ i 2) (cons #f in-paren) num-groups)
|
(loop (+ i 2) (cons #f in-paren) num-groups null-idx*)
|
||||||
(loop (+ i 1) (cons i in-paren) num-groups))]
|
(loop (+ i 1) (cons i in-paren) num-groups null-idx*))]
|
||||||
[(#\))
|
[(#\))
|
||||||
(cond
|
(cond
|
||||||
[(and (< i last-index)
|
|
||||||
(or
|
|
||||||
(eq? #\? (string-ref str (+ i 1)))
|
|
||||||
(eq? #\* (string-ref str (+ i 1)))))
|
|
||||||
;; TODO starred group = may be #f
|
|
||||||
#f]
|
|
||||||
[(null? in-paren)
|
[(null? in-paren)
|
||||||
(group-error str (format "')' at index ~a" i))]
|
(group-error str (format "')' at index ~a" i))]
|
||||||
[(eq? #f (car in-paren))
|
[(eq? #f (car in-paren))
|
||||||
;; Matched closing paren, but does not count as a group
|
;; Matched closing paren, but does not count as a group
|
||||||
(loop (+ i 1) (cdr in-paren) num-groups)]
|
(loop (+ i 1) (cdr in-paren) num-groups null-idx*)]
|
||||||
|
[(and (< i last-index)
|
||||||
|
(or
|
||||||
|
(eq? #\? (string-ref str (+ i 1)))
|
||||||
|
(eq? #\* (string-ref str (+ i 1)))))
|
||||||
|
;; group = may be #f
|
||||||
|
(loop (+ i 1) (cdr in-paren) (+ 1 num-groups) (cons num-groups null-idx*))]
|
||||||
[else
|
[else
|
||||||
(loop (+ i 1) (cdr in-paren) (+ 1 num-groups))])]
|
(loop (+ i 1) (cdr in-paren) (+ 1 num-groups) null-idx*)])]
|
||||||
[(#\\)
|
[(#\\)
|
||||||
(if (and (< i last-index)
|
(if (and (< i last-index)
|
||||||
(eq? #\\ (string-ref str (+ i 1))))
|
(eq? #\\ (string-ref str (+ i 1))))
|
||||||
(loop (+ i 3) in-paren num-groups)
|
(loop (+ i 3) in-paren num-groups null-idx*)
|
||||||
(loop (+ i 2) in-paren num-groups))]
|
(loop (+ i 2) in-paren num-groups null-idx*))]
|
||||||
[(#\|)
|
[(#\|)
|
||||||
;; Nope! Can't handle pipes
|
;; Nope! Can't handle pipes
|
||||||
#f]
|
(error 'internal-error "Found '|' character in regexp string.")]
|
||||||
[else
|
[else
|
||||||
(loop (+ i 1) in-paren num-groups)])))))
|
(loop (+ i 1) in-paren num-groups null-idx*)])))))
|
||||||
|
|
||||||
(define (parse-groups/string str #:src stx)
|
(define (parse-groups/string str #:src stx)
|
||||||
(let ([ng (parse-groups/untyped str #:src stx)])
|
(let ([ng (parse-groups/untyped str #:src stx)])
|
||||||
(and ng (cons ng 'String))))
|
(and ng (cons 'String ng))))
|
||||||
|
|
||||||
(define (parse-groups/bytes b #:src stx)
|
(define (parse-groups/bytes b #:src stx)
|
||||||
(let ([ng (parse-groups/untyped (~a b) #:src stx)])
|
(let ([ng (parse-groups/untyped (~a b) #:src stx)])
|
||||||
(and ng (cons ng 'Bytes))))
|
(and ng (cons 'Bytes ng))))
|
||||||
|
|
||||||
(define (parse-groups/regexp rx #:src stx)
|
(define (parse-groups/regexp rx #:src stx)
|
||||||
(parse-groups/string (~a rx) #:src stx))
|
(parse-groups/string (~a rx) #:src stx))
|
||||||
|
@ -144,6 +162,40 @@
|
||||||
;; --- Other helpers
|
;; --- Other helpers
|
||||||
|
|
||||||
(begin-for-syntax
|
(begin-for-syntax
|
||||||
|
|
||||||
|
;; Divide string into |-separated substrings (regex alternates)
|
||||||
|
;; Be wary of escaped | characters.
|
||||||
|
(define (string->alt* str)
|
||||||
|
(define L (string-length str))
|
||||||
|
(let loop ([prev-i 0] [i 0])
|
||||||
|
(cond
|
||||||
|
[(= i L)
|
||||||
|
;; End of string, return last alternate
|
||||||
|
(list (substring str prev-i i))]
|
||||||
|
[(and (eq? (string-ref str i) #\|)
|
||||||
|
(< 1 i)
|
||||||
|
(not (and (eq? (string-ref str (- i 1)) #\\)
|
||||||
|
(eq? (string-ref str (- i 2)) #\\))))
|
||||||
|
;; Found a pipe, save current alternate
|
||||||
|
(cons (substring str prev-i i)
|
||||||
|
(loop (+ i 1) (+ i 1)))]
|
||||||
|
[else
|
||||||
|
;; Nothing interesting, continue building alternate
|
||||||
|
(loop prev-i (+ i 1))])))
|
||||||
|
|
||||||
|
(define (intlist-union i* j*)
|
||||||
|
(cond
|
||||||
|
[(null? i*)
|
||||||
|
j*]
|
||||||
|
[(null? j*)
|
||||||
|
i*]
|
||||||
|
[(< (car i*) (car j*))
|
||||||
|
(cons (car i*) (intlist-union (cdr i*) j*))]
|
||||||
|
[(> (car i*) (car j*))
|
||||||
|
(cons (car j*) (intlist-union i* (cdr j*)))]
|
||||||
|
[else
|
||||||
|
(cons (car i*) (intlist-union (cdr i*) (cdr j*)))]))
|
||||||
|
|
||||||
(define (infer-return-type pattern-sym arg-stx)
|
(define (infer-return-type pattern-sym arg-stx)
|
||||||
(if (and
|
(if (and
|
||||||
(or (eq? pattern-sym 'String)
|
(or (eq? pattern-sym 'String)
|
||||||
|
@ -181,20 +233,19 @@
|
||||||
(define-syntax regexp-match: (make-alias #'regexp-match
|
(define-syntax regexp-match: (make-alias #'regexp-match
|
||||||
(lambda (stx) (syntax-parse stx
|
(lambda (stx) (syntax-parse stx
|
||||||
[(_ pat:pattern/groups arg* ...)
|
[(_ pat:pattern/groups arg* ...)
|
||||||
#:with (num-groups . type-sym) (syntax/loc stx pat.evidence)
|
#:with (type-sym num-groups null-idx*) (syntax/loc stx pat.evidence)
|
||||||
;; TODO keep source location in type-sym, stop using format-id
|
;; TODO keep source location in type-sym, stop using format-id
|
||||||
;; (Is it really that bad?)
|
;; (Is it really that bad?)
|
||||||
#:with return-type (format-id stx "~a" (infer-return-type (syntax-e #'type-sym)
|
#:with return-type (format-id stx "~a" (infer-return-type (syntax-e #'type-sym)
|
||||||
#'(arg* ...)))
|
#'(arg* ...)))
|
||||||
#:with (index* ...) (for/list ([i (in-range (syntax-e #'num-groups))]) i)
|
#:with (group-type* ...) (let ([null* (syntax->datum #'null-idx*)])
|
||||||
|
(for/list ([i (in-range (syntax-e #'num-groups))])
|
||||||
|
(if (memv i null*)
|
||||||
|
(syntax/loc stx (U #f return-type))
|
||||||
|
(syntax/loc stx return-type))))
|
||||||
(syntax/loc stx
|
(syntax/loc stx
|
||||||
(let ([maybe-match (regexp-match pat.expanded arg* ...)])
|
(let ([maybe-match (regexp-match pat.expanded arg* ...)])
|
||||||
(if maybe-match
|
(if maybe-match
|
||||||
(let ([m : (Listof (Option return-type)) maybe-match])
|
(cast maybe-match (List return-type group-type* ...))
|
||||||
(list (car maybe-match)
|
|
||||||
(begin (set! m (cdr m))
|
|
||||||
(or (car m) (error 'regexp-match: (format "Internal error at result index ~a, try using Racket's regexp-match" 'index*))))
|
|
||||||
...))
|
|
||||||
#f)))]
|
#f)))]
|
||||||
[_ #f]))))
|
[_ #f]))))
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user