[private] much better regexp for pipes
This commit is contained in:
parent
5d04125c55
commit
fb22461c75
|
@ -10,9 +10,20 @@
|
|||
|
||||
;; -- regexp
|
||||
(let ()
|
||||
;; TODO (what groups does this return? re-read regexp spec)
|
||||
(check-true (and (regexp: "^(\r|\n|(\r\n))") #t)))
|
||||
|
||||
(let ([str "Pete would gain 4."])
|
||||
(check-equal?
|
||||
(ann (regexp-match: #px"^(.*?) would (gain|lose) (\\d+)\\.$" str)
|
||||
(U #f (List String String String String)))
|
||||
(list str "Pete" "gain" "4")))
|
||||
|
||||
(let ([str "| yo"]) ;; from John Clement's morse-code-trainer
|
||||
(check-equal?
|
||||
(ann (regexp-match: #px"hey [|] (yo)" str)
|
||||
(U #f (List String String)))
|
||||
#f))
|
||||
|
||||
;; -- regexp-match:
|
||||
(check-equal?
|
||||
(ann
|
||||
|
@ -248,7 +259,7 @@
|
|||
(U #f (List String String String)))
|
||||
'("ab" "ab" "a"))
|
||||
|
||||
;; --- Can't handle |, yet
|
||||
;; --- Can't handle |
|
||||
(check-equal?
|
||||
(ann
|
||||
(regexp-match: "this(group)|that" "that")
|
||||
|
@ -312,19 +323,19 @@
|
|||
(U #f (List String)))
|
||||
'("alot"))
|
||||
|
||||
;; -- pipes = take min groups
|
||||
;; 2016-06-08: currently disabled
|
||||
;(check-equal?
|
||||
; (ann
|
||||
; (regexp-match: "^(a*)|(b*)$" "aaa")
|
||||
; (U #f (List String (U #f String) (U #f String))))
|
||||
; '("aaa" "aaa" #f))
|
||||
; -- pipes = take min groups
|
||||
; 2016-06-08: currently disabled
|
||||
(check-equal?
|
||||
(ann
|
||||
(regexp-match: "^(a*)|(b*)$" "aaa")
|
||||
(U #f (List String (U #f String) (U #f String))))
|
||||
'("aaa" "aaa" #f))
|
||||
|
||||
;(check-equal?
|
||||
; (ann
|
||||
; (regexp-match: "^(aa*)(c*)|(b*)$" "b")
|
||||
; (U #f (List String (U #f String) (U #f String) (U #f String))))
|
||||
; '("b" #f #f "b"))
|
||||
(check-equal?
|
||||
(ann
|
||||
(regexp-match: "^(aa*)(c*)|(b*)$" "b")
|
||||
(U #f (List String (U #f String) (U #f String) (U #f String))))
|
||||
'("b" #f #f "b"))
|
||||
|
||||
;; -- nested gropus
|
||||
(check-equal?
|
||||
|
|
|
@ -98,8 +98,7 @@
|
|||
(syntax-parse stx #:literals (tr:#%plain-lambda)
|
||||
[(_ name:id v)
|
||||
#:with (tr:#%plain-lambda (_) v+)
|
||||
(parameterize ([*STOP-LIST* (cons #'name (*STOP-LIST*))])
|
||||
(expand-expr (syntax/loc stx (tr:lambda (name) v))))
|
||||
(expand-expr (syntax/loc stx (tr:lambda (name) v)))
|
||||
#:when (syntax-e (syntax/loc stx v+))
|
||||
#:with m (f-parse (syntax/loc stx v+))
|
||||
#:when (syntax-e (syntax/loc stx m))
|
||||
|
|
|
@ -34,6 +34,7 @@
|
|||
typed/racket/base
|
||||
(only-in racket/list range)
|
||||
(only-in racket/format ~a)
|
||||
(only-in racket/unsafe/ops unsafe-string-ref)
|
||||
syntax/parse
|
||||
trivial/private/common))
|
||||
|
||||
|
@ -51,6 +52,9 @@
|
|||
str))
|
||||
|
||||
;; Dispatch for counting groups
|
||||
;; On success, return (Pairof Type (Listof Boolean))
|
||||
;; - type is probably the return type of matches
|
||||
;; - booleans indicating "always succeeds" (#t) and "may fail" (#f)
|
||||
(define (parse-groups v-stx)
|
||||
(define v (quoted-stx-value? v-stx))
|
||||
(cond
|
||||
|
@ -62,81 +66,6 @@
|
|||
[(byte-pregexp? v) (parse-groups/byte-pregexp v #:src v-stx)]
|
||||
[else #f]))
|
||||
|
||||
;; Handle pipes
|
||||
;; If there is a pipe, everything is nullable, but we know the number of groups
|
||||
(define (parse-groups/untyped str #:src stx)
|
||||
(define alt* (string->alt* str))
|
||||
(cond
|
||||
[(null? alt*)
|
||||
(list 0 '())]
|
||||
[(null? (cdr alt*))
|
||||
(parse-groups-for-alt (car alt*) #:src stx)]
|
||||
[else
|
||||
;(define num-groups
|
||||
; (for/fold ([num-groups 0])
|
||||
; ([alt (in-list alt*)])
|
||||
; (define ng+null* (parse-groups-for-alt alt #:src stx))
|
||||
; (+ num-groups (car ng+null*))))
|
||||
(parse-groups-for-alt str #:src stx)
|
||||
#f
|
||||
#;(list num-groups (range num-groups))]))
|
||||
|
||||
;; Count the number of matched parentheses in a regexp pattern.
|
||||
;; Raise an exception if there are unmatched parens.
|
||||
(define (parse-groups-for-alt str #:src stx)
|
||||
(define last-index (- (string-length str) 1))
|
||||
(define in-square? (box #f))
|
||||
(let loop ([i 0] [in-paren '()] [num-groups 0] [null-idx* '()])
|
||||
(if (> i last-index)
|
||||
(cond
|
||||
[(not (null? in-paren))
|
||||
(group-error stx str (format "'(' at index ~a" (car in-paren)))]
|
||||
[(unbox in-square?)
|
||||
(group-error stx str (format "'[' at index ~a" (car in-paren)))]
|
||||
[else
|
||||
(list num-groups null-idx*)])
|
||||
(if (unbox in-square?)
|
||||
(if (eq? #\] (string-ref str i))
|
||||
(begin (set-box! in-square? #f)
|
||||
(loop (+ i 1) (cdr in-paren) num-groups null-idx*))
|
||||
(loop (+ i 1) in-paren num-groups null-idx*))
|
||||
(case (string-ref str i)
|
||||
[(#\[)
|
||||
;; Ignore things between [ ... ]
|
||||
(set-box! in-square? #t)
|
||||
(loop (+ i 1) (cons i in-paren) num-groups null-idx*)]
|
||||
[(#\()
|
||||
;; Watch for (? patterns
|
||||
(if (and (< i last-index)
|
||||
(eq? #\? (string-ref str (+ i 1))))
|
||||
(loop (+ i 2) (cons #f in-paren) num-groups null-idx*)
|
||||
(loop (+ i 1) (cons i in-paren) num-groups null-idx*))]
|
||||
[(#\))
|
||||
(cond
|
||||
[(null? in-paren)
|
||||
(group-error stx str (format "')' at index ~a" i))]
|
||||
[(eq? #f (car in-paren))
|
||||
;; Matched closing paren, but does not count as a group
|
||||
(loop (+ i 1) (cdr in-paren) num-groups null-idx*)]
|
||||
[(and (< i last-index)
|
||||
(or
|
||||
(eq? #\? (string-ref str (+ i 1)))
|
||||
(eq? #\* (string-ref str (+ i 1)))))
|
||||
;; group = may be #f
|
||||
(loop (+ i 1) (cdr in-paren) (+ 1 num-groups) (cons num-groups null-idx*))]
|
||||
[else
|
||||
(loop (+ i 1) (cdr in-paren) (+ 1 num-groups) null-idx*)])]
|
||||
[(#\\)
|
||||
(if (and (< i last-index)
|
||||
(eq? #\\ (string-ref str (+ i 1))))
|
||||
(loop (+ i 3) in-paren num-groups null-idx*)
|
||||
(loop (+ i 2) in-paren num-groups null-idx*))]
|
||||
;[(#\|)
|
||||
; ;; Nope! Can't handle pipes
|
||||
; (error 'internal-error "Found '|' character in regexp string.")]
|
||||
[else
|
||||
(loop (+ i 1) in-paren num-groups null-idx*)])))))
|
||||
|
||||
(define (parse-groups/string str #:src stx)
|
||||
(let ([ng (parse-groups/untyped str #:src stx)])
|
||||
(and ng (cons 'String ng))))
|
||||
|
@ -161,44 +90,118 @@
|
|||
(make-value-property 'rx:groups parse-groups))
|
||||
(define-syntax-class/predicate pattern/groups rx?)
|
||||
|
||||
)
|
||||
;; -----------------------------------------------------------------------------
|
||||
;; --- Other helpers
|
||||
;; (-> String #:src Syntax (Listof Boolean))
|
||||
(define (parse-groups/untyped str #:src stx)
|
||||
(define pos**
|
||||
(unescaped-pos* str '(#\[ #\] #\( #\) #\| #\?)))
|
||||
;; -- check that [] are matched
|
||||
(define brack-ivl*
|
||||
(let* ([l-brack-pos* (car pos**)]
|
||||
[r-brack-pos* (cadr pos**)]
|
||||
[r (pair-up l-brack-pos* r-brack-pos*)])
|
||||
;; ?? okay for brackets to nest?
|
||||
(if (list? r)
|
||||
r
|
||||
(let ([brack-char (if (memv r l-brack-pos*) "[" "]")])
|
||||
(group-error stx str (format "'~a' at index ~a" brack-char r))))))
|
||||
;; -- ignore characters between a pair of brackets
|
||||
(define-values (l-paren-pos* r-paren-pos* pipe-pos* ?-pos*)
|
||||
(apply values (for/list ([pos* (in-list (cddr pos**))])
|
||||
(ivl-remove* brack-ivl* pos*))))
|
||||
;; -- check that () are matched
|
||||
(define paren-ivl*
|
||||
(let ([r (pair-up l-paren-pos* r-paren-pos*)])
|
||||
(if (list? r)
|
||||
r
|
||||
(let ([paren-char (if (memv r l-paren-pos*) "(" ")")])
|
||||
(group-error stx str (format "'~a' at index ~a" paren-char r))))))
|
||||
;; -- groups = #parens.
|
||||
;; may fail to capture if has | outside (that are not nested in other parens)
|
||||
;; or ? after close
|
||||
(for/list ([ivl (in-list paren-ivl*)]
|
||||
#:when (not (has-?-before ivl ?-pos*)))
|
||||
(and
|
||||
(not (has-unguarded-pipe-before-or-after ivl paren-ivl* pipe-pos*))
|
||||
(not (has-?-after ivl ?-pos*)))))
|
||||
|
||||
(begin-for-syntax
|
||||
(define (has-?-before ivl ?-pos*)
|
||||
(define pos-before (+ 1 (car ivl))) ;; Well, just inside the paren.
|
||||
(for/or ([?pos (in-list ?-pos*)])
|
||||
(= pos-before ?pos)))
|
||||
|
||||
;; Divide string into |-separated substrings (regex alternates)
|
||||
;; Be wary of escaped | characters.
|
||||
(define (string->alt* str)
|
||||
(define (has-?-after ivl ?-pos*)
|
||||
(define pos-after (+ 1 (cdr ivl)))
|
||||
(for/or ([?pos (in-list ?-pos*)])
|
||||
(= pos-after ?pos)))
|
||||
|
||||
(define (has-unguarded-pipe-before-or-after ivl paren-ivl* pipe-pos*)
|
||||
(define other-paren-ivl*
|
||||
(for/list ([ivl2 (in-list paren-ivl*)]
|
||||
#:when (not (ivl< ivl ivl2)))
|
||||
ivl2))
|
||||
(define dangerous-pipe* (ivl-remove* other-paren-ivl* pipe-pos*))
|
||||
(not (null? dangerous-pipe*)))
|
||||
|
||||
;; Does not work for #\\ character
|
||||
(define (unescaped-pos* str c*)
|
||||
(define L (string-length str))
|
||||
(let loop ([prev-i 0] [i 0])
|
||||
(cond
|
||||
[(= i L)
|
||||
;; End of string, return last alternate
|
||||
(list (substring str prev-i i))]
|
||||
[(and (eq? (string-ref str i) #\|)
|
||||
(< 1 i)
|
||||
(not (and (eq? (string-ref str (- i 1)) #\\)
|
||||
(eq? (string-ref str (- i 2)) #\\))))
|
||||
;; Found a pipe, save current alternate
|
||||
(cons (substring str prev-i i)
|
||||
(loop (+ i 1) (+ i 1)))]
|
||||
[else
|
||||
;; Nothing interesting, continue building alternate
|
||||
(loop prev-i (+ i 1))])))
|
||||
(define escaped? (box #f))
|
||||
(map reverse
|
||||
(for/fold ([hist (for/list ([c (in-list c*)]) '())])
|
||||
([i (in-range L)])
|
||||
(define char (unsafe-string-ref str i))
|
||||
(cond
|
||||
[(unbox escaped?)
|
||||
(unless (eq? #\\ char)
|
||||
(set-box! escaped? #f))
|
||||
hist]
|
||||
[(eq? #\\ char)
|
||||
(set-box! escaped? #t)
|
||||
hist]
|
||||
[else
|
||||
(for/list ([h (in-list hist)]
|
||||
[c (in-list c*)])
|
||||
(if (eq? c char)
|
||||
(cons i h)
|
||||
h))]))))
|
||||
|
||||
(define (intlist-union i* j*)
|
||||
(cond
|
||||
[(null? i*)
|
||||
j*]
|
||||
[(null? j*)
|
||||
i*]
|
||||
[(< (car i*) (car j*))
|
||||
(cons (car i*) (intlist-union (cdr i*) j*))]
|
||||
[(> (car i*) (car j*))
|
||||
(cons (car j*) (intlist-union i* (cdr j*)))]
|
||||
[else
|
||||
(cons (car i*) (intlist-union (cdr i*) (cdr j*)))]))
|
||||
;; Match a list of left indices with a list of right indices.
|
||||
;; Return a list of pairs on success
|
||||
;; and the unmatched index on failure.
|
||||
;; (-> (Listof Natural) (Listof Natural) (U Natural (Listof (Pairof Natural Natural))))
|
||||
(define (pair-up l* r*)
|
||||
(let loop ([i 0] [l* l*] [r* r*] [prev* '()])
|
||||
(cond
|
||||
[(null? r*)
|
||||
(if (null? l*)
|
||||
(if (null? prev*)
|
||||
'() ;; good
|
||||
(car prev*)) ;; bad
|
||||
(car l*))] ;; bad
|
||||
[(= i (car r*))
|
||||
(if (null? prev*)
|
||||
i
|
||||
(let ([r (loop (+ i 1) l* (cdr r*) (cdr prev*))])
|
||||
(if (integer? r)
|
||||
r
|
||||
(cons (cons (car prev*) i) r))))]
|
||||
[(or (null? l*) (< i (car l*)))
|
||||
(loop (+ i 1) l* r* prev*)]
|
||||
[(= i (car l*))
|
||||
(loop (+ i 1) (cdr l*) r* (cons i prev*))])))
|
||||
|
||||
(define (ivl-remove* ivl* i*)
|
||||
(for/list ([i (in-list i*)]
|
||||
#:when (not (for/or ([ivl (in-list ivl*)]) (in-ivl? ivl i))))
|
||||
i))
|
||||
|
||||
(define (ivl< ivl1 ivl2)
|
||||
(and (< (car ivl2) (car ivl1))
|
||||
(< (cdr ivl1) (cdr ivl2))))
|
||||
|
||||
(define (in-ivl? ivl i)
|
||||
(and (< (car ivl) i)
|
||||
(< i (cdr ivl))))
|
||||
|
||||
(define (infer-return-type pattern-sym arg-stx)
|
||||
(if (and
|
||||
|
@ -237,16 +240,17 @@
|
|||
(define-syntax regexp-match: (make-alias #'regexp-match
|
||||
(lambda (stx) (syntax-parse stx
|
||||
[(_ pat:pattern/groups arg* ...)
|
||||
#:with (type-sym num-groups null-idx*) (syntax/loc stx pat.evidence)
|
||||
;; TODO keep source location in type-sym, stop using format-id
|
||||
;; (Is it really that bad?)
|
||||
#:with return-type (format-id stx "~a" (infer-return-type (syntax-e #'type-sym)
|
||||
#'(arg* ...)))
|
||||
#:with (group-type* ...) (let ([null* (syntax->datum #'null-idx*)])
|
||||
(for/list ([i (in-range (syntax-e #'num-groups))])
|
||||
(if (memv i null*)
|
||||
(syntax/loc stx (U #f return-type))
|
||||
(syntax/loc stx return-type))))
|
||||
#:with (type-sym . capture?*)
|
||||
(syntax/loc stx pat.evidence)
|
||||
#:with return-type
|
||||
(format-id stx "~a" (infer-return-type (syntax-e #'type-sym) #'(arg* ...)))
|
||||
#:with (group-type* ...)
|
||||
(let ([stx-never-fail (syntax/loc stx return-type)]
|
||||
[stx-may-fail (syntax/loc stx (U #f return-type))])
|
||||
(for/list ([c-stx (in-list (syntax-e #'capture?*))])
|
||||
(if (syntax-e c-stx)
|
||||
stx-never-fail
|
||||
stx-may-fail)))
|
||||
(syntax/loc stx
|
||||
(let ([maybe-match (regexp-match pat.expanded arg* ...)])
|
||||
(if maybe-match
|
||||
|
|
Loading…
Reference in New Issue
Block a user