[private] much better regexp for pipes
This commit is contained in:
parent
5d04125c55
commit
fb22461c75
|
@ -10,9 +10,20 @@
|
||||||
|
|
||||||
;; -- regexp
|
;; -- regexp
|
||||||
(let ()
|
(let ()
|
||||||
;; TODO (what groups does this return? re-read regexp spec)
|
|
||||||
(check-true (and (regexp: "^(\r|\n|(\r\n))") #t)))
|
(check-true (and (regexp: "^(\r|\n|(\r\n))") #t)))
|
||||||
|
|
||||||
|
(let ([str "Pete would gain 4."])
|
||||||
|
(check-equal?
|
||||||
|
(ann (regexp-match: #px"^(.*?) would (gain|lose) (\\d+)\\.$" str)
|
||||||
|
(U #f (List String String String String)))
|
||||||
|
(list str "Pete" "gain" "4")))
|
||||||
|
|
||||||
|
(let ([str "| yo"]) ;; from John Clement's morse-code-trainer
|
||||||
|
(check-equal?
|
||||||
|
(ann (regexp-match: #px"hey [|] (yo)" str)
|
||||||
|
(U #f (List String String)))
|
||||||
|
#f))
|
||||||
|
|
||||||
;; -- regexp-match:
|
;; -- regexp-match:
|
||||||
(check-equal?
|
(check-equal?
|
||||||
(ann
|
(ann
|
||||||
|
@ -248,7 +259,7 @@
|
||||||
(U #f (List String String String)))
|
(U #f (List String String String)))
|
||||||
'("ab" "ab" "a"))
|
'("ab" "ab" "a"))
|
||||||
|
|
||||||
;; --- Can't handle |, yet
|
;; --- Can't handle |
|
||||||
(check-equal?
|
(check-equal?
|
||||||
(ann
|
(ann
|
||||||
(regexp-match: "this(group)|that" "that")
|
(regexp-match: "this(group)|that" "that")
|
||||||
|
@ -312,19 +323,19 @@
|
||||||
(U #f (List String)))
|
(U #f (List String)))
|
||||||
'("alot"))
|
'("alot"))
|
||||||
|
|
||||||
;; -- pipes = take min groups
|
; -- pipes = take min groups
|
||||||
;; 2016-06-08: currently disabled
|
; 2016-06-08: currently disabled
|
||||||
;(check-equal?
|
(check-equal?
|
||||||
; (ann
|
(ann
|
||||||
; (regexp-match: "^(a*)|(b*)$" "aaa")
|
(regexp-match: "^(a*)|(b*)$" "aaa")
|
||||||
; (U #f (List String (U #f String) (U #f String))))
|
(U #f (List String (U #f String) (U #f String))))
|
||||||
; '("aaa" "aaa" #f))
|
'("aaa" "aaa" #f))
|
||||||
|
|
||||||
;(check-equal?
|
(check-equal?
|
||||||
; (ann
|
(ann
|
||||||
; (regexp-match: "^(aa*)(c*)|(b*)$" "b")
|
(regexp-match: "^(aa*)(c*)|(b*)$" "b")
|
||||||
; (U #f (List String (U #f String) (U #f String) (U #f String))))
|
(U #f (List String (U #f String) (U #f String) (U #f String))))
|
||||||
; '("b" #f #f "b"))
|
'("b" #f #f "b"))
|
||||||
|
|
||||||
;; -- nested gropus
|
;; -- nested gropus
|
||||||
(check-equal?
|
(check-equal?
|
||||||
|
|
|
@ -98,8 +98,7 @@
|
||||||
(syntax-parse stx #:literals (tr:#%plain-lambda)
|
(syntax-parse stx #:literals (tr:#%plain-lambda)
|
||||||
[(_ name:id v)
|
[(_ name:id v)
|
||||||
#:with (tr:#%plain-lambda (_) v+)
|
#:with (tr:#%plain-lambda (_) v+)
|
||||||
(parameterize ([*STOP-LIST* (cons #'name (*STOP-LIST*))])
|
(expand-expr (syntax/loc stx (tr:lambda (name) v)))
|
||||||
(expand-expr (syntax/loc stx (tr:lambda (name) v))))
|
|
||||||
#:when (syntax-e (syntax/loc stx v+))
|
#:when (syntax-e (syntax/loc stx v+))
|
||||||
#:with m (f-parse (syntax/loc stx v+))
|
#:with m (f-parse (syntax/loc stx v+))
|
||||||
#:when (syntax-e (syntax/loc stx m))
|
#:when (syntax-e (syntax/loc stx m))
|
||||||
|
|
|
@ -34,6 +34,7 @@
|
||||||
typed/racket/base
|
typed/racket/base
|
||||||
(only-in racket/list range)
|
(only-in racket/list range)
|
||||||
(only-in racket/format ~a)
|
(only-in racket/format ~a)
|
||||||
|
(only-in racket/unsafe/ops unsafe-string-ref)
|
||||||
syntax/parse
|
syntax/parse
|
||||||
trivial/private/common))
|
trivial/private/common))
|
||||||
|
|
||||||
|
@ -51,6 +52,9 @@
|
||||||
str))
|
str))
|
||||||
|
|
||||||
;; Dispatch for counting groups
|
;; Dispatch for counting groups
|
||||||
|
;; On success, return (Pairof Type (Listof Boolean))
|
||||||
|
;; - type is probably the return type of matches
|
||||||
|
;; - booleans indicating "always succeeds" (#t) and "may fail" (#f)
|
||||||
(define (parse-groups v-stx)
|
(define (parse-groups v-stx)
|
||||||
(define v (quoted-stx-value? v-stx))
|
(define v (quoted-stx-value? v-stx))
|
||||||
(cond
|
(cond
|
||||||
|
@ -62,81 +66,6 @@
|
||||||
[(byte-pregexp? v) (parse-groups/byte-pregexp v #:src v-stx)]
|
[(byte-pregexp? v) (parse-groups/byte-pregexp v #:src v-stx)]
|
||||||
[else #f]))
|
[else #f]))
|
||||||
|
|
||||||
;; Handle pipes
|
|
||||||
;; If there is a pipe, everything is nullable, but we know the number of groups
|
|
||||||
(define (parse-groups/untyped str #:src stx)
|
|
||||||
(define alt* (string->alt* str))
|
|
||||||
(cond
|
|
||||||
[(null? alt*)
|
|
||||||
(list 0 '())]
|
|
||||||
[(null? (cdr alt*))
|
|
||||||
(parse-groups-for-alt (car alt*) #:src stx)]
|
|
||||||
[else
|
|
||||||
;(define num-groups
|
|
||||||
; (for/fold ([num-groups 0])
|
|
||||||
; ([alt (in-list alt*)])
|
|
||||||
; (define ng+null* (parse-groups-for-alt alt #:src stx))
|
|
||||||
; (+ num-groups (car ng+null*))))
|
|
||||||
(parse-groups-for-alt str #:src stx)
|
|
||||||
#f
|
|
||||||
#;(list num-groups (range num-groups))]))
|
|
||||||
|
|
||||||
;; Count the number of matched parentheses in a regexp pattern.
|
|
||||||
;; Raise an exception if there are unmatched parens.
|
|
||||||
(define (parse-groups-for-alt str #:src stx)
|
|
||||||
(define last-index (- (string-length str) 1))
|
|
||||||
(define in-square? (box #f))
|
|
||||||
(let loop ([i 0] [in-paren '()] [num-groups 0] [null-idx* '()])
|
|
||||||
(if (> i last-index)
|
|
||||||
(cond
|
|
||||||
[(not (null? in-paren))
|
|
||||||
(group-error stx str (format "'(' at index ~a" (car in-paren)))]
|
|
||||||
[(unbox in-square?)
|
|
||||||
(group-error stx str (format "'[' at index ~a" (car in-paren)))]
|
|
||||||
[else
|
|
||||||
(list num-groups null-idx*)])
|
|
||||||
(if (unbox in-square?)
|
|
||||||
(if (eq? #\] (string-ref str i))
|
|
||||||
(begin (set-box! in-square? #f)
|
|
||||||
(loop (+ i 1) (cdr in-paren) num-groups null-idx*))
|
|
||||||
(loop (+ i 1) in-paren num-groups null-idx*))
|
|
||||||
(case (string-ref str i)
|
|
||||||
[(#\[)
|
|
||||||
;; Ignore things between [ ... ]
|
|
||||||
(set-box! in-square? #t)
|
|
||||||
(loop (+ i 1) (cons i in-paren) num-groups null-idx*)]
|
|
||||||
[(#\()
|
|
||||||
;; Watch for (? patterns
|
|
||||||
(if (and (< i last-index)
|
|
||||||
(eq? #\? (string-ref str (+ i 1))))
|
|
||||||
(loop (+ i 2) (cons #f in-paren) num-groups null-idx*)
|
|
||||||
(loop (+ i 1) (cons i in-paren) num-groups null-idx*))]
|
|
||||||
[(#\))
|
|
||||||
(cond
|
|
||||||
[(null? in-paren)
|
|
||||||
(group-error stx str (format "')' at index ~a" i))]
|
|
||||||
[(eq? #f (car in-paren))
|
|
||||||
;; Matched closing paren, but does not count as a group
|
|
||||||
(loop (+ i 1) (cdr in-paren) num-groups null-idx*)]
|
|
||||||
[(and (< i last-index)
|
|
||||||
(or
|
|
||||||
(eq? #\? (string-ref str (+ i 1)))
|
|
||||||
(eq? #\* (string-ref str (+ i 1)))))
|
|
||||||
;; group = may be #f
|
|
||||||
(loop (+ i 1) (cdr in-paren) (+ 1 num-groups) (cons num-groups null-idx*))]
|
|
||||||
[else
|
|
||||||
(loop (+ i 1) (cdr in-paren) (+ 1 num-groups) null-idx*)])]
|
|
||||||
[(#\\)
|
|
||||||
(if (and (< i last-index)
|
|
||||||
(eq? #\\ (string-ref str (+ i 1))))
|
|
||||||
(loop (+ i 3) in-paren num-groups null-idx*)
|
|
||||||
(loop (+ i 2) in-paren num-groups null-idx*))]
|
|
||||||
;[(#\|)
|
|
||||||
; ;; Nope! Can't handle pipes
|
|
||||||
; (error 'internal-error "Found '|' character in regexp string.")]
|
|
||||||
[else
|
|
||||||
(loop (+ i 1) in-paren num-groups null-idx*)])))))
|
|
||||||
|
|
||||||
(define (parse-groups/string str #:src stx)
|
(define (parse-groups/string str #:src stx)
|
||||||
(let ([ng (parse-groups/untyped str #:src stx)])
|
(let ([ng (parse-groups/untyped str #:src stx)])
|
||||||
(and ng (cons 'String ng))))
|
(and ng (cons 'String ng))))
|
||||||
|
@ -161,44 +90,118 @@
|
||||||
(make-value-property 'rx:groups parse-groups))
|
(make-value-property 'rx:groups parse-groups))
|
||||||
(define-syntax-class/predicate pattern/groups rx?)
|
(define-syntax-class/predicate pattern/groups rx?)
|
||||||
|
|
||||||
)
|
;; (-> String #:src Syntax (Listof Boolean))
|
||||||
;; -----------------------------------------------------------------------------
|
(define (parse-groups/untyped str #:src stx)
|
||||||
;; --- Other helpers
|
(define pos**
|
||||||
|
(unescaped-pos* str '(#\[ #\] #\( #\) #\| #\?)))
|
||||||
|
;; -- check that [] are matched
|
||||||
|
(define brack-ivl*
|
||||||
|
(let* ([l-brack-pos* (car pos**)]
|
||||||
|
[r-brack-pos* (cadr pos**)]
|
||||||
|
[r (pair-up l-brack-pos* r-brack-pos*)])
|
||||||
|
;; ?? okay for brackets to nest?
|
||||||
|
(if (list? r)
|
||||||
|
r
|
||||||
|
(let ([brack-char (if (memv r l-brack-pos*) "[" "]")])
|
||||||
|
(group-error stx str (format "'~a' at index ~a" brack-char r))))))
|
||||||
|
;; -- ignore characters between a pair of brackets
|
||||||
|
(define-values (l-paren-pos* r-paren-pos* pipe-pos* ?-pos*)
|
||||||
|
(apply values (for/list ([pos* (in-list (cddr pos**))])
|
||||||
|
(ivl-remove* brack-ivl* pos*))))
|
||||||
|
;; -- check that () are matched
|
||||||
|
(define paren-ivl*
|
||||||
|
(let ([r (pair-up l-paren-pos* r-paren-pos*)])
|
||||||
|
(if (list? r)
|
||||||
|
r
|
||||||
|
(let ([paren-char (if (memv r l-paren-pos*) "(" ")")])
|
||||||
|
(group-error stx str (format "'~a' at index ~a" paren-char r))))))
|
||||||
|
;; -- groups = #parens.
|
||||||
|
;; may fail to capture if has | outside (that are not nested in other parens)
|
||||||
|
;; or ? after close
|
||||||
|
(for/list ([ivl (in-list paren-ivl*)]
|
||||||
|
#:when (not (has-?-before ivl ?-pos*)))
|
||||||
|
(and
|
||||||
|
(not (has-unguarded-pipe-before-or-after ivl paren-ivl* pipe-pos*))
|
||||||
|
(not (has-?-after ivl ?-pos*)))))
|
||||||
|
|
||||||
(begin-for-syntax
|
(define (has-?-before ivl ?-pos*)
|
||||||
|
(define pos-before (+ 1 (car ivl))) ;; Well, just inside the paren.
|
||||||
|
(for/or ([?pos (in-list ?-pos*)])
|
||||||
|
(= pos-before ?pos)))
|
||||||
|
|
||||||
;; Divide string into |-separated substrings (regex alternates)
|
(define (has-?-after ivl ?-pos*)
|
||||||
;; Be wary of escaped | characters.
|
(define pos-after (+ 1 (cdr ivl)))
|
||||||
(define (string->alt* str)
|
(for/or ([?pos (in-list ?-pos*)])
|
||||||
|
(= pos-after ?pos)))
|
||||||
|
|
||||||
|
(define (has-unguarded-pipe-before-or-after ivl paren-ivl* pipe-pos*)
|
||||||
|
(define other-paren-ivl*
|
||||||
|
(for/list ([ivl2 (in-list paren-ivl*)]
|
||||||
|
#:when (not (ivl< ivl ivl2)))
|
||||||
|
ivl2))
|
||||||
|
(define dangerous-pipe* (ivl-remove* other-paren-ivl* pipe-pos*))
|
||||||
|
(not (null? dangerous-pipe*)))
|
||||||
|
|
||||||
|
;; Does not work for #\\ character
|
||||||
|
(define (unescaped-pos* str c*)
|
||||||
(define L (string-length str))
|
(define L (string-length str))
|
||||||
(let loop ([prev-i 0] [i 0])
|
(define escaped? (box #f))
|
||||||
(cond
|
(map reverse
|
||||||
[(= i L)
|
(for/fold ([hist (for/list ([c (in-list c*)]) '())])
|
||||||
;; End of string, return last alternate
|
([i (in-range L)])
|
||||||
(list (substring str prev-i i))]
|
(define char (unsafe-string-ref str i))
|
||||||
[(and (eq? (string-ref str i) #\|)
|
(cond
|
||||||
(< 1 i)
|
[(unbox escaped?)
|
||||||
(not (and (eq? (string-ref str (- i 1)) #\\)
|
(unless (eq? #\\ char)
|
||||||
(eq? (string-ref str (- i 2)) #\\))))
|
(set-box! escaped? #f))
|
||||||
;; Found a pipe, save current alternate
|
hist]
|
||||||
(cons (substring str prev-i i)
|
[(eq? #\\ char)
|
||||||
(loop (+ i 1) (+ i 1)))]
|
(set-box! escaped? #t)
|
||||||
[else
|
hist]
|
||||||
;; Nothing interesting, continue building alternate
|
[else
|
||||||
(loop prev-i (+ i 1))])))
|
(for/list ([h (in-list hist)]
|
||||||
|
[c (in-list c*)])
|
||||||
|
(if (eq? c char)
|
||||||
|
(cons i h)
|
||||||
|
h))]))))
|
||||||
|
|
||||||
(define (intlist-union i* j*)
|
;; Match a list of left indices with a list of right indices.
|
||||||
(cond
|
;; Return a list of pairs on success
|
||||||
[(null? i*)
|
;; and the unmatched index on failure.
|
||||||
j*]
|
;; (-> (Listof Natural) (Listof Natural) (U Natural (Listof (Pairof Natural Natural))))
|
||||||
[(null? j*)
|
(define (pair-up l* r*)
|
||||||
i*]
|
(let loop ([i 0] [l* l*] [r* r*] [prev* '()])
|
||||||
[(< (car i*) (car j*))
|
(cond
|
||||||
(cons (car i*) (intlist-union (cdr i*) j*))]
|
[(null? r*)
|
||||||
[(> (car i*) (car j*))
|
(if (null? l*)
|
||||||
(cons (car j*) (intlist-union i* (cdr j*)))]
|
(if (null? prev*)
|
||||||
[else
|
'() ;; good
|
||||||
(cons (car i*) (intlist-union (cdr i*) (cdr j*)))]))
|
(car prev*)) ;; bad
|
||||||
|
(car l*))] ;; bad
|
||||||
|
[(= i (car r*))
|
||||||
|
(if (null? prev*)
|
||||||
|
i
|
||||||
|
(let ([r (loop (+ i 1) l* (cdr r*) (cdr prev*))])
|
||||||
|
(if (integer? r)
|
||||||
|
r
|
||||||
|
(cons (cons (car prev*) i) r))))]
|
||||||
|
[(or (null? l*) (< i (car l*)))
|
||||||
|
(loop (+ i 1) l* r* prev*)]
|
||||||
|
[(= i (car l*))
|
||||||
|
(loop (+ i 1) (cdr l*) r* (cons i prev*))])))
|
||||||
|
|
||||||
|
(define (ivl-remove* ivl* i*)
|
||||||
|
(for/list ([i (in-list i*)]
|
||||||
|
#:when (not (for/or ([ivl (in-list ivl*)]) (in-ivl? ivl i))))
|
||||||
|
i))
|
||||||
|
|
||||||
|
(define (ivl< ivl1 ivl2)
|
||||||
|
(and (< (car ivl2) (car ivl1))
|
||||||
|
(< (cdr ivl1) (cdr ivl2))))
|
||||||
|
|
||||||
|
(define (in-ivl? ivl i)
|
||||||
|
(and (< (car ivl) i)
|
||||||
|
(< i (cdr ivl))))
|
||||||
|
|
||||||
(define (infer-return-type pattern-sym arg-stx)
|
(define (infer-return-type pattern-sym arg-stx)
|
||||||
(if (and
|
(if (and
|
||||||
|
@ -237,16 +240,17 @@
|
||||||
(define-syntax regexp-match: (make-alias #'regexp-match
|
(define-syntax regexp-match: (make-alias #'regexp-match
|
||||||
(lambda (stx) (syntax-parse stx
|
(lambda (stx) (syntax-parse stx
|
||||||
[(_ pat:pattern/groups arg* ...)
|
[(_ pat:pattern/groups arg* ...)
|
||||||
#:with (type-sym num-groups null-idx*) (syntax/loc stx pat.evidence)
|
#:with (type-sym . capture?*)
|
||||||
;; TODO keep source location in type-sym, stop using format-id
|
(syntax/loc stx pat.evidence)
|
||||||
;; (Is it really that bad?)
|
#:with return-type
|
||||||
#:with return-type (format-id stx "~a" (infer-return-type (syntax-e #'type-sym)
|
(format-id stx "~a" (infer-return-type (syntax-e #'type-sym) #'(arg* ...)))
|
||||||
#'(arg* ...)))
|
#:with (group-type* ...)
|
||||||
#:with (group-type* ...) (let ([null* (syntax->datum #'null-idx*)])
|
(let ([stx-never-fail (syntax/loc stx return-type)]
|
||||||
(for/list ([i (in-range (syntax-e #'num-groups))])
|
[stx-may-fail (syntax/loc stx (U #f return-type))])
|
||||||
(if (memv i null*)
|
(for/list ([c-stx (in-list (syntax-e #'capture?*))])
|
||||||
(syntax/loc stx (U #f return-type))
|
(if (syntax-e c-stx)
|
||||||
(syntax/loc stx return-type))))
|
stx-never-fail
|
||||||
|
stx-may-fail)))
|
||||||
(syntax/loc stx
|
(syntax/loc stx
|
||||||
(let ([maybe-match (regexp-match pat.expanded arg* ...)])
|
(let ([maybe-match (regexp-match pat.expanded arg* ...)])
|
||||||
(if maybe-match
|
(if maybe-match
|
||||||
|
|
Loading…
Reference in New Issue
Block a user