[private] much better regexp for pipes

This commit is contained in:
Ben Greenman 2016-06-10 05:09:29 -04:00
parent 5d04125c55
commit fb22461c75
3 changed files with 149 additions and 135 deletions

View File

@ -10,9 +10,20 @@
;; -- regexp
(let ()
;; TODO (what groups does this return? re-read regexp spec)
(check-true (and (regexp: "^(\r|\n|(\r\n))") #t)))
(let ([str "Pete would gain 4."])
(check-equal?
(ann (regexp-match: #px"^(.*?) would (gain|lose) (\\d+)\\.$" str)
(U #f (List String String String String)))
(list str "Pete" "gain" "4")))
(let ([str "| yo"]) ;; from John Clement's morse-code-trainer
(check-equal?
(ann (regexp-match: #px"hey [|] (yo)" str)
(U #f (List String String)))
#f))
;; -- regexp-match:
(check-equal?
(ann
@ -248,7 +259,7 @@
(U #f (List String String String)))
'("ab" "ab" "a"))
;; --- Can't handle |, yet
;; --- Can't handle |
(check-equal?
(ann
(regexp-match: "this(group)|that" "that")
@ -312,19 +323,19 @@
(U #f (List String)))
'("alot"))
;; -- pipes = take min groups
;; 2016-06-08: currently disabled
;(check-equal?
; (ann
; (regexp-match: "^(a*)|(b*)$" "aaa")
; (U #f (List String (U #f String) (U #f String))))
; '("aaa" "aaa" #f))
; -- pipes = take min groups
; 2016-06-08: currently disabled
(check-equal?
(ann
(regexp-match: "^(a*)|(b*)$" "aaa")
(U #f (List String (U #f String) (U #f String))))
'("aaa" "aaa" #f))
;(check-equal?
; (ann
; (regexp-match: "^(aa*)(c*)|(b*)$" "b")
; (U #f (List String (U #f String) (U #f String) (U #f String))))
; '("b" #f #f "b"))
(check-equal?
(ann
(regexp-match: "^(aa*)(c*)|(b*)$" "b")
(U #f (List String (U #f String) (U #f String) (U #f String))))
'("b" #f #f "b"))
;; -- nested gropus
(check-equal?

View File

@ -98,8 +98,7 @@
(syntax-parse stx #:literals (tr:#%plain-lambda)
[(_ name:id v)
#:with (tr:#%plain-lambda (_) v+)
(parameterize ([*STOP-LIST* (cons #'name (*STOP-LIST*))])
(expand-expr (syntax/loc stx (tr:lambda (name) v))))
(expand-expr (syntax/loc stx (tr:lambda (name) v)))
#:when (syntax-e (syntax/loc stx v+))
#:with m (f-parse (syntax/loc stx v+))
#:when (syntax-e (syntax/loc stx m))

View File

@ -34,6 +34,7 @@
typed/racket/base
(only-in racket/list range)
(only-in racket/format ~a)
(only-in racket/unsafe/ops unsafe-string-ref)
syntax/parse
trivial/private/common))
@ -51,6 +52,9 @@
str))
;; Dispatch for counting groups
;; On success, return (Pairof Type (Listof Boolean))
;; - type is probably the return type of matches
;; - booleans indicating "always succeeds" (#t) and "may fail" (#f)
(define (parse-groups v-stx)
(define v (quoted-stx-value? v-stx))
(cond
@ -62,81 +66,6 @@
[(byte-pregexp? v) (parse-groups/byte-pregexp v #:src v-stx)]
[else #f]))
;; Handle pipes
;; If there is a pipe, everything is nullable, but we know the number of groups
(define (parse-groups/untyped str #:src stx)
(define alt* (string->alt* str))
(cond
[(null? alt*)
(list 0 '())]
[(null? (cdr alt*))
(parse-groups-for-alt (car alt*) #:src stx)]
[else
;(define num-groups
; (for/fold ([num-groups 0])
; ([alt (in-list alt*)])
; (define ng+null* (parse-groups-for-alt alt #:src stx))
; (+ num-groups (car ng+null*))))
(parse-groups-for-alt str #:src stx)
#f
#;(list num-groups (range num-groups))]))
;; Count the number of matched parentheses in a regexp pattern.
;; Raise an exception if there are unmatched parens.
(define (parse-groups-for-alt str #:src stx)
(define last-index (- (string-length str) 1))
(define in-square? (box #f))
(let loop ([i 0] [in-paren '()] [num-groups 0] [null-idx* '()])
(if (> i last-index)
(cond
[(not (null? in-paren))
(group-error stx str (format "'(' at index ~a" (car in-paren)))]
[(unbox in-square?)
(group-error stx str (format "'[' at index ~a" (car in-paren)))]
[else
(list num-groups null-idx*)])
(if (unbox in-square?)
(if (eq? #\] (string-ref str i))
(begin (set-box! in-square? #f)
(loop (+ i 1) (cdr in-paren) num-groups null-idx*))
(loop (+ i 1) in-paren num-groups null-idx*))
(case (string-ref str i)
[(#\[)
;; Ignore things between [ ... ]
(set-box! in-square? #t)
(loop (+ i 1) (cons i in-paren) num-groups null-idx*)]
[(#\()
;; Watch for (? patterns
(if (and (< i last-index)
(eq? #\? (string-ref str (+ i 1))))
(loop (+ i 2) (cons #f in-paren) num-groups null-idx*)
(loop (+ i 1) (cons i in-paren) num-groups null-idx*))]
[(#\))
(cond
[(null? in-paren)
(group-error stx str (format "')' at index ~a" i))]
[(eq? #f (car in-paren))
;; Matched closing paren, but does not count as a group
(loop (+ i 1) (cdr in-paren) num-groups null-idx*)]
[(and (< i last-index)
(or
(eq? #\? (string-ref str (+ i 1)))
(eq? #\* (string-ref str (+ i 1)))))
;; group = may be #f
(loop (+ i 1) (cdr in-paren) (+ 1 num-groups) (cons num-groups null-idx*))]
[else
(loop (+ i 1) (cdr in-paren) (+ 1 num-groups) null-idx*)])]
[(#\\)
(if (and (< i last-index)
(eq? #\\ (string-ref str (+ i 1))))
(loop (+ i 3) in-paren num-groups null-idx*)
(loop (+ i 2) in-paren num-groups null-idx*))]
;[(#\|)
; ;; Nope! Can't handle pipes
; (error 'internal-error "Found '|' character in regexp string.")]
[else
(loop (+ i 1) in-paren num-groups null-idx*)])))))
(define (parse-groups/string str #:src stx)
(let ([ng (parse-groups/untyped str #:src stx)])
(and ng (cons 'String ng))))
@ -161,44 +90,118 @@
(make-value-property 'rx:groups parse-groups))
(define-syntax-class/predicate pattern/groups rx?)
)
;; -----------------------------------------------------------------------------
;; --- Other helpers
;; (-> String #:src Syntax (Listof Boolean))
(define (parse-groups/untyped str #:src stx)
(define pos**
(unescaped-pos* str '(#\[ #\] #\( #\) #\| #\?)))
;; -- check that [] are matched
(define brack-ivl*
(let* ([l-brack-pos* (car pos**)]
[r-brack-pos* (cadr pos**)]
[r (pair-up l-brack-pos* r-brack-pos*)])
;; ?? okay for brackets to nest?
(if (list? r)
r
(let ([brack-char (if (memv r l-brack-pos*) "[" "]")])
(group-error stx str (format "'~a' at index ~a" brack-char r))))))
;; -- ignore characters between a pair of brackets
(define-values (l-paren-pos* r-paren-pos* pipe-pos* ?-pos*)
(apply values (for/list ([pos* (in-list (cddr pos**))])
(ivl-remove* brack-ivl* pos*))))
;; -- check that () are matched
(define paren-ivl*
(let ([r (pair-up l-paren-pos* r-paren-pos*)])
(if (list? r)
r
(let ([paren-char (if (memv r l-paren-pos*) "(" ")")])
(group-error stx str (format "'~a' at index ~a" paren-char r))))))
;; -- groups = #parens.
;; may fail to capture if has | outside (that are not nested in other parens)
;; or ? after close
(for/list ([ivl (in-list paren-ivl*)]
#:when (not (has-?-before ivl ?-pos*)))
(and
(not (has-unguarded-pipe-before-or-after ivl paren-ivl* pipe-pos*))
(not (has-?-after ivl ?-pos*)))))
(begin-for-syntax
(define (has-?-before ivl ?-pos*)
(define pos-before (+ 1 (car ivl))) ;; Well, just inside the paren.
(for/or ([?pos (in-list ?-pos*)])
(= pos-before ?pos)))
;; Divide string into |-separated substrings (regex alternates)
;; Be wary of escaped | characters.
(define (string->alt* str)
(define (has-?-after ivl ?-pos*)
(define pos-after (+ 1 (cdr ivl)))
(for/or ([?pos (in-list ?-pos*)])
(= pos-after ?pos)))
(define (has-unguarded-pipe-before-or-after ivl paren-ivl* pipe-pos*)
(define other-paren-ivl*
(for/list ([ivl2 (in-list paren-ivl*)]
#:when (not (ivl< ivl ivl2)))
ivl2))
(define dangerous-pipe* (ivl-remove* other-paren-ivl* pipe-pos*))
(not (null? dangerous-pipe*)))
;; Does not work for #\\ character
(define (unescaped-pos* str c*)
(define L (string-length str))
(let loop ([prev-i 0] [i 0])
(cond
[(= i L)
;; End of string, return last alternate
(list (substring str prev-i i))]
[(and (eq? (string-ref str i) #\|)
(< 1 i)
(not (and (eq? (string-ref str (- i 1)) #\\)
(eq? (string-ref str (- i 2)) #\\))))
;; Found a pipe, save current alternate
(cons (substring str prev-i i)
(loop (+ i 1) (+ i 1)))]
[else
;; Nothing interesting, continue building alternate
(loop prev-i (+ i 1))])))
(define escaped? (box #f))
(map reverse
(for/fold ([hist (for/list ([c (in-list c*)]) '())])
([i (in-range L)])
(define char (unsafe-string-ref str i))
(cond
[(unbox escaped?)
(unless (eq? #\\ char)
(set-box! escaped? #f))
hist]
[(eq? #\\ char)
(set-box! escaped? #t)
hist]
[else
(for/list ([h (in-list hist)]
[c (in-list c*)])
(if (eq? c char)
(cons i h)
h))]))))
(define (intlist-union i* j*)
(cond
[(null? i*)
j*]
[(null? j*)
i*]
[(< (car i*) (car j*))
(cons (car i*) (intlist-union (cdr i*) j*))]
[(> (car i*) (car j*))
(cons (car j*) (intlist-union i* (cdr j*)))]
[else
(cons (car i*) (intlist-union (cdr i*) (cdr j*)))]))
;; Match a list of left indices with a list of right indices.
;; Return a list of pairs on success
;; and the unmatched index on failure.
;; (-> (Listof Natural) (Listof Natural) (U Natural (Listof (Pairof Natural Natural))))
(define (pair-up l* r*)
(let loop ([i 0] [l* l*] [r* r*] [prev* '()])
(cond
[(null? r*)
(if (null? l*)
(if (null? prev*)
'() ;; good
(car prev*)) ;; bad
(car l*))] ;; bad
[(= i (car r*))
(if (null? prev*)
i
(let ([r (loop (+ i 1) l* (cdr r*) (cdr prev*))])
(if (integer? r)
r
(cons (cons (car prev*) i) r))))]
[(or (null? l*) (< i (car l*)))
(loop (+ i 1) l* r* prev*)]
[(= i (car l*))
(loop (+ i 1) (cdr l*) r* (cons i prev*))])))
(define (ivl-remove* ivl* i*)
(for/list ([i (in-list i*)]
#:when (not (for/or ([ivl (in-list ivl*)]) (in-ivl? ivl i))))
i))
(define (ivl< ivl1 ivl2)
(and (< (car ivl2) (car ivl1))
(< (cdr ivl1) (cdr ivl2))))
(define (in-ivl? ivl i)
(and (< (car ivl) i)
(< i (cdr ivl))))
(define (infer-return-type pattern-sym arg-stx)
(if (and
@ -237,16 +240,17 @@
(define-syntax regexp-match: (make-alias #'regexp-match
(lambda (stx) (syntax-parse stx
[(_ pat:pattern/groups arg* ...)
#:with (type-sym num-groups null-idx*) (syntax/loc stx pat.evidence)
;; TODO keep source location in type-sym, stop using format-id
;; (Is it really that bad?)
#:with return-type (format-id stx "~a" (infer-return-type (syntax-e #'type-sym)
#'(arg* ...)))
#:with (group-type* ...) (let ([null* (syntax->datum #'null-idx*)])
(for/list ([i (in-range (syntax-e #'num-groups))])
(if (memv i null*)
(syntax/loc stx (U #f return-type))
(syntax/loc stx return-type))))
#:with (type-sym . capture?*)
(syntax/loc stx pat.evidence)
#:with return-type
(format-id stx "~a" (infer-return-type (syntax-e #'type-sym) #'(arg* ...)))
#:with (group-type* ...)
(let ([stx-never-fail (syntax/loc stx return-type)]
[stx-may-fail (syntax/loc stx (U #f return-type))])
(for/list ([c-stx (in-list (syntax-e #'capture?*))])
(if (syntax-e c-stx)
stx-never-fail
stx-may-fail)))
(syntax/loc stx
(let ([maybe-match (regexp-match pat.expanded arg* ...)])
(if maybe-match