From 4bcdc32fec27d3aafa70ea58de9546e7fe715e61 Mon Sep 17 00:00:00 2001 From: Ben Greenman Date: Sat, 11 Jun 2016 22:52:47 -0400 Subject: [PATCH] [private] rx: always order clusters by left-paren position --- test/regexp-pass.rkt | 32 +++++- trivial/private/regexp.rkt | 19 +++- trivial/untyped/regexp.rkt | 225 ++++++++++++++++++++----------------- 3 files changed, 171 insertions(+), 105 deletions(-) diff --git a/test/regexp-pass.rkt b/test/regexp-pass.rkt index ceddc73..006a98f 100644 --- a/test/regexp-pass.rkt +++ b/test/regexp-pass.rkt @@ -8,11 +8,32 @@ trivial/regexp typed/rackunit) - ;; -- regexp + ;; -- TODO + ; (define re:normal-name (regexp (format "~a*<([^>]*)>~a*" blank blank))) + + ;; -- regexps, from the world + + (let ([str "1cm"]) ;; from html-render.rkt + (check-equal? + (ann (regexp-match: #rx"^([+-]?[0-9]*\\.?([0-9]+)?)(em|ex|px|in|cm|mm|pt|pc|%|)$" str) + (U #f (List String String (U #f String) String))) + (list str "1" #f "cm"))) + + (let ([expr "x+y*x"]) ;; from racket-doc/guide/scribblings/arith.rkt + (check-equal? + (ann (regexp-match: #px"^([a-z]|[0-9]+)(?:[-+*/]([a-z]|[0-9]+))*(?![-+*/])" expr) + (U #f (List String String String))) + (list expr "x" "x"))) + + (let ([str "(this and that!)"]) ;; from load-one.rkt + (check-equal? + (ann (regexp-match: #rx"^[(].*[)]$" str) (U #f (List String))) + (list str))) + (let () (check-true (and (regexp: "^(\r|\n|(\r\n))") #t))) - (let ([str "Pete would gain 4."]) + (let ([str "Pete would gain 4."]) ;; from Matthew Butterick's Advent of Code solutions (check-equal? (ann (regexp-match: #px"^(.*?) would (gain|lose) (\\d+)\\.$" str) (U #f (List String String String String))) @@ -24,6 +45,13 @@ (U #f (List String String))) #f)) + (let ([l "0 afAF09 AF09af ABSD_asdf ="]) ;; from racket/src/worksp/gendef.rkt + (define m : (U #f (List String String String)) + (regexp-match: + #rx"([0-9]+) +(?:[0-9A-Fa-f]+) +(?:[0-9A-Fa-f]+) +([_A-Za-z][_A-Za-z0-9]*) +=" + l)) + (check-equal? m (list l "0" "ABSD_asdf"))) + ;; -- regexp-match: (check-equal? (ann diff --git a/trivial/private/regexp.rkt b/trivial/private/regexp.rkt index 890aa1d..44d584d 100644 --- a/trivial/private/regexp.rkt +++ b/trivial/private/regexp.rkt @@ -165,10 +165,12 @@ (cons i h) h))])))) + ;; (define-type Ivl (Pairof Natural Natural)) + ;; Match a list of left indices with a list of right indices. ;; Return a list of pairs on success ;; and the unmatched index on failure. - ;; (-> (Listof Natural) (Listof Natural) (U Natural (Listof (Pairof Natural Natural)))) + ;; (-> (Listof Natural) (Listof Natural) (U Natural (Listof Ivl))) (define (pair-up l* r*) (let loop ([i 0] [l* l*] [r* r*] [prev* '()]) (cond @@ -184,12 +186,24 @@ (let ([r (loop (+ i 1) l* (cdr r*) (cdr prev*))]) (if (integer? r) r - (cons (cons (car prev*) i) r))))] + (ivl-insert (cons (car prev*) i) r))))] [(or (null? l*) (< i (car l*))) (loop (+ i 1) l* r* prev*)] [(= i (car l*)) (loop (+ i 1) (cdr l*) r* (cons i prev*))]))) + ;; Assume `ivl*` is sorted by left position + ;; Insert `ivl` in sorted order + ;; (-> Ivl (Listof Ivl) (Listof Ivl)) + (define (ivl-insert ivl ivl*) + (cond + [(null? ivl*) + (list ivl)] + [(< (car ivl) (caar ivl*)) + (cons ivl ivl*)] + [else + (cons (car ivl*) (ivl-insert ivl (cdr ivl*)))])) + (define (ivl-remove* ivl* i*) (for/list ([i (in-list i*)] #:when (not (for/or ([ivl (in-list ivl*)]) (in-ivl? ivl i)))) @@ -210,6 +224,7 @@ (or (syntax-parse arg-stx ((x:str) #t) ((x) #:when (bytes? (syntax-e #'x)) #f) + ;; TODO ;; ((x) #:when (port? (syntax-e #'x)) #f) (_ #t)))) 'String 'Bytes)) diff --git a/trivial/untyped/regexp.rkt b/trivial/untyped/regexp.rkt index 43dfc0b..f8d20e4 100644 --- a/trivial/untyped/regexp.rkt +++ b/trivial/untyped/regexp.rkt @@ -31,6 +31,7 @@ (require (for-syntax (only-in racket/syntax format-id) + (only-in racket/unsafe/ops unsafe-string-ref) typed/racket/base (only-in racket/list range) (only-in racket/format ~a) @@ -51,6 +52,9 @@ str)) ;; Dispatch for counting groups + ;; On success, return (Pairof Type (Listof Boolean)) + ;; - type is probably the return type of matches + ;; - booleans indicating "always succeeds" (#t) and "may fail" (#f) (define (parse-groups v-stx) (define v (quoted-stx-value? v-stx)) (cond @@ -62,72 +66,6 @@ [(byte-pregexp? v) (parse-groups/byte-pregexp v #:src v-stx)] [else #f])) - ;; Handle pipes - ;; If there is a pipe, everything is nullable, but we know the number of groups - (define (parse-groups/untyped str #:src stx) - (define alt* (string->alt* str)) - (cond - [(null? alt*) - (list 0 '())] - [(null? (cdr alt*)) - (parse-groups-for-alt (car alt*) #:src stx)] - [else - (parse-groups-for-alt str #:src stx) - #f])) - - ;; Count the number of matched parentheses in a regexp pattern. - ;; Raise an exception if there are unmatched parens. - (define (parse-groups-for-alt str #:src stx) - (define last-index (- (string-length str) 1)) - (define in-square? (box #f)) - (let loop ([i 0] [in-paren '()] [num-groups 0] [null-idx* '()]) - (if (> i last-index) - (cond - [(not (null? in-paren)) - (group-error stx str (format "'(' at index ~a" (car in-paren)))] - [(unbox in-square?) - (group-error stx str (format "'[' at index ~a" (car in-paren)))] - [else - (list num-groups null-idx*)]) - (if (unbox in-square?) - (if (eq? #\] (string-ref str i)) - (begin (set-box! in-square? #f) - (loop (+ i 1) (cdr in-paren) num-groups null-idx*)) - (loop (+ i 1) in-paren num-groups null-idx*)) - (case (string-ref str i) - [(#\[) - ;; Ignore things between [ ... ] - (set-box! in-square? #t) - (loop (+ i 1) (cons i in-paren) num-groups null-idx*)] - [(#\() - ;; Watch for (? patterns - (if (and (< i last-index) - (eq? #\? (string-ref str (+ i 1)))) - (loop (+ i 2) (cons #f in-paren) num-groups null-idx*) - (loop (+ i 1) (cons i in-paren) num-groups null-idx*))] - [(#\)) - (cond - [(null? in-paren) - (group-error stx str (format "')' at index ~a" i))] - [(eq? #f (car in-paren)) - ;; Matched closing paren, but does not count as a group - (loop (+ i 1) (cdr in-paren) num-groups null-idx*)] - [(and (< i last-index) - (or - (eq? #\? (string-ref str (+ i 1))) - (eq? #\* (string-ref str (+ i 1))))) - ;; group = may be #f - (loop (+ i 1) (cdr in-paren) (+ 1 num-groups) (cons num-groups null-idx*))] - [else - (loop (+ i 1) (cdr in-paren) (+ 1 num-groups) null-idx*)])] - [(#\\) - (if (and (< i last-index) - (eq? #\\ (string-ref str (+ i 1)))) - (loop (+ i 3) in-paren num-groups null-idx*) - (loop (+ i 2) in-paren num-groups null-idx*))] - [else - (loop (+ i 1) in-paren num-groups null-idx*)]))))) - (define (parse-groups/string str #:src stx) (let ([ng (parse-groups/untyped str #:src stx)]) (and ng (cons 'String ng)))) @@ -152,44 +90,130 @@ (make-value-property 'rx:groups parse-groups)) (define-syntax-class/predicate pattern/groups rx?) -) -;; ----------------------------------------------------------------------------- -;; --- Other helpers + ;; (-> String #:src Syntax (Listof Boolean)) + (define (parse-groups/untyped str #:src stx) + (define pos** + (unescaped-pos* str '(#\[ #\] #\( #\) #\| #\?))) + ;; -- check that [] are matched + (define brack-ivl* + (let* ([l-brack-pos* (car pos**)] + [r-brack-pos* (cadr pos**)] + [r (pair-up l-brack-pos* r-brack-pos*)]) + ;; ?? okay for brackets to nest? + (if (list? r) + r + (let ([brack-char (if (memv r l-brack-pos*) "[" "]")]) + (group-error stx str (format "'~a' at index ~a" brack-char r)))))) + ;; -- ignore characters between a pair of brackets + (define-values (l-paren-pos* r-paren-pos* pipe-pos* ?-pos*) + (apply values (for/list ([pos* (in-list (cddr pos**))]) + (ivl-remove* brack-ivl* pos*)))) + ;; -- check that () are matched + (define paren-ivl* + (let ([r (pair-up l-paren-pos* r-paren-pos*)]) + (if (list? r) + r + (let ([paren-char (if (memv r l-paren-pos*) "(" ")")]) + (group-error stx str (format "'~a' at index ~a" paren-char r)))))) + ;; -- groups = #parens. + ;; may fail to capture if has | outside (that are not nested in other parens) + ;; or ? after close + (for/list ([ivl (in-list paren-ivl*)] + #:when (not (has-?-before ivl ?-pos*))) + (and + (not (has-unguarded-pipe-before-or-after ivl paren-ivl* pipe-pos*)) + (not (has-?-after ivl ?-pos*))))) -(begin-for-syntax + (define (has-?-before ivl ?-pos*) + (define pos-before (+ 1 (car ivl))) ;; Well, just inside the paren. + (for/or ([?pos (in-list ?-pos*)]) + (= pos-before ?pos))) - ;; Divide string into |-separated substrings (regex alternates) - ;; Be wary of escaped | characters. - (define (string->alt* str) + (define (has-?-after ivl ?-pos*) + (define pos-after (+ 1 (cdr ivl))) + (for/or ([?pos (in-list ?-pos*)]) + (= pos-after ?pos))) + + (define (has-unguarded-pipe-before-or-after ivl paren-ivl* pipe-pos*) + (define other-paren-ivl* + (for/list ([ivl2 (in-list paren-ivl*)] + #:when (not (ivl< ivl ivl2))) + ivl2)) + (define dangerous-pipe* (ivl-remove* other-paren-ivl* pipe-pos*)) + (not (null? dangerous-pipe*))) + + ;; Does not work for #\\ character + (define (unescaped-pos* str c*) (define L (string-length str)) - (let loop ([prev-i 0] [i 0]) - (cond - [(= i L) - ;; End of string, return last alternate - (list (substring str prev-i i))] - [(and (eq? (string-ref str i) #\|) - (< 1 i) - (not (and (eq? (string-ref str (- i 1)) #\\) - (eq? (string-ref str (- i 2)) #\\)))) - ;; Found a pipe, save current alternate - (cons (substring str prev-i i) - (loop (+ i 1) (+ i 1)))] - [else - ;; Nothing interesting, continue building alternate - (loop prev-i (+ i 1))]))) + (define escaped? (box #f)) + (map reverse + (for/fold ([hist (for/list ([c (in-list c*)]) '())]) + ([i (in-range L)]) + (define char (unsafe-string-ref str i)) + (cond + [(unbox escaped?) + (unless (eq? #\\ char) + (set-box! escaped? #f)) + hist] + [(eq? #\\ char) + (set-box! escaped? #t) + hist] + [else + (for/list ([h (in-list hist)] + [c (in-list c*)]) + (if (eq? c char) + (cons i h) + h))])))) - (define (intlist-union i* j*) + ;; Match a list of left indices with a list of right indices. + ;; Return a list of pairs on success + ;; and the unmatched index on failure. + ;; (-> (Listof Natural) (Listof Natural) (U Natural (Listof Ivl))) + (define (pair-up l* r*) + (let loop ([i 0] [l* l*] [r* r*] [prev* '()]) + (cond + [(null? r*) + (if (null? l*) + (if (null? prev*) + '() ;; good + (car prev*)) ;; bad + (car l*))] ;; bad + [(= i (car r*)) + (if (null? prev*) + i + (let ([r (loop (+ i 1) l* (cdr r*) (cdr prev*))]) + (if (integer? r) + r + (ivl-insert (cons (car prev*) i) r))))] + [(or (null? l*) (< i (car l*))) + (loop (+ i 1) l* r* prev*)] + [(= i (car l*)) + (loop (+ i 1) (cdr l*) r* (cons i prev*))]))) + + ;; Assume `ivl*` is sorted by left position + ;; Insert `ivl` in sorted order + ;; (-> Ivl (Listof Ivl) (Listof Ivl)) + (define (ivl-insert ivl ivl*) (cond - [(null? i*) - j*] - [(null? j*) - i*] - [(< (car i*) (car j*)) - (cons (car i*) (intlist-union (cdr i*) j*))] - [(> (car i*) (car j*)) - (cons (car j*) (intlist-union i* (cdr j*)))] + [(null? ivl*) + (list ivl)] + [(< (car ivl) (caar ivl*)) + (cons ivl ivl*)] [else - (cons (car i*) (intlist-union (cdr i*) (cdr j*)))])) + (cons (car ivl*) (ivl-insert ivl (cdr ivl*)))])) + + (define (ivl-remove* ivl* i*) + (for/list ([i (in-list i*)] + #:when (not (for/or ([ivl (in-list ivl*)]) (in-ivl? ivl i)))) + i)) + + (define (ivl< ivl1 ivl2) + (and (< (car ivl2) (car ivl1)) + (< (cdr ivl1) (cdr ivl2)))) + + (define (in-ivl? ivl i) + (and (< (car ivl) i) + (< i (cdr ivl)))) (define (infer-return-type pattern-sym arg-stx) (if (and @@ -228,9 +252,8 @@ (define-syntax regexp-match: (make-alias #'regexp-match (lambda (stx) (syntax-parse stx [(_ pat:pattern/groups arg* ...) - #:with (type-sym num-groups null-idx*) (syntax/loc stx pat.evidence) - ;; TODO keep source location in type-sym, stop using format-id - ;; (Is it really that bad?) + #:with (type-sym . capture?*) + (syntax/loc stx pat.evidence) (syntax/loc stx (let ([maybe-match (regexp-match pat.expanded arg* ...)]) (if maybe-match