284 lines
9.4 KiB
Racket
284 lines
9.4 KiB
Racket
#lang typed/racket/base
|
|
|
|
;; Stronger types for regular expression matching.
|
|
|
|
;; Specification:
|
|
;; - Racket docs:
|
|
;; http://docs.racket-lang.org/reference/regexp.html
|
|
;;
|
|
;; - Pregexp docs:
|
|
;; http://ds26gte.github.io/pregexp/index.html
|
|
;;
|
|
;; - Racket source:
|
|
;; https://github.com/racket/racket/blob/master/racket/src/racket/src/regexp.c
|
|
|
|
(provide
|
|
regexp:
|
|
pregexp:
|
|
byte-regexp:
|
|
byte-pregexp:
|
|
define-regexp:
|
|
let-regexp:
|
|
|
|
regexp-match:
|
|
|
|
(for-syntax
|
|
rx-key
|
|
rx-define
|
|
rx-let)
|
|
)
|
|
|
|
(require
|
|
(for-syntax
|
|
(only-in racket/syntax format-id)
|
|
typed/racket/base
|
|
(only-in racket/list range)
|
|
(only-in racket/format ~a)
|
|
syntax/parse
|
|
trivial/private/common))
|
|
|
|
;; =============================================================================
|
|
|
|
(begin-for-syntax
|
|
(define errloc-key 'regexp-match:)
|
|
|
|
(define (group-error stx str reason)
|
|
(raise-user-error errloc-key
|
|
"(~a:~a) Invalid regexp pattern (unmatched ~a) in ~a"
|
|
(syntax-line stx)
|
|
(syntax-column stx)
|
|
reason
|
|
str))
|
|
|
|
;; Dispatch for counting groups
|
|
;; On success, return (Listof Boolean)
|
|
;; - booleans indicating "always succeeds" (#t) and "may fail" (#f)
|
|
(define (parse-groups v-stx)
|
|
(define v (quoted-stx-value? v-stx))
|
|
(cond
|
|
[(string? v) (parse-groups/string v #:src v-stx)]
|
|
[(regexp? v) (parse-groups/regexp v #:src v-stx)]
|
|
[(pregexp? v) (parse-groups/pregexp v #:src v-stx)]
|
|
[(bytes? v) (parse-groups/bytes v #:src v-stx)]
|
|
[(byte-regexp? v) (parse-groups/byte-regexp v #:src v-stx)]
|
|
[(byte-pregexp? v) (parse-groups/byte-pregexp v #:src v-stx)]
|
|
[else #f]))
|
|
|
|
(define (parse-groups/string str #:src stx)
|
|
(parse-groups/untyped str #:src stx))
|
|
|
|
(define (parse-groups/bytes b #:src stx)
|
|
(parse-groups/untyped (~a b) #:src stx))
|
|
|
|
(define (parse-groups/regexp rx #:src stx)
|
|
(parse-groups/string (~a rx) #:src stx))
|
|
|
|
(define parse-groups/pregexp
|
|
parse-groups/regexp)
|
|
|
|
(define (parse-groups/byte-regexp bx #:src stx)
|
|
(parse-groups/bytes (~a bx) #:src stx))
|
|
|
|
(define parse-groups/byte-pregexp
|
|
parse-groups/byte-regexp)
|
|
|
|
(define-values (rx-key rx? rx-define rx-let)
|
|
(make-value-property 'rx:groups parse-groups))
|
|
(define-syntax-class/predicate pattern/groups rx?)
|
|
|
|
;; (-> String #:src Syntax (Listof Boolean))
|
|
(define (parse-groups/untyped str #:src stx)
|
|
(define char->pos*
|
|
(let ([H (unescaped-pos* str '(#\[ #\] #\( #\) #\| #\?))])
|
|
(lambda (c)
|
|
(hash-ref H c (lambda () (raise-user-error 'parse-groups "No position data for '~a' character" c))))))
|
|
;; -- check that [] are matched
|
|
(define brack-ivl*
|
|
(let* ([l-brack-pos* (char->pos* #\[)]
|
|
[r-brack-pos* (char->pos* #\])]
|
|
[r (pair-up l-brack-pos* r-brack-pos*)])
|
|
;; ?? okay for brackets to nest?
|
|
(if (list? r)
|
|
r
|
|
(let ([brack-char (if (memv r l-brack-pos*) "[" "]")])
|
|
(group-error stx str (format "'~a' at index ~a" brack-char r))))))
|
|
;; -- ignore characters between a pair of brackets
|
|
(define-values (l-paren-pos* r-paren-pos* pipe-pos* ?-pos*)
|
|
(apply values
|
|
(for/list ([c (in-list '(#\( #\) #\| #\?))])
|
|
(ivl-remove* brack-ivl* (char->pos* c)))))
|
|
;; -- check that () are matched
|
|
(define paren-ivl*
|
|
(let ([r (pair-up l-paren-pos* r-paren-pos*)])
|
|
(if (list? r)
|
|
r
|
|
(let ([paren-char (if (memv r l-paren-pos*) "(" ")")])
|
|
(group-error stx str (format "'~a' at index ~a" paren-char r))))))
|
|
;; -- groups = #parens.
|
|
;; may fail to capture if has | outside (that are not nested in other parens)
|
|
;; or ? after close
|
|
(for/list ([ivl (in-list paren-ivl*)]
|
|
#:when (not (has-?-before ivl ?-pos*)))
|
|
(and
|
|
(not (has-unguarded-pipe-before-or-after ivl paren-ivl* pipe-pos*))
|
|
(not (has-*-after ivl str))
|
|
(not (has-?-after ivl ?-pos*)))))
|
|
|
|
(define (has-?-before ivl ?-pos*)
|
|
(define pos-before (+ 1 (car ivl))) ;; Well, just inside the paren.
|
|
(for/or ([?pos (in-list ?-pos*)])
|
|
(= pos-before ?pos)))
|
|
|
|
(define (has-?-after ivl ?-pos*)
|
|
(define pos-after (+ 1 (cdr ivl)))
|
|
(for/or ([?pos (in-list ?-pos*)])
|
|
(= pos-after ?pos)))
|
|
|
|
(define (has-*-after ivl str)
|
|
(let ([i (+ 1 (cdr ivl))])
|
|
(and (< i (string-length str))
|
|
(eq? #\* (string-ref str i)))))
|
|
|
|
(define (has-unguarded-pipe-before-or-after ivl paren-ivl* pipe-pos*)
|
|
(define other-paren-ivl*
|
|
(for/list ([ivl2 (in-list paren-ivl*)]
|
|
#:when (not (ivl< ivl ivl2)))
|
|
ivl2))
|
|
(define dangerous-pipe* (ivl-remove* other-paren-ivl* pipe-pos*))
|
|
(not (null? dangerous-pipe*)))
|
|
|
|
;; Does not work for #\\ character
|
|
(define (unescaped-pos* str c*)
|
|
(define L (string-length str))
|
|
(define escaped? (box #f))
|
|
(define most-recent-char (box #f))
|
|
(define (have-char-at-index? c i hist)
|
|
(memv i (hash-ref hist c)))
|
|
(define h-rev
|
|
(for/fold ([hist (for/hasheq ([c (in-list c*)]) (values c '()))])
|
|
([i (in-range L)])
|
|
(define char (string-ref str i))
|
|
(cond
|
|
[(unbox escaped?)
|
|
(when (or (not (eq? #\\ char))
|
|
(eq? #\[ (unbox most-recent-char)))
|
|
(set-box! escaped? #f))
|
|
hist]
|
|
[(eq? #\\ char)
|
|
(set-box! escaped? #t)
|
|
hist]
|
|
;; --- special case for singleton <rng>,
|
|
;; documented at `http://docs.racket-lang.org/reference/regexp.html`
|
|
[(and (eq? #\] char)
|
|
(or (have-char-at-index? #\[ (- i 1) hist) ;; []] pattern
|
|
(and (have-char-at-index? #\[ (- i 2) hist)
|
|
(eq? #\^ (string-ref str (- i 1)))))) ;; [^]] pattern
|
|
hist]
|
|
[else
|
|
(let ([i* (hash-ref hist char #f)])
|
|
(if i*
|
|
(begin
|
|
(set-box! most-recent-char char)
|
|
(hash-set hist char (cons i i*)))
|
|
hist))])))
|
|
;; -- reverse all saved lists
|
|
(for/hasheq ([(c i*) (in-hash h-rev)])
|
|
(values c (reverse i*))))
|
|
|
|
;; (define-type Ivl (Pairof Natural Natural))
|
|
|
|
;; Match a list of left indices with a list of right indices.
|
|
;; Return a list of pairs on success
|
|
;; and the unmatched index on failure.
|
|
;; (-> (Listof Natural) (Listof Natural) (U Natural (Listof Ivl)))
|
|
(define (pair-up l* r*)
|
|
(let loop ([i 0] [l* l*] [r* r*] [prev* '()])
|
|
(cond
|
|
[(null? r*)
|
|
(if (null? l*)
|
|
(if (null? prev*)
|
|
'() ;; good
|
|
(car prev*)) ;; bad
|
|
(car l*))] ;; bad
|
|
[(= i (car r*))
|
|
(if (null? prev*)
|
|
i
|
|
(let ([r (loop (+ i 1) l* (cdr r*) (cdr prev*))])
|
|
(if (integer? r)
|
|
r
|
|
(ivl-insert (cons (car prev*) i) r))))]
|
|
[(or (null? l*) (< i (car l*)))
|
|
(loop (+ i 1) l* r* prev*)]
|
|
[(= i (car l*))
|
|
(loop (+ i 1) (cdr l*) r* (cons i prev*))])))
|
|
|
|
;; Assume `ivl*` is sorted by left position
|
|
;; Insert `ivl` in sorted order
|
|
;; (-> Ivl (Listof Ivl) (Listof Ivl))
|
|
(define (ivl-insert ivl ivl*)
|
|
(cond
|
|
[(null? ivl*)
|
|
(list ivl)]
|
|
[(< (car ivl) (caar ivl*))
|
|
(cons ivl ivl*)]
|
|
[else
|
|
(cons (car ivl*) (ivl-insert ivl (cdr ivl*)))]))
|
|
|
|
(define (ivl-remove* ivl* i*)
|
|
(for/list ([i (in-list i*)]
|
|
#:when (not (for/or ([ivl (in-list ivl*)]) (in-ivl? ivl i))))
|
|
i))
|
|
|
|
(define (ivl< ivl1 ivl2)
|
|
(and (< (car ivl2) (car ivl1))
|
|
(< (cdr ivl1) (cdr ivl2))))
|
|
|
|
(define (in-ivl? ivl i)
|
|
(and (< (car ivl) i)
|
|
(< i (cdr ivl))))
|
|
|
|
)
|
|
|
|
;; -----------------------------------------------------------------------------
|
|
|
|
(define-syntax (define-matcher* stx)
|
|
(syntax-parse stx
|
|
[(_ f*:id ...)
|
|
#:with (f+* ...) (for/list ([f (in-list (syntax-e #'(f* ...)))])
|
|
(format-id stx "~a:" (syntax-e f)))
|
|
#`(begin
|
|
(define-syntax f+* (make-alias #'f*
|
|
(lambda (stx) (syntax-parse stx
|
|
[(_ pat:pattern/groups)
|
|
(syntax-property
|
|
(syntax/loc stx (f* pat.expanded))
|
|
rx-key
|
|
#'pat.evidence)]
|
|
[_ #f])))) ...)]))
|
|
|
|
(define-matcher* regexp pregexp byte-regexp byte-pregexp)
|
|
|
|
(define-syntax define-regexp: (make-keyword-alias 'define rx-define))
|
|
(define-syntax let-regexp: (make-keyword-alias 'let rx-let))
|
|
|
|
(define-syntax regexp-match: (make-alias #'regexp-match
|
|
(lambda (stx) (syntax-parse stx
|
|
[(_ pat:pattern/groups arg* ...)
|
|
#:with capture?* (syntax/loc stx pat.evidence)
|
|
(quasisyntax/loc stx
|
|
(let ([maybe-match (regexp-match pat.expanded arg* ...)])
|
|
(if maybe-match
|
|
;; -- Use `(or ... error)` to force guaranteed-capture groups.
|
|
(let ([rxm-error (lambda (i) (raise-user-error 'regexp-match: "Possible bug: expected group ~a to capture based on rx pattern '~a', but capture failed.\n Please report to 'http://github.com/bennn/trivial/issues' and use Racket's regexp-match in the meantime." i pat.expanded))])
|
|
(list (car maybe-match)
|
|
#,@(for/list ([capture?-stx (in-list (syntax-e #'capture?*))]
|
|
[i (in-naturals 1)])
|
|
(if (syntax-e capture?-stx)
|
|
(quasisyntax/loc stx
|
|
(or (list-ref maybe-match '#,i) (rxm-error '#,i)))
|
|
(quasisyntax/loc stx
|
|
(list-ref maybe-match '#,i))))))
|
|
#f)))]
|
|
[_ #f]))))
|
|
|