Implements negated unicode categories in pregexps
The grammar for pregexps includes: | \p{‹property›} Match (UTF-8 encoded) in ‹property› | \P{‹property›} Match (UTF-8 encoded) not in ‹property› and <property> is defined as: ‹property› ::= ‹category› Includes all characters in ‹category› | ^‹category› Includes all characters not in ‹category› That is to say, there are two independent ways to negate one of these character classes. The Racket implementation of regexps (as opposed to the C implementation) does not recognize negated categories. This PR fixes that.
This commit is contained in:
parent
9fa6e9e25d
commit
b0d753e2d6
|
@ -1626,73 +1626,92 @@
|
|||
(byte-pregexp (bformat "\\P{~a}" str))
|
||||
(pregexp (format "\\P{~a}" str))
|
||||
(byte-pregexp (bformat "\\P{~a}*" str))
|
||||
(pregexp (format "\\P{~a}*" str)))))
|
||||
(pregexp (format "\\P{~a}*" str))
|
||||
|
||||
;; Unicode categories can be negated with \P
|
||||
;; or by prefixing the category name with ^.
|
||||
;; These should be equivalent to the first
|
||||
;; 8 regexps, respectively.
|
||||
(byte-pregexp (bformat "\\P{^~a}" str))
|
||||
(pregexp (format "\\P{^~a}" str))
|
||||
(byte-pregexp (bformat "\\P{^~a}*" str))
|
||||
(pregexp (format "\\P{^~a}*" str))
|
||||
(byte-pregexp (bformat "\\P{~a}" str))
|
||||
(pregexp (format "\\p{^~a}" str))
|
||||
(byte-pregexp (bformat "\\P{~a}*" str))
|
||||
(pregexp (format "\\p{^~a}*" str)))))
|
||||
'("Cn" "Cc" "Cf" "Cs" "Co" "Ll" "Lu" "Lt" "Lm" "Lo" "Nd" "Nl" "No"
|
||||
"Ps" "Pe" "Pi" "Pf" "Pc" "Pd" "Po" "Mn" "Mc" "Me" "Sc" "Sk" "Sm" "So" "Zl" "Zp" "Zs"))
|
||||
(hash-for-each ht
|
||||
(lambda (k v)
|
||||
(let ([bad1 #"\377\377"]
|
||||
[bad2 (regexp-replace #rx#".$"
|
||||
(string->bytes/utf-8 "\U10FFF1")
|
||||
#"\377")]
|
||||
[bad3 (regexp-replace #rx#".$"
|
||||
(string->bytes/utf-8 "\u1234")
|
||||
#"")])
|
||||
(test #f regexp-match (vector-ref v 0) bad1)
|
||||
(test #f regexp-match (vector-ref v 0) bad2)
|
||||
(test #f regexp-match (vector-ref v 0) bad3)
|
||||
(test #f regexp-match (vector-ref v 1) bad1)
|
||||
(test #f regexp-match (vector-ref v 1) bad2)
|
||||
(test #f regexp-match (vector-ref v 1) bad3)
|
||||
(test #f regexp-match (vector-ref v 4) bad1)
|
||||
(test #f regexp-match (vector-ref v 4) bad2)
|
||||
(test #f regexp-match (vector-ref v 4) bad3)
|
||||
(test #f regexp-match (vector-ref v 5) bad1)
|
||||
(test #f regexp-match (vector-ref v 5) bad2)
|
||||
(test #f regexp-match (vector-ref v 5) bad3)
|
||||
(let ([other (ormap (lambda (e)
|
||||
(and (not (eq? k (char-general-category
|
||||
(integer->char (car e)))))
|
||||
(integer->char (car e))))
|
||||
kcrl)])
|
||||
(let* ([s (string other)]
|
||||
[bs (string->bytes/utf-8 s)]
|
||||
[s* (string-append s s s)]
|
||||
[bs* (bytes-append bs bs bs)])
|
||||
(test #f regexp-match (vector-ref v 0) bs)
|
||||
(test #f regexp-match (vector-ref v 1) s)
|
||||
(test '(#"") regexp-match (vector-ref v 2) bs)
|
||||
(test '("") regexp-match (vector-ref v 3) s)
|
||||
(test (list bs) regexp-match (vector-ref v 4) bs)
|
||||
(test (list s) regexp-match (vector-ref v 5) s)
|
||||
(test (list bs*) regexp-match (vector-ref v 6) bs*)
|
||||
(test (list s*) regexp-match (vector-ref v 7) s*))))))
|
||||
(let ([try (lambda (n)
|
||||
(let* ([cat (char-general-category (integer->char n))]
|
||||
[v (hash-ref ht cat #f)])
|
||||
(when v
|
||||
(when just-once?
|
||||
(hash-remove! ht cat))
|
||||
(let* ([s (string (integer->char n))]
|
||||
[bs (string->bytes/utf-8 s)]
|
||||
[bs* (string->bytes/utf-8 (string-append s s s s s))]
|
||||
[regexp-match* (lambda (p s)
|
||||
(let ([v (regexp-match p (bytes->string/utf-8 s))])
|
||||
(and v
|
||||
(map string->bytes/utf-8 v))))])
|
||||
(test (list bs) regexp-match (vector-ref v 0) bs)
|
||||
(test (list bs) regexp-match* (vector-ref v 1) bs)
|
||||
(test (list bs*) regexp-match (vector-ref v 2) bs*)
|
||||
(test (list bs*) regexp-match* (vector-ref v 3) bs*)))))])
|
||||
(for-each (lambda (e)
|
||||
(let ([start (car e)]
|
||||
[end (cadr e)]
|
||||
[uniform? (caddr e)])
|
||||
(let loop ([n start])
|
||||
(try n)
|
||||
(unless (= n end)
|
||||
(loop (if uniform? end (+ n 1)))))))
|
||||
kcrl)))
|
||||
(define equiv-offsets (list 0 8))
|
||||
(for-each
|
||||
(lambda (off)
|
||||
(define (ref v i)
|
||||
(vector-ref v (+ i off)))
|
||||
(hash-for-each ht
|
||||
(lambda (k v)
|
||||
(let ([bad1 #"\377\377"]
|
||||
[bad2 (regexp-replace #rx#".$"
|
||||
(string->bytes/utf-8 "\U10FFF1")
|
||||
#"\377")]
|
||||
[bad3 (regexp-replace #rx#".$"
|
||||
(string->bytes/utf-8 "\u1234")
|
||||
#"")])
|
||||
(test #f regexp-match (ref v 0) bad1)
|
||||
(test #f regexp-match (ref v 0) bad2)
|
||||
(test #f regexp-match (ref v 0) bad3)
|
||||
(test #f regexp-match (ref v 1) bad1)
|
||||
(test #f regexp-match (ref v 1) bad2)
|
||||
(test #f regexp-match (ref v 1) bad3)
|
||||
(test #f regexp-match (ref v 4) bad1)
|
||||
(test #f regexp-match (ref v 4) bad2)
|
||||
(test #f regexp-match (ref v 4) bad3)
|
||||
(test #f regexp-match (ref v 5) bad1)
|
||||
(test #f regexp-match (ref v 5) bad2)
|
||||
(test #f regexp-match (ref v 5) bad3)
|
||||
(let ([other (ormap (lambda (e)
|
||||
(and (not (eq? k (char-general-category
|
||||
(integer->char (car e)))))
|
||||
(integer->char (car e))))
|
||||
kcrl)])
|
||||
(let* ([s (string other)]
|
||||
[bs (string->bytes/utf-8 s)]
|
||||
[s* (string-append s s s)]
|
||||
[bs* (bytes-append bs bs bs)])
|
||||
(test #f regexp-match (ref v 0) bs)
|
||||
(test #f regexp-match (ref v 1) s)
|
||||
(test '(#"") regexp-match (ref v 2) bs)
|
||||
(test '("") regexp-match (ref v 3) s)
|
||||
(test (list bs) regexp-match (ref v 4) bs)
|
||||
(test (list s) regexp-match (ref v 5) s)
|
||||
(test (list bs*) regexp-match (ref v 6) bs*)
|
||||
(test (list s*) regexp-match (ref v 7) s*))))))
|
||||
(let ([try (lambda (n)
|
||||
(let* ([cat (char-general-category (integer->char n))]
|
||||
[v (hash-ref ht cat #f)])
|
||||
(when v
|
||||
(when just-once?
|
||||
(hash-remove! ht cat))
|
||||
(let* ([s (string (integer->char n))]
|
||||
[bs (string->bytes/utf-8 s)]
|
||||
[bs* (string->bytes/utf-8 (string-append s s s s s))]
|
||||
[regexp-match* (lambda (p s)
|
||||
(let ([v (regexp-match p (bytes->string/utf-8 s))])
|
||||
(and v
|
||||
(map string->bytes/utf-8 v))))])
|
||||
(test (list bs) regexp-match (ref v 0) bs)
|
||||
(test (list bs) regexp-match* (ref v 1) bs)
|
||||
(test (list bs*) regexp-match (ref v 2) bs*)
|
||||
(test (list bs*) regexp-match* (ref v 3) bs*)))))])
|
||||
(for-each (lambda (e)
|
||||
(let ([start (car e)]
|
||||
[end (cadr e)]
|
||||
[uniform? (caddr e)])
|
||||
(let loop ([n start])
|
||||
(try n)
|
||||
(unless (= n end)
|
||||
(loop (if uniform? end (+ n 1)))))))
|
||||
kcrl)))
|
||||
equiv-offsets))
|
||||
|
||||
|
||||
(test '(#" ") regexp-match #px#"\t|\\p{Zs}" " ")
|
||||
|
|
|
@ -160,6 +160,10 @@
|
|||
#"aB"
|
||||
1)
|
||||
|
||||
(check #"\\p{^Ll}"
|
||||
#"aB"
|
||||
1)
|
||||
|
||||
(check #".*"
|
||||
#"abaacacaaacacaaacd"
|
||||
100000)
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
#lang racket/base
|
||||
(require "chyte.rkt"
|
||||
(require (only-in racket/bool xor)
|
||||
"chyte.rkt"
|
||||
"chyte-case.rkt"
|
||||
"ast.rkt"
|
||||
"config.rkt"
|
||||
|
@ -11,8 +12,13 @@
|
|||
(chyte-case/eos
|
||||
s pos
|
||||
[(#\{)
|
||||
(define-values (cat-negated? next-pos)
|
||||
(chyte-case/eos
|
||||
s (add1 pos)
|
||||
[(#\^) (values #t (+ pos 2))]
|
||||
[else (values #f (add1 pos))]))
|
||||
(define-values (l pos2)
|
||||
(let loop ([accum null] [pos (add1 pos)])
|
||||
(let loop ([accum null] [pos next-pos])
|
||||
(chyte-case/eos
|
||||
s pos
|
||||
[(eos)
|
||||
|
@ -67,7 +73,8 @@
|
|||
"unrecognized property name in `\\~a{}`: `~a`"
|
||||
(integer->char p-c)
|
||||
(list->string (map integer->char l)))]))
|
||||
(values (rx:unicode-categories categories (= p-c (char->integer #\p)))
|
||||
(define prop-negated? (= p-c (char->integer #\P)))
|
||||
(values (rx:unicode-categories categories (not (xor prop-negated? cat-negated?)))
|
||||
pos2)]
|
||||
[else
|
||||
(parse-error s pos config
|
||||
|
|
Loading…
Reference in New Issue
Block a user