Implements negated unicode categories in pregexps

The grammar for pregexps includes:

   | \p{‹property›} Match (UTF-8 encoded) in ‹property›
   | \P{‹property›} Match (UTF-8 encoded) not in ‹property›

and <property> is defined as:

   ‹property› ::= ‹category› Includes all characters in ‹category›
               |  ^‹category› Includes all characters not in ‹category›

That is to say, there are two independent ways to negate one of
these character classes. The Racket implementation of regexps
(as opposed to the C implementation) does not recognize negated
categories. This PR fixes that.
This commit is contained in:
Jon Zeppieri 2019-09-03 18:48:45 -04:00 committed by Matthew Flatt
parent 9fa6e9e25d
commit b0d753e2d6
3 changed files with 98 additions and 68 deletions

View File

@ -1626,73 +1626,92 @@
(byte-pregexp (bformat "\\P{~a}" str))
(pregexp (format "\\P{~a}" str))
(byte-pregexp (bformat "\\P{~a}*" str))
(pregexp (format "\\P{~a}*" str)))))
(pregexp (format "\\P{~a}*" str))
;; Unicode categories can be negated with \P
;; or by prefixing the category name with ^.
;; These should be equivalent to the first
;; 8 regexps, respectively.
(byte-pregexp (bformat "\\P{^~a}" str))
(pregexp (format "\\P{^~a}" str))
(byte-pregexp (bformat "\\P{^~a}*" str))
(pregexp (format "\\P{^~a}*" str))
(byte-pregexp (bformat "\\P{~a}" str))
(pregexp (format "\\p{^~a}" str))
(byte-pregexp (bformat "\\P{~a}*" str))
(pregexp (format "\\p{^~a}*" str)))))
'("Cn" "Cc" "Cf" "Cs" "Co" "Ll" "Lu" "Lt" "Lm" "Lo" "Nd" "Nl" "No"
"Ps" "Pe" "Pi" "Pf" "Pc" "Pd" "Po" "Mn" "Mc" "Me" "Sc" "Sk" "Sm" "So" "Zl" "Zp" "Zs"))
(hash-for-each ht
(lambda (k v)
(let ([bad1 #"\377\377"]
[bad2 (regexp-replace #rx#".$"
(string->bytes/utf-8 "\U10FFF1")
#"\377")]
[bad3 (regexp-replace #rx#".$"
(string->bytes/utf-8 "\u1234")
#"")])
(test #f regexp-match (vector-ref v 0) bad1)
(test #f regexp-match (vector-ref v 0) bad2)
(test #f regexp-match (vector-ref v 0) bad3)
(test #f regexp-match (vector-ref v 1) bad1)
(test #f regexp-match (vector-ref v 1) bad2)
(test #f regexp-match (vector-ref v 1) bad3)
(test #f regexp-match (vector-ref v 4) bad1)
(test #f regexp-match (vector-ref v 4) bad2)
(test #f regexp-match (vector-ref v 4) bad3)
(test #f regexp-match (vector-ref v 5) bad1)
(test #f regexp-match (vector-ref v 5) bad2)
(test #f regexp-match (vector-ref v 5) bad3)
(let ([other (ormap (lambda (e)
(and (not (eq? k (char-general-category
(integer->char (car e)))))
(integer->char (car e))))
kcrl)])
(let* ([s (string other)]
[bs (string->bytes/utf-8 s)]
[s* (string-append s s s)]
[bs* (bytes-append bs bs bs)])
(test #f regexp-match (vector-ref v 0) bs)
(test #f regexp-match (vector-ref v 1) s)
(test '(#"") regexp-match (vector-ref v 2) bs)
(test '("") regexp-match (vector-ref v 3) s)
(test (list bs) regexp-match (vector-ref v 4) bs)
(test (list s) regexp-match (vector-ref v 5) s)
(test (list bs*) regexp-match (vector-ref v 6) bs*)
(test (list s*) regexp-match (vector-ref v 7) s*))))))
(let ([try (lambda (n)
(let* ([cat (char-general-category (integer->char n))]
[v (hash-ref ht cat #f)])
(when v
(when just-once?
(hash-remove! ht cat))
(let* ([s (string (integer->char n))]
[bs (string->bytes/utf-8 s)]
[bs* (string->bytes/utf-8 (string-append s s s s s))]
[regexp-match* (lambda (p s)
(let ([v (regexp-match p (bytes->string/utf-8 s))])
(and v
(map string->bytes/utf-8 v))))])
(test (list bs) regexp-match (vector-ref v 0) bs)
(test (list bs) regexp-match* (vector-ref v 1) bs)
(test (list bs*) regexp-match (vector-ref v 2) bs*)
(test (list bs*) regexp-match* (vector-ref v 3) bs*)))))])
(for-each (lambda (e)
(let ([start (car e)]
[end (cadr e)]
[uniform? (caddr e)])
(let loop ([n start])
(try n)
(unless (= n end)
(loop (if uniform? end (+ n 1)))))))
kcrl)))
(define equiv-offsets (list 0 8))
(for-each
(lambda (off)
(define (ref v i)
(vector-ref v (+ i off)))
(hash-for-each ht
(lambda (k v)
(let ([bad1 #"\377\377"]
[bad2 (regexp-replace #rx#".$"
(string->bytes/utf-8 "\U10FFF1")
#"\377")]
[bad3 (regexp-replace #rx#".$"
(string->bytes/utf-8 "\u1234")
#"")])
(test #f regexp-match (ref v 0) bad1)
(test #f regexp-match (ref v 0) bad2)
(test #f regexp-match (ref v 0) bad3)
(test #f regexp-match (ref v 1) bad1)
(test #f regexp-match (ref v 1) bad2)
(test #f regexp-match (ref v 1) bad3)
(test #f regexp-match (ref v 4) bad1)
(test #f regexp-match (ref v 4) bad2)
(test #f regexp-match (ref v 4) bad3)
(test #f regexp-match (ref v 5) bad1)
(test #f regexp-match (ref v 5) bad2)
(test #f regexp-match (ref v 5) bad3)
(let ([other (ormap (lambda (e)
(and (not (eq? k (char-general-category
(integer->char (car e)))))
(integer->char (car e))))
kcrl)])
(let* ([s (string other)]
[bs (string->bytes/utf-8 s)]
[s* (string-append s s s)]
[bs* (bytes-append bs bs bs)])
(test #f regexp-match (ref v 0) bs)
(test #f regexp-match (ref v 1) s)
(test '(#"") regexp-match (ref v 2) bs)
(test '("") regexp-match (ref v 3) s)
(test (list bs) regexp-match (ref v 4) bs)
(test (list s) regexp-match (ref v 5) s)
(test (list bs*) regexp-match (ref v 6) bs*)
(test (list s*) regexp-match (ref v 7) s*))))))
(let ([try (lambda (n)
(let* ([cat (char-general-category (integer->char n))]
[v (hash-ref ht cat #f)])
(when v
(when just-once?
(hash-remove! ht cat))
(let* ([s (string (integer->char n))]
[bs (string->bytes/utf-8 s)]
[bs* (string->bytes/utf-8 (string-append s s s s s))]
[regexp-match* (lambda (p s)
(let ([v (regexp-match p (bytes->string/utf-8 s))])
(and v
(map string->bytes/utf-8 v))))])
(test (list bs) regexp-match (ref v 0) bs)
(test (list bs) regexp-match* (ref v 1) bs)
(test (list bs*) regexp-match (ref v 2) bs*)
(test (list bs*) regexp-match* (ref v 3) bs*)))))])
(for-each (lambda (e)
(let ([start (car e)]
[end (cadr e)]
[uniform? (caddr e)])
(let loop ([n start])
(try n)
(unless (= n end)
(loop (if uniform? end (+ n 1)))))))
kcrl)))
equiv-offsets))
(test '(#" ") regexp-match #px#"\t|\\p{Zs}" " ")

View File

@ -160,6 +160,10 @@
#"aB"
1)
(check #"\\p{^Ll}"
#"aB"
1)
(check #".*"
#"abaacacaaacacaaacd"
100000)

View File

@ -1,5 +1,6 @@
#lang racket/base
(require "chyte.rkt"
(require (only-in racket/bool xor)
"chyte.rkt"
"chyte-case.rkt"
"ast.rkt"
"config.rkt"
@ -11,8 +12,13 @@
(chyte-case/eos
s pos
[(#\{)
(define-values (cat-negated? next-pos)
(chyte-case/eos
s (add1 pos)
[(#\^) (values #t (+ pos 2))]
[else (values #f (add1 pos))]))
(define-values (l pos2)
(let loop ([accum null] [pos (add1 pos)])
(let loop ([accum null] [pos next-pos])
(chyte-case/eos
s pos
[(eos)
@ -67,7 +73,8 @@
"unrecognized property name in `\\~a{}`: `~a`"
(integer->char p-c)
(list->string (map integer->char l)))]))
(values (rx:unicode-categories categories (= p-c (char->integer #\p)))
(define prop-negated? (= p-c (char->integer #\P)))
(values (rx:unicode-categories categories (not (xor prop-negated? cat-negated?)))
pos2)]
[else
(parse-error s pos config