From b0d753e2d655e775e878d6bf1509673b0fd6100d Mon Sep 17 00:00:00 2001 From: Jon Zeppieri Date: Tue, 3 Sep 2019 18:48:45 -0400 Subject: [PATCH] Implements negated unicode categories in pregexps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The grammar for pregexps includes: | \p{‹property›} Match (UTF-8 encoded) in ‹property› | \P{‹property›} Match (UTF-8 encoded) not in ‹property› and is defined as: ‹property› ::= ‹category› Includes all characters in ‹category› | ^‹category› Includes all characters not in ‹category› That is to say, there are two independent ways to negate one of these character classes. The Racket implementation of regexps (as opposed to the C implementation) does not recognize negated categories. This PR fixes that. --- pkgs/racket-test-core/tests/racket/rx.rktl | 149 ++++++++++++--------- racket/src/regexp/demo.rkt | 4 + racket/src/regexp/parse/unicode.rkt | 13 +- 3 files changed, 98 insertions(+), 68 deletions(-) diff --git a/pkgs/racket-test-core/tests/racket/rx.rktl b/pkgs/racket-test-core/tests/racket/rx.rktl index 13db7a54c0..749efd5c65 100644 --- a/pkgs/racket-test-core/tests/racket/rx.rktl +++ b/pkgs/racket-test-core/tests/racket/rx.rktl @@ -1626,73 +1626,92 @@ (byte-pregexp (bformat "\\P{~a}" str)) (pregexp (format "\\P{~a}" str)) (byte-pregexp (bformat "\\P{~a}*" str)) - (pregexp (format "\\P{~a}*" str))))) + (pregexp (format "\\P{~a}*" str)) + + ;; Unicode categories can be negated with \P + ;; or by prefixing the category name with ^. + ;; These should be equivalent to the first + ;; 8 regexps, respectively. + (byte-pregexp (bformat "\\P{^~a}" str)) + (pregexp (format "\\P{^~a}" str)) + (byte-pregexp (bformat "\\P{^~a}*" str)) + (pregexp (format "\\P{^~a}*" str)) + (byte-pregexp (bformat "\\P{~a}" str)) + (pregexp (format "\\p{^~a}" str)) + (byte-pregexp (bformat "\\P{~a}*" str)) + (pregexp (format "\\p{^~a}*" str))))) '("Cn" "Cc" "Cf" "Cs" "Co" "Ll" "Lu" "Lt" "Lm" "Lo" "Nd" "Nl" "No" "Ps" "Pe" "Pi" "Pf" "Pc" "Pd" "Po" "Mn" "Mc" "Me" "Sc" "Sk" "Sm" "So" "Zl" "Zp" "Zs")) - (hash-for-each ht - (lambda (k v) - (let ([bad1 #"\377\377"] - [bad2 (regexp-replace #rx#".$" - (string->bytes/utf-8 "\U10FFF1") - #"\377")] - [bad3 (regexp-replace #rx#".$" - (string->bytes/utf-8 "\u1234") - #"")]) - (test #f regexp-match (vector-ref v 0) bad1) - (test #f regexp-match (vector-ref v 0) bad2) - (test #f regexp-match (vector-ref v 0) bad3) - (test #f regexp-match (vector-ref v 1) bad1) - (test #f regexp-match (vector-ref v 1) bad2) - (test #f regexp-match (vector-ref v 1) bad3) - (test #f regexp-match (vector-ref v 4) bad1) - (test #f regexp-match (vector-ref v 4) bad2) - (test #f regexp-match (vector-ref v 4) bad3) - (test #f regexp-match (vector-ref v 5) bad1) - (test #f regexp-match (vector-ref v 5) bad2) - (test #f regexp-match (vector-ref v 5) bad3) - (let ([other (ormap (lambda (e) - (and (not (eq? k (char-general-category - (integer->char (car e))))) - (integer->char (car e)))) - kcrl)]) - (let* ([s (string other)] - [bs (string->bytes/utf-8 s)] - [s* (string-append s s s)] - [bs* (bytes-append bs bs bs)]) - (test #f regexp-match (vector-ref v 0) bs) - (test #f regexp-match (vector-ref v 1) s) - (test '(#"") regexp-match (vector-ref v 2) bs) - (test '("") regexp-match (vector-ref v 3) s) - (test (list bs) regexp-match (vector-ref v 4) bs) - (test (list s) regexp-match (vector-ref v 5) s) - (test (list bs*) regexp-match (vector-ref v 6) bs*) - (test (list s*) regexp-match (vector-ref v 7) s*)))))) - (let ([try (lambda (n) - (let* ([cat (char-general-category (integer->char n))] - [v (hash-ref ht cat #f)]) - (when v - (when just-once? - (hash-remove! ht cat)) - (let* ([s (string (integer->char n))] - [bs (string->bytes/utf-8 s)] - [bs* (string->bytes/utf-8 (string-append s s s s s))] - [regexp-match* (lambda (p s) - (let ([v (regexp-match p (bytes->string/utf-8 s))]) - (and v - (map string->bytes/utf-8 v))))]) - (test (list bs) regexp-match (vector-ref v 0) bs) - (test (list bs) regexp-match* (vector-ref v 1) bs) - (test (list bs*) regexp-match (vector-ref v 2) bs*) - (test (list bs*) regexp-match* (vector-ref v 3) bs*)))))]) - (for-each (lambda (e) - (let ([start (car e)] - [end (cadr e)] - [uniform? (caddr e)]) - (let loop ([n start]) - (try n) - (unless (= n end) - (loop (if uniform? end (+ n 1))))))) - kcrl))) + (define equiv-offsets (list 0 8)) + (for-each + (lambda (off) + (define (ref v i) + (vector-ref v (+ i off))) + (hash-for-each ht + (lambda (k v) + (let ([bad1 #"\377\377"] + [bad2 (regexp-replace #rx#".$" + (string->bytes/utf-8 "\U10FFF1") + #"\377")] + [bad3 (regexp-replace #rx#".$" + (string->bytes/utf-8 "\u1234") + #"")]) + (test #f regexp-match (ref v 0) bad1) + (test #f regexp-match (ref v 0) bad2) + (test #f regexp-match (ref v 0) bad3) + (test #f regexp-match (ref v 1) bad1) + (test #f regexp-match (ref v 1) bad2) + (test #f regexp-match (ref v 1) bad3) + (test #f regexp-match (ref v 4) bad1) + (test #f regexp-match (ref v 4) bad2) + (test #f regexp-match (ref v 4) bad3) + (test #f regexp-match (ref v 5) bad1) + (test #f regexp-match (ref v 5) bad2) + (test #f regexp-match (ref v 5) bad3) + (let ([other (ormap (lambda (e) + (and (not (eq? k (char-general-category + (integer->char (car e))))) + (integer->char (car e)))) + kcrl)]) + (let* ([s (string other)] + [bs (string->bytes/utf-8 s)] + [s* (string-append s s s)] + [bs* (bytes-append bs bs bs)]) + (test #f regexp-match (ref v 0) bs) + (test #f regexp-match (ref v 1) s) + (test '(#"") regexp-match (ref v 2) bs) + (test '("") regexp-match (ref v 3) s) + (test (list bs) regexp-match (ref v 4) bs) + (test (list s) regexp-match (ref v 5) s) + (test (list bs*) regexp-match (ref v 6) bs*) + (test (list s*) regexp-match (ref v 7) s*)))))) + (let ([try (lambda (n) + (let* ([cat (char-general-category (integer->char n))] + [v (hash-ref ht cat #f)]) + (when v + (when just-once? + (hash-remove! ht cat)) + (let* ([s (string (integer->char n))] + [bs (string->bytes/utf-8 s)] + [bs* (string->bytes/utf-8 (string-append s s s s s))] + [regexp-match* (lambda (p s) + (let ([v (regexp-match p (bytes->string/utf-8 s))]) + (and v + (map string->bytes/utf-8 v))))]) + (test (list bs) regexp-match (ref v 0) bs) + (test (list bs) regexp-match* (ref v 1) bs) + (test (list bs*) regexp-match (ref v 2) bs*) + (test (list bs*) regexp-match* (ref v 3) bs*)))))]) + (for-each (lambda (e) + (let ([start (car e)] + [end (cadr e)] + [uniform? (caddr e)]) + (let loop ([n start]) + (try n) + (unless (= n end) + (loop (if uniform? end (+ n 1))))))) + kcrl))) + equiv-offsets)) (test '(#" ") regexp-match #px#"\t|\\p{Zs}" " ") diff --git a/racket/src/regexp/demo.rkt b/racket/src/regexp/demo.rkt index 2f4a525c4e..a3fbe6681c 100644 --- a/racket/src/regexp/demo.rkt +++ b/racket/src/regexp/demo.rkt @@ -160,6 +160,10 @@ #"aB" 1) +(check #"\\p{^Ll}" + #"aB" + 1) + (check #".*" #"abaacacaaacacaaacd" 100000) diff --git a/racket/src/regexp/parse/unicode.rkt b/racket/src/regexp/parse/unicode.rkt index 9ad2d6354c..6446e0ef63 100644 --- a/racket/src/regexp/parse/unicode.rkt +++ b/racket/src/regexp/parse/unicode.rkt @@ -1,5 +1,6 @@ #lang racket/base -(require "chyte.rkt" +(require (only-in racket/bool xor) + "chyte.rkt" "chyte-case.rkt" "ast.rkt" "config.rkt" @@ -11,8 +12,13 @@ (chyte-case/eos s pos [(#\{) + (define-values (cat-negated? next-pos) + (chyte-case/eos + s (add1 pos) + [(#\^) (values #t (+ pos 2))] + [else (values #f (add1 pos))])) (define-values (l pos2) - (let loop ([accum null] [pos (add1 pos)]) + (let loop ([accum null] [pos next-pos]) (chyte-case/eos s pos [(eos) @@ -67,7 +73,8 @@ "unrecognized property name in `\\~a{}`: `~a`" (integer->char p-c) (list->string (map integer->char l)))])) - (values (rx:unicode-categories categories (= p-c (char->integer #\p))) + (define prop-negated? (= p-c (char->integer #\P))) + (values (rx:unicode-categories categories (not (xor prop-negated? cat-negated?))) pos2)] [else (parse-error s pos config