fix \D, \S, \W in string regexp
The documentation and implementation were confused about whether \D, \S, and \W match non-ASCII characters. Now they do. The new regexp implementation (as used in Racket CS) already matched them.
This commit is contained in:
parent
f4c48dd9b5
commit
30e260835f
|
@ -85,11 +85,11 @@
|
|||
| Modem Like Mode, but in multi mode #mode
|
||||
| Mode-m Like Mode, but not in multi mode #mode
|
||||
Class ::= \d Contains _0_-_9_ #cat 23
|
||||
| \D Contains ASCII other than those in _\d_ #cat
|
||||
| \D Contains characters not in _\d_ #cat
|
||||
| \w Contains _a_-_z_, _A_-_Z_, _0_-_9_, ___ #cat 24
|
||||
| \W Contains ASCII other than those in _\w_ #cat
|
||||
| \W Contains characters not in _\w_ #cat
|
||||
| \s Contains space, tab, newline, formfeed, return #cat 25
|
||||
| \S Contains ASCII other than those in _\s_ #cat
|
||||
| \S Contains characters not in _\s_ #cat
|
||||
Posix ::= [:alpha:] Contains _a_-_z_, _A_-_Z_ #cat
|
||||
| [:upper:] Contains _A_-_Z_ #cat
|
||||
| [:lower:] Contains _a_-_z_ #cat 26
|
||||
|
|
|
@ -1117,6 +1117,19 @@
|
|||
(test '("\u1F39") regexp-match #rx"[^\u1F79-\u3F79]" "\u1F39")
|
||||
(test '("\u1F78") regexp-match #rx"[^\u1F79-\u3F79]" "\u1F78")
|
||||
|
||||
(test '("\u3BB") regexp-match #px"\\D" "\u3BB")
|
||||
(test '("\u3BB") regexp-match #px"[\\D]" "\u3BB")
|
||||
(test '("a") regexp-match #px"[\\D]" "a")
|
||||
(test #f regexp-match #px"\\D" "0")
|
||||
(test #f regexp-match #px"\\D" "9")
|
||||
(test '("\u3BB") regexp-match #px"\\S" "\u3BB")
|
||||
(test '("\u3BB") regexp-match #px"[\\S]" "\u3BB")
|
||||
(test '("a") regexp-match #px"\\S" "a")
|
||||
(test #f regexp-match #px"\\S" " ")
|
||||
(test '("\u3BB") regexp-match #px"\\W" "\u3BB")
|
||||
(test '("\u3BB") regexp-match #px"[\\W]" "\u3BB")
|
||||
(test '("+") regexp-match #px"\\W" "+")
|
||||
(test #f regexp-match #px"\\W" "a")
|
||||
|
||||
;; Regexps that shouldn't parse:
|
||||
(err/rt-test (regexp "[a--b\u1F78]") exn:fail?)
|
||||
|
|
|
@ -1485,7 +1485,7 @@ regatom(int *flagp, int parse_flags, int at_start)
|
|||
return ret;
|
||||
}
|
||||
|
||||
static int regcharclass(int c, char *map)
|
||||
static int regcharclass(int c, char *map, int *_non_ascii)
|
||||
{
|
||||
switch(c) {
|
||||
case 'd':
|
||||
|
@ -1497,9 +1497,11 @@ static int regcharclass(int c, char *map)
|
|||
for (c = 0; c < '0'; c++) {
|
||||
map[c] = 1;
|
||||
}
|
||||
for (c = '9' + 1; c < 256; c++) {
|
||||
for (c = '9' + 1; c < (_non_ascii ? 128 : 256); c++) {
|
||||
map[c] = 1;
|
||||
}
|
||||
if (_non_ascii)
|
||||
*_non_ascii = 1;
|
||||
break;
|
||||
case 'w':
|
||||
for (c = 0; c < 26; c++) {
|
||||
|
@ -1521,9 +1523,11 @@ static int regcharclass(int c, char *map)
|
|||
for (c = 'Z' + 1; c < '_'; c++) {
|
||||
map[c] = 1;
|
||||
}
|
||||
for (c = 'z' + 1; c < 256; c++) {
|
||||
for (c = 'z' + 1; c < (_non_ascii ? 128 : 256); c++) {
|
||||
map[c] = 1;
|
||||
}
|
||||
if (_non_ascii)
|
||||
*_non_ascii = 1;
|
||||
break;
|
||||
case 's':
|
||||
map['\t'] = 1;
|
||||
|
@ -1533,7 +1537,7 @@ static int regcharclass(int c, char *map)
|
|||
map[' '] = 1;
|
||||
break;
|
||||
case 'S':
|
||||
for (c = 0; c < 256; c++) {
|
||||
for (c = 0; c < (_non_ascii ? 128 : 256); c++) {
|
||||
switch (c) {
|
||||
case '\t':
|
||||
case '\n':
|
||||
|
@ -1546,6 +1550,8 @@ static int regcharclass(int c, char *map)
|
|||
break;
|
||||
}
|
||||
}
|
||||
if (_non_ascii)
|
||||
*_non_ascii = 1;
|
||||
break;
|
||||
default:
|
||||
if (((c >= 'a') && (c <= 'z'))
|
||||
|
@ -1566,7 +1572,7 @@ static int is_posix_char_class(char *str, int pos, int len, char *map)
|
|||
if (pos + 8 <= len) {
|
||||
if (!scheme_strncmp(":alnum:]", str XFORM_OK_PLUS pos, 8)) {
|
||||
if (map) {
|
||||
regcharclass('d', map);
|
||||
regcharclass('d', map, NULL);
|
||||
for (c = 'a'; c <= 'z'; c++) {
|
||||
map[c] = 1;
|
||||
map[c - ('a' - 'A')] = 1;
|
||||
|
@ -1603,7 +1609,7 @@ static int is_posix_char_class(char *str, int pos, int len, char *map)
|
|||
return 1;
|
||||
} else if (!scheme_strncmp(":digit:]", str XFORM_OK_PLUS pos, 8)) {
|
||||
if (map) {
|
||||
regcharclass('d', map);
|
||||
regcharclass('d', map, NULL);
|
||||
}
|
||||
return 1;
|
||||
} else if (!scheme_strncmp(":graph:]", str XFORM_OK_PLUS pos, 8)) {
|
||||
|
@ -1633,7 +1639,7 @@ static int is_posix_char_class(char *str, int pos, int len, char *map)
|
|||
return 1;
|
||||
} else if (!scheme_strncmp(":space:]", str XFORM_OK_PLUS pos, 8)) {
|
||||
if (map) {
|
||||
regcharclass('s', map);
|
||||
regcharclass('s', map, NULL);
|
||||
}
|
||||
return 1;
|
||||
} else if (!scheme_strncmp(":upper:]", str XFORM_OK_PLUS pos, 8)) {
|
||||
|
@ -1649,7 +1655,7 @@ static int is_posix_char_class(char *str, int pos, int len, char *map)
|
|||
if ((pos + 7 <= len)
|
||||
&& !scheme_strncmp(":word:]", str XFORM_OK_PLUS pos, 7)) {
|
||||
if (map) {
|
||||
regcharclass('w', map);
|
||||
regcharclass('w', map, NULL);
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
@ -1657,7 +1663,7 @@ static int is_posix_char_class(char *str, int pos, int len, char *map)
|
|||
if ((pos + 9 <= len)
|
||||
&& !scheme_strncmp(":xdigit:]", str XFORM_OK_PLUS pos, 9)) {
|
||||
if (map) {
|
||||
regcharclass('d', map);
|
||||
regcharclass('d', map, NULL);
|
||||
for (c = 'a'; c <= 'f'; c++) {
|
||||
map[c] = 1;
|
||||
map[c - ('a' - 'A')] = 1;
|
||||
|
@ -1753,7 +1759,7 @@ static char *regrange(int parse_flags, char *map)
|
|||
c = UCHAR(regparsestr[regparse + 1]);
|
||||
if (((c >= 'a') && (c <= 'z'))
|
||||
|| ((c >= 'A') && (c <= 'Z'))) {
|
||||
regcharclass(c, map);
|
||||
regcharclass(c, map, NULL);
|
||||
can_range = 0;
|
||||
} else {
|
||||
map[c] = 1;
|
||||
|
@ -1832,7 +1838,7 @@ regranges(int parse_flags, int at_start)
|
|||
/* unicode char class; give up */
|
||||
break;
|
||||
}
|
||||
regcharclass(regparsestr[regparse], new_map);
|
||||
regcharclass(regparsestr[regparse], new_map, NULL);
|
||||
|
||||
} else
|
||||
new_map[c] = 1;
|
||||
|
@ -4537,7 +4543,7 @@ static int translate(unsigned char *s, int len, char **result, int pcre)
|
|||
|
||||
rs.orig_len = len;
|
||||
rs.size = len;
|
||||
|
||||
|
||||
r = (unsigned char *)scheme_malloc_atomic(rs.size + 1);
|
||||
|
||||
/* We need to translate if the pattern contains any use of ".", if
|
||||
|
@ -4563,9 +4569,11 @@ static int translate(unsigned char *s, int len, char **result, int pcre)
|
|||
while ((k < len) && (s[k] != ']')) {
|
||||
if (s[k] > 127)
|
||||
saw_big = 1;
|
||||
else if (pcre && (s[k] == '\\') && (k + 1 < len))
|
||||
else if (pcre && (s[k] == '\\') && (k + 1 < len)) {
|
||||
if ((s[k+1] == 'D') || (s[k+1] == 'W') || (s[k+1] == 'S'))
|
||||
saw_big = 1;
|
||||
k++;
|
||||
else if (pcre
|
||||
} else if (pcre
|
||||
&& (s[k] == '[')
|
||||
&& (k + 1 < len)
|
||||
&& (s[k+1] == ':')
|
||||
|
@ -4584,6 +4592,7 @@ static int translate(unsigned char *s, int len, char **result, int pcre)
|
|||
} else {
|
||||
/* Need to translate. */
|
||||
char *simple_on;
|
||||
int non_ascii = 0;
|
||||
Scheme_Object *ranges;
|
||||
unsigned int *us, *range_array;
|
||||
int ulen, on_count, range_len, rp, p;
|
||||
|
@ -4593,7 +4602,8 @@ static int translate(unsigned char *s, int len, char **result, int pcre)
|
|||
scheme_utf8_decode(s, rs.i + 1, k, us, 0, -1, NULL, 0, 0);
|
||||
|
||||
/* The simple_on array lists ASCII chars to (not) find
|
||||
for the match */
|
||||
for the match, and `non_ascii` virtually extends
|
||||
to the rest of Unicode */
|
||||
simple_on = (char *)scheme_malloc_atomic(128);
|
||||
memset(simple_on, 0, 128);
|
||||
/* The ranges list is pairs of larger ranges */
|
||||
|
@ -4689,7 +4699,7 @@ static int translate(unsigned char *s, int len, char **result, int pcre)
|
|||
int c = us[p + 1];
|
||||
if (((c >= 'a') && (c <= 'z'))
|
||||
|| ((c >= 'A') && (c <= 'Z'))) {
|
||||
regcharclass(c, simple_on);
|
||||
regcharclass(c, simple_on, &non_ascii);
|
||||
p += 2;
|
||||
} else if (c < 128) {
|
||||
simple_on[c] = 1;
|
||||
|
@ -4751,6 +4761,13 @@ static int translate(unsigned char *s, int len, char **result, int pcre)
|
|||
}
|
||||
}
|
||||
|
||||
if (non_ascii) {
|
||||
/* Replace the ranges array to cover all non-ASCII characters */
|
||||
ranges = scheme_make_pair(scheme_make_pair(scheme_make_integer(128),
|
||||
scheme_make_integer(0x10FFFF)),
|
||||
scheme_null);
|
||||
}
|
||||
|
||||
/* Turn the ranges list into an array */
|
||||
range_len = scheme_list_length(ranges);
|
||||
range_array = (unsigned int *)scheme_malloc_atomic(2 * range_len * sizeof(unsigned int));
|
||||
|
@ -4893,16 +4910,61 @@ static int translate(unsigned char *s, int len, char **result, int pcre)
|
|||
}
|
||||
rs.i = k + 1;
|
||||
} else if (s[rs.i] == '\\') {
|
||||
/* Skip over next char, possibly big: */
|
||||
r[j++] = s[rs.i++];
|
||||
if ((rs.i < len)
|
||||
&& (s[rs.i] > 127)) {
|
||||
r[j++] = s[rs.i++];
|
||||
while ((rs.i < len) && ((s[rs.i] & 0xC0) == 0x80)) {
|
||||
r[j++] = s[rs.i++];
|
||||
}
|
||||
} else
|
||||
r[j++] = s[rs.i++];
|
||||
if (pcre
|
||||
&& (rs.i+1 < len)
|
||||
&& ((s[rs.i+1] == 'D')
|
||||
|| (s[rs.i+1] == 'W')
|
||||
|| (s[rs.i+1] == 'S'))) {
|
||||
/* matches non-ASCII characters, so convert */
|
||||
char *simple_on;
|
||||
int non_ascii;
|
||||
int n;
|
||||
|
||||
simple_on = (char *)scheme_malloc_atomic(128);
|
||||
memset(simple_on, 0, 128);
|
||||
|
||||
rs.i++;
|
||||
regcharclass(s[rs.i++], simple_on, &non_ascii);
|
||||
|
||||
r = make_room(r, j, 4, &rs);
|
||||
r[j++] = '(';
|
||||
r[j++] = '?';
|
||||
r[j++] = ':';
|
||||
r[j++] = '[';
|
||||
|
||||
for (n = 0; n < 128; ) {
|
||||
if (simple_on[n]) {
|
||||
int m;
|
||||
for (m = n + 1; (m < 128) && simple_on[m]; m++) {
|
||||
}
|
||||
r = make_room(r, j, 3, &rs);
|
||||
r[j++] = n;
|
||||
r[j++] = '-';
|
||||
r[j++] = m-1;
|
||||
n = m;
|
||||
} else
|
||||
n++;
|
||||
}
|
||||
r = make_room(r, j, 1, &rs);
|
||||
r[j++] = ']';
|
||||
|
||||
if (non_ascii) { /* we expect this to be true! */
|
||||
r = add_range(r, &j, &rs, 128, 0x10FFFF, 0);
|
||||
r = make_room(r, j, 1, &rs);
|
||||
}
|
||||
r[j++] = ')';
|
||||
} else {
|
||||
/* Skip over next char, possibly big: */
|
||||
r[j++] = s[rs.i++];
|
||||
if ((rs.i < len)
|
||||
&& (s[rs.i] > 127)) {
|
||||
r[j++] = s[rs.i++];
|
||||
while ((rs.i < len) && ((s[rs.i] & 0xC0) == 0x80)) {
|
||||
r[j++] = s[rs.i++];
|
||||
}
|
||||
} else
|
||||
r[j++] = s[rs.i++];
|
||||
}
|
||||
} else if ((s[rs.i] == '.')
|
||||
&& (!pcre
|
||||
|| (rs.i < 3)
|
||||
|
|
Loading…
Reference in New Issue
Block a user