From 408d6bb77381a05e48e98e170d37d98957a3c809 Mon Sep 17 00:00:00 2001 From: Matthew Flatt Date: Mon, 23 Jun 2014 15:27:48 +0100 Subject: [PATCH] Fix UTF-8 symbol repair Commit 6a5a3037b4 was not quite right, because it used slightly the wrong variant among a dozen decoding functions. The test suite caught the problem, but I forgot to run it before pushing. Also, repair the "Inside" documentation on the function that was incorrectly used, and document the new variant. --- .../scribblings/inside/strings.scrbl | 25 ++++++++++++++++--- racket/src/racket/include/mzwin.def | 1 + racket/src/racket/include/mzwin3m.def | 1 + racket/src/racket/include/racket.exp | 1 + racket/src/racket/include/racket3m.exp | 1 + racket/src/racket/src/schemef.h | 3 +++ racket/src/racket/src/schemex.h | 3 +++ racket/src/racket/src/schemex.inc | 1 + racket/src/racket/src/schemexm.h | 1 + racket/src/racket/src/string.c | 8 ++++++ racket/src/racket/src/symbol.c | 6 ++--- 11 files changed, 45 insertions(+), 6 deletions(-) diff --git a/pkgs/racket-pkgs/racket-doc/scribblings/inside/strings.scrbl b/pkgs/racket-pkgs/racket-doc/scribblings/inside/strings.scrbl index a9c0001811..0b92db11e0 100644 --- a/pkgs/racket-pkgs/racket-doc/scribblings/inside/strings.scrbl +++ b/pkgs/racket-pkgs/racket-doc/scribblings/inside/strings.scrbl @@ -57,11 +57,30 @@ If @var{ipos} is non-@cpp{NULL}, it is filled with the first undecoded how many bytes were decoded before decoding stopped. If @var{permissive} is non-zero, it is used as the decoding of bytes - that are not part of a valid UTF-8 encoding. Thus, the function - result can be @cpp{-2} only if @var{permissive} is @cpp{0}. + that are not part of a valid UTF-8 encoding or if the input ends in the + middle of an encoding. Thus, the function + result can be @cpp{-1} or @cpp{-2} only if @var{permissive} is @cpp{0}. 
This function does not allocate or trigger garbage collection.} +@function[(int scheme_utf8_decode_offset_prefix + [const-unsigned-char* s] + [int start] + [int end] + [mzchar* us] + [int dstart] + [int dend] + [intptr_t* ipos] + [char utf16] + [int permissive])]{ + +Like @cpp{scheme_utf8_decode}, but returns @cpp{-1} if the input ends +in the middle of a UTF-8 encoding even if @var{permissive} is +non-zero. + +@history[#:added "6.0.1.13"]} + + @function[(int scheme_utf8_decode_as_prefix [const-unsigned-char* s] [int start] @@ -99,7 +118,7 @@ Like @cpp{scheme_utf8_decode}, but with fewer arguments. The [int permissive])]{ Like @cpp{scheme_utf8_decode}, but with fewer arguments. The - decoding produces UCS-4 @cpp{mzchar}s. If the buffer @var{us} + decoding produces UCS-4 @cpp{mzchar}s. The buffer @var{us} @bold{must} be non-@cpp{NULL}, and it is assumed to be long enough to hold the decoding (which cannot be longer than the length of the input, though it may be shorter). If @var{len} is negative, @cpp{strlen(@var{s})} diff --git a/racket/src/racket/include/mzwin.def b/racket/src/racket/include/mzwin.def index 7108c46b40..c6aaf892dc 100644 --- a/racket/src/racket/include/mzwin.def +++ b/racket/src/racket/include/mzwin.def @@ -338,6 +338,7 @@ EXPORTS scheme_is_cpointer scheme_get_proc_name scheme_utf8_decode + scheme_utf8_decode_offset_prefix scheme_utf8_decode_as_prefix scheme_utf8_decode_all scheme_utf8_decode_prefix diff --git a/racket/src/racket/include/mzwin3m.def b/racket/src/racket/include/mzwin3m.def index b502bb55c5..2fe6d62e67 100644 --- a/racket/src/racket/include/mzwin3m.def +++ b/racket/src/racket/include/mzwin3m.def @@ -353,6 +353,7 @@ EXPORTS scheme_is_cpointer scheme_get_proc_name scheme_utf8_decode + scheme_utf8_decode_offset_prefix scheme_utf8_decode_as_prefix scheme_utf8_decode_all scheme_utf8_decode_prefix diff --git a/racket/src/racket/include/racket.exp b/racket/src/racket/include/racket.exp index 0930dccaf7..737fc804b0 100644 --- 
a/racket/src/racket/include/racket.exp +++ b/racket/src/racket/include/racket.exp @@ -355,6 +355,7 @@ scheme_make_offset_external_cptr scheme_is_cpointer scheme_get_proc_name scheme_utf8_decode +scheme_utf8_decode_offset_prefix scheme_utf8_decode_as_prefix scheme_utf8_decode_all scheme_utf8_decode_prefix diff --git a/racket/src/racket/include/racket3m.exp b/racket/src/racket/include/racket3m.exp index 8121e41bd4..1905f9ad9f 100644 --- a/racket/src/racket/include/racket3m.exp +++ b/racket/src/racket/include/racket3m.exp @@ -361,6 +361,7 @@ scheme_make_offset_external_cptr scheme_is_cpointer scheme_get_proc_name scheme_utf8_decode +scheme_utf8_decode_offset_prefix scheme_utf8_decode_as_prefix scheme_utf8_decode_all scheme_utf8_decode_prefix diff --git a/racket/src/racket/src/schemef.h b/racket/src/racket/src/schemef.h index a863699b37..81a9922308 100644 --- a/racket/src/racket/src/schemef.h +++ b/racket/src/racket/src/schemef.h @@ -672,6 +672,9 @@ MZ_EXTERN const char *scheme_get_proc_name(Scheme_Object *p, int *len, int for_e MZ_EXTERN intptr_t scheme_utf8_decode(const unsigned char *s, intptr_t start, intptr_t end, unsigned int *us, intptr_t dstart, intptr_t dend, intptr_t *ipos, char utf16, int permissive); +MZ_EXTERN intptr_t scheme_utf8_decode_offset_prefix(const unsigned char *s, intptr_t start, intptr_t end, + unsigned int *us, intptr_t dstart, intptr_t dend, + intptr_t *ipos, char utf16, int permissive); MZ_EXTERN intptr_t scheme_utf8_decode_as_prefix(const unsigned char *s, intptr_t start, intptr_t end, unsigned int *us, intptr_t dstart, intptr_t dend, intptr_t *ipos, char utf16, int permissive); diff --git a/racket/src/racket/src/schemex.h b/racket/src/racket/src/schemex.h index 166741631e..f636aa6f71 100644 --- a/racket/src/racket/src/schemex.h +++ b/racket/src/racket/src/schemex.h @@ -541,6 +541,9 @@ const char *(*scheme_get_proc_name)(Scheme_Object *p, int *len, int for_error); intptr_t (*scheme_utf8_decode)(const unsigned char *s, intptr_t start, 
intptr_t end, unsigned int *us, intptr_t dstart, intptr_t dend, intptr_t *ipos, char utf16, int permissive); +intptr_t (*scheme_utf8_decode_offset_prefix)(const unsigned char *s, intptr_t start, intptr_t end, + unsigned int *us, intptr_t dstart, intptr_t dend, + intptr_t *ipos, char utf16, int permissive); intptr_t (*scheme_utf8_decode_as_prefix)(const unsigned char *s, intptr_t start, intptr_t end, unsigned int *us, intptr_t dstart, intptr_t dend, intptr_t *ipos, char utf16, int permissive); diff --git a/racket/src/racket/src/schemex.inc b/racket/src/racket/src/schemex.inc index a90dfd9929..9a40c6ce4b 100644 --- a/racket/src/racket/src/schemex.inc +++ b/racket/src/racket/src/schemex.inc @@ -398,6 +398,7 @@ scheme_extension_table->scheme_is_cpointer = scheme_is_cpointer; scheme_extension_table->scheme_get_proc_name = scheme_get_proc_name; scheme_extension_table->scheme_utf8_decode = scheme_utf8_decode; + scheme_extension_table->scheme_utf8_decode_offset_prefix = scheme_utf8_decode_offset_prefix; scheme_extension_table->scheme_utf8_decode_as_prefix = scheme_utf8_decode_as_prefix; scheme_extension_table->scheme_utf8_decode_all = scheme_utf8_decode_all; scheme_extension_table->scheme_utf8_decode_prefix = scheme_utf8_decode_prefix; diff --git a/racket/src/racket/src/schemexm.h b/racket/src/racket/src/schemexm.h index 80ad02891f..32425656a7 100644 --- a/racket/src/racket/src/schemexm.h +++ b/racket/src/racket/src/schemexm.h @@ -398,6 +398,7 @@ #define scheme_is_cpointer (scheme_extension_table->scheme_is_cpointer) #define scheme_get_proc_name (scheme_extension_table->scheme_get_proc_name) #define scheme_utf8_decode (scheme_extension_table->scheme_utf8_decode) +#define scheme_utf8_decode_offset_prefix (scheme_extension_table->scheme_utf8_decode_offset_prefix) #define scheme_utf8_decode_as_prefix (scheme_extension_table->scheme_utf8_decode_as_prefix) #define scheme_utf8_decode_all (scheme_extension_table->scheme_utf8_decode_all) #define scheme_utf8_decode_prefix 
(scheme_extension_table->scheme_utf8_decode_prefix) diff --git a/racket/src/racket/src/string.c b/racket/src/racket/src/string.c index ddaeb18cc4..e770f84bbb 100644 --- a/racket/src/racket/src/string.c +++ b/racket/src/racket/src/string.c @@ -5476,6 +5476,14 @@ intptr_t scheme_utf8_decode(const unsigned char *s, intptr_t start, intptr_t end ipos, NULL, utf16, utf16, NULL, 0, permissive); } +intptr_t scheme_utf8_decode_offset_prefix(const unsigned char *s, intptr_t start, intptr_t end, + unsigned int *us, intptr_t dstart, intptr_t dend, + intptr_t *ipos, char utf16, int permissive) +{ + return utf8_decode_x(s, start, end, us, dstart, dend, + ipos, NULL, utf16, utf16, NULL, 1, permissive); +} + intptr_t scheme_utf8_decode_as_prefix(const unsigned char *s, intptr_t start, intptr_t end, unsigned int *us, intptr_t dstart, intptr_t dend, intptr_t *ipos, char utf16, int permissive) diff --git a/racket/src/racket/src/symbol.c b/racket/src/racket/src/symbol.c index 41b031e670..f2de0e28de 100644 --- a/racket/src/racket/src/symbol.c +++ b/racket/src/racket/src/symbol.c @@ -613,9 +613,9 @@ const char *scheme_symbol_name_and_size(Scheme_Object *sym, uintptr_t *length, i mzchar buf[2]; int ul = 1; while (1) { - if (scheme_utf8_decode((unsigned char *)s, i, i + ul, - buf, 0, 1, - NULL, 0, '?') > 0) + if (scheme_utf8_decode_offset_prefix((unsigned char *)s, i, i + ul, + buf, 0, 1, + NULL, 0, '?') > 0) break; ul++; }