From d8793e5b8b096c0019b97fb07d0df7240698eeb5 Mon Sep 17 00:00:00 2001 From: Matthew Flatt Date: Fri, 10 Oct 2014 11:30:28 -0600 Subject: [PATCH] Always convert string<->paths with UTF-8 on Windows Also, document representation information on paths. In particular, explain that Unix and Mac OS X paths are natively byte strings, while Windows paths are natively UTF-16 code-unit sequences. The byte-string representation of a Windows path is a UTF-8-like encoding of the UTF-16 code-unit sequence, which is why it makes no sense to convert it using the current locale's encoding. --- .../scribblings/reference/bytes.scrbl | 6 +- .../scribblings/reference/paths.scrbl | 76 ++++++++++++------- .../scribblings/reference/unix-paths.scrbl | 12 ++- .../scribblings/reference/windows-paths.scrbl | 27 ++++++- racket/src/racket/src/file.c | 8 ++ 5 files changed, 95 insertions(+), 34 deletions(-) diff --git a/pkgs/racket-pkgs/racket-doc/scribblings/reference/bytes.scrbl b/pkgs/racket-pkgs/racket-doc/scribblings/reference/bytes.scrbl index acd03b1417..4b2f62222f 100644 --- a/pkgs/racket-pkgs/racket-doc/scribblings/reference/bytes.scrbl +++ b/pkgs/racket-pkgs/racket-doc/scribblings/reference/bytes.scrbl @@ -463,10 +463,10 @@ Certain encoding combinations are always available: include UTF-16 code units that are unpaired surrogates, and the corresponding output includes an encoding of each surrogate in a natural extension of UTF-8. On @|AllUnix|, surrogates are - assumed to be paired: a pair of bytes with the bits @racket[#xD800] - starts a surrogate pair, and the @racket[#x03FF] bits are used from + assumed to be paired: a pair of bytes with the bits @code{#xD800} + starts a surrogate pair, and the @code{#x03FF} bits are used from the pair and following pair (independent of the value of the - @racket[#xDC00] bits). On all platforms, performance may be poor + @code{#xDC00} bits). 
On all platforms, performance may be poor when decoding from an odd offset within an input byte string.} ] diff --git a/pkgs/racket-pkgs/racket-doc/scribblings/reference/paths.scrbl b/pkgs/racket-pkgs/racket-doc/scribblings/reference/paths.scrbl index 2979f78979..0a3d383e59 100644 --- a/pkgs/racket-pkgs/racket-doc/scribblings/reference/paths.scrbl +++ b/pkgs/racket-pkgs/racket-doc/scribblings/reference/paths.scrbl @@ -6,7 +6,10 @@ When a Racket procedure takes a filesystem path as an argument, the path can be provided either as a string or as an instance of the @deftech{path} datatype. If a string is provided, it is converted to a -path using @racket[string->path]. A Racket procedure that generates a +path using @racket[string->path]. Beware that some paths may not +be representable as strings; see @secref["unixpathrep"] and +@secref["windowspathrep"] for more information. +A Racket procedure that generates a filesystem path always generates a @tech{path} value. By default, paths are created and manipulated for the current @@ -33,8 +36,8 @@ path before using it. Procedures that build paths or merely check the form of a path do not cleanse paths, with the exceptions of @racket[cleanse-path], @racket[expand-user-path], and @racket[simplify-path]. For more information about path cleansing and -other platform-specific details, see @secref["unixpaths"] for -@|AllUnix| paths and @secref["windowspaths"] for Windows paths. +other platform-specific details, see @secref["unixpaths"] and +@secref["windowspaths"]. @;------------------------------------------------------------------------ @section{Manipulating Paths} @@ -56,35 +59,49 @@ current platform or a non-empty string without nul characters, Returns @racket[#t] if @racket[v] is a path value for some platform (not a string), @racket[#f] otherwise.} + @defproc[(string->path [str string?]) path?]{ -Produces a path whose byte-string name is -@racket[(string->bytes/locale string (char->integer #\?))]. 
+Produces a path whose byte-string encoding is +@racket[(string->bytes/locale str (char->integer #\?))] on @|AllUnix| +or @racket[(string->bytes/utf-8 str)] on Windows. Beware that the current locale might not encode every string, in which case @racket[string->path] can produce the same path for different @racket[str]s. See also @racket[string->path-element], which should be used instead of @racket[string->path] when a string represents a -single @tech{path element}. +single @tech{path element}. For information on how strings and byte +strings encode paths, see @secref["unixpathrep"] and +@secref["windowspathrep"]. + +See also @racket[string->some-system-path]. + +@history[#:changed "6.1.1.1" @elem{Changed Windows conversion to always use UTF-8.}]} -See also @racket[string->some-system-path].} @defproc[(bytes->path [bstr bytes?] [type (or/c 'unix 'windows) (system-path-convention-type)]) path?]{ -Produces a path (for some platform) whose byte-string name is +Produces a path (for some platform) whose byte-string encoding is @racket[bstr]. The optional @racket[type] specifies the convention to use for the path. For converting relative @tech{path elements} from literals, use instead @racket[bytes->path-element], which applies a suitable encoding for -individual elements.} +individual elements. + +For information on how byte strings encode paths, see +@secref["unixpathrep"] and @secref["windowspathrep"].} + @defproc[(path->string [path path?]) string?]{ Produces a string that represents @racket[path] by decoding -@racket[path]'s byte-string name using the current locale's encoding; +@racket[path]'s byte-string encoding using the current locale +on @|AllUnix| and by using UTF-8 on Windows. In the former case, @litchar{?} is used in the result string where encoding fails, and if the encoding result is the empty string, then the result is @racket["?"].
@@ -101,11 +118,14 @@ instead, to avoid special encodings use to represent some relative paths. See @secref["windowspaths"] for specific information about the conversion of Windows paths. -See also @racket[some-system-path->string].} +See also @racket[some-system-path->string]. + +@history[#:changed "6.1.1.1" @elem{Changed Windows conversion to always use UTF-8.}]} + @defproc[(path->bytes [path path-for-some-system?]) bytes?]{ -Produces @racket[path]'s byte string representation. No information is +Produces @racket[path]'s byte-string representation. No information is lost in this translation, so that @racket[(bytes->path (path->bytes path) (path-convention-type path))] always produces a path that is @racket[equal?] to @racket[path]. The @racket[path] argument can be a @@ -116,23 +136,26 @@ unmarshaling paths, but manipulating the byte form of a path is generally a mistake. In particular, the byte string may start with a @litchar{\\?\REL} encoding for Windows paths. Instead of @racket[path->bytes], use @racket[split-path] and -@racket[path-element->bytes] to manipulate individual @tech{path elements}.} +@racket[path-element->bytes] to manipulate individual @tech{path elements}. + +For information on how byte strings encode paths, see +@secref["unixpathrep"] and @secref["windowspathrep"].} + @defproc[(string->path-element [str string?]) path?]{ Like @racket[string->path], except that @racket[str] corresponds to a single relative element in a path, and it is encoded as necessary to -convert it to a path. See @secref["unixpaths"] for more information -on the conversion for @|AllUnix| paths, and see -@secref["windowspaths"] for more information on the conversion for -Windows paths. +convert it to a path. See @secref["unixpaths"] and +@secref["windowspaths"] for more information on the conversion of +paths. 
If @racket[str] does not correspond to any @tech{path element} (e.g., it is an absolute path, or it can be split), or if it corresponds to an up-directory or same-directory indicator on @|AllUnix|, then @exnraise[exn:fail:contract]. -As for @racket[path->string], information can be lost from +As with @racket[path->string], information can be lost from @racket[str] in the locale-specific conversion to a path.} @@ -157,7 +180,7 @@ other path is deconstructed with @racket[split-path] and Like @racket[path->string], except that trailing path separators are removed (as by @racket[split-path]). On Windows, any @litchar{\\?\REL} encoding prefix is also removed; see -@secref["windowspaths"] for more information on Windows paths. +@secref["windowspaths"] for more information. The @racket[path] argument must be such that @racket[split-path] applied to @racket[path] would return @racket['relative] as its first @@ -245,9 +268,8 @@ is empty or contains a nul character), the The @racket[build-path] procedure builds a path @italic{without} checking the validity of the path or accessing the filesystem. -See @secref["unixpaths"] for more information on the construction -of @|AllUnix| paths, and see @secref["windowspaths"] for more -information on the construction of Windows paths. +See @secref["unixpaths"] and @secref["windowspaths"] for more +information on the construction of paths. The following examples assume that the current directory is @filepath{/home/joeuser} for Unix examples and @filepath{C:\Joe's Files} for @@ -420,9 +442,8 @@ true, but the source or simplified path might be a non-existent path. If still involve a cycle of links if the cycle did not inhibit the simplification).
-See @secref["unixpaths"] for more information on simplifying -@|AllUnix| paths, and see @secref["windowspaths"] for more -information on simplifying Windows paths.} +See @secref["unixpaths"] and @secref["windowspaths"] for more +information on simplifying paths.} @defproc[(normal-case-path [path (or/c path-string? path-for-some-system?)]) @@ -489,9 +510,8 @@ platform, and resulting paths for the same platform. This procedure does not access the filesystem. -See @secref["unixpaths"] for more information on splitting -@|AllUnix| paths, and see @secref["windowspaths"] for more -information on splitting Windows paths.} +See @secref["unixpaths"] and @secref["windowspaths"] for more +information on splitting paths.} @defproc[(explode-path [path (or/c path-string? path-for-some-system?)]) diff --git a/pkgs/racket-pkgs/racket-doc/scribblings/reference/unix-paths.scrbl b/pkgs/racket-pkgs/racket-doc/scribblings/reference/unix-paths.scrbl index ef426811a4..e24925fd95 100644 --- a/pkgs/racket-pkgs/racket-doc/scribblings/reference/unix-paths.scrbl +++ b/pkgs/racket-pkgs/racket-doc/scribblings/reference/unix-paths.scrbl @@ -3,7 +3,7 @@ @title[#:tag "unixpaths"]{@|AllUnix| Paths} -In @|AllUnix| paths, a @litchar{/} separates elements of the path, +In a path on @|AllUnix|, a @litchar{/} separates elements of the path, @litchar{.} as a path element always means the directory indicated by preceding path, and @litchar{..} as a path element always means the parent of the directory indicated by the preceding path. A leading @@ -35,3 +35,13 @@ _path)]. Since that is not the case for other platforms, however, be used when converting individual path elements. On Mac OS X, Finder aliases are zero-length files. + + +@section[#:tag "unixpathrep"]{Unix Path Representation} + +A path on @|AllUnix| is natively a byte string. 
For presentation to +users and for other string-based operations, a path is converted +to/from a string using the current locale's encoding with @litchar{?} +(encoding) or @code{#\uFFFD} (decoding) in place of errors. Beware +that the encoding may not accommodate all possible paths as +distinct strings. diff --git a/pkgs/racket-pkgs/racket-doc/scribblings/reference/windows-paths.scrbl b/pkgs/racket-pkgs/racket-doc/scribblings/reference/windows-paths.scrbl index c93af4b0b1..119a80d6ab 100644 --- a/pkgs/racket-pkgs/racket-doc/scribblings/reference/windows-paths.scrbl +++ b/pkgs/racket-pkgs/racket-doc/scribblings/reference/windows-paths.scrbl @@ -3,7 +3,7 @@ @(define MzAdd (italic "Racket-specific:")) -@title[#:tag "windowspaths"]{Windows Path Conventions} +@title[#:tag "windowspaths"]{Windows Paths} In general, a Windows pathname consists of an optional drive specifier and a drive-specific path. A Windows path can be @defterm{absolute} @@ -101,7 +101,8 @@ include @litchar{\}. @litchar{\\}@nonterm{machine}@litchar{\}@nonterm{volume} counts as the drive specifier.} - @item{Normally, a path element cannot contain any of the following + @item{Normally, a path element cannot contain a character in the + range @racket[#\x00] to @racket[#\x1F] nor any of the following characters: @centerline{@litchar{<} @litchar{>} @litchar{:} @litchar{"} @litchar{/} @litchar{\} @litchar{|}} @@ -314,3 +315,25 @@ produces @litchar{\\?\C:\x~\} and @litchar{\\?\REL\\aux}; the @litchar{\\?\} is needed in these cases to preserve a trailing space after @litchar{x} and to avoid referring to the AUX device instead of an @filepath{aux} file. + +@section[#:tag "windowspathrep"]{Windows Path Representation} + +A path on Windows is natively a sequence of UTF-16 code units, where +the sequence can include unpaired surrogates. 
This sequence is encoded +as a byte string through an extension of UTF-8, where unpaired +surrogates in the UTF-16 code-unit sequence are converted as if they +were non-surrogate values. The extended encodings are implemented on +Windows as the @racket["platform-UTF-16"] and +@racket["platform-UTF-8"] encodings for @racket[bytes-open-converter]. + +Racket's internal representation of a Windows path is a byte string, +so that @racket[path->bytes] and @racket[bytes->path] are always +inverses. When converting a path to a native UTF-16 code-unit +sequence, @racket[#\tab] is used in place of platform-UTF-8 decoding +errors (on the grounds that tab is normally disallowed as a character +in a Windows path, unlike @code{#\uFFFD}). + +A Windows path is converted to a string by treating the platform-UTF-8 +encoding as a UTF-8 encoding with @code{#\uFFFD} in place of +decoding errors. Similarly, a string is converted to a path by UTF-8 +encoding (in which case no errors are possible). diff --git a/racket/src/racket/src/file.c b/racket/src/racket/src/file.c index 7323bd575e..cb63119e7c 100644 --- a/racket/src/racket/src/file.c +++ b/racket/src/racket/src/file.c @@ -813,7 +813,11 @@ static Scheme_Object *append_path(Scheme_Object *a, Scheme_Object *b) Scheme_Object *scheme_char_string_to_path(Scheme_Object *p) { +#ifdef DOS_FILE_SYSTEM + p = scheme_char_string_to_byte_string(p); +#else p = scheme_char_string_to_byte_string_locale(p); +#endif p->type = SCHEME_PLATFORM_PATH_KIND; return p; } @@ -889,7 +893,11 @@ Scheme_Object *scheme_path_to_char_string(Scheme_Object *p) { Scheme_Object *s; +#ifdef DOS_FILE_SYSTEM + s = scheme_byte_string_to_char_string(p); +#else s = scheme_byte_string_to_char_string_locale(p); +#endif if (!SCHEME_CHAR_STRLEN_VAL(s)) return scheme_make_utf8_string("?");