%-encoding extension in module paths; new r6rs library name encoding

svn: r10697
This commit is contained in:
Matthew Flatt 2008-07-09 15:41:38 +00:00
parent 2f22ed7c41
commit a4cc0f7890
7 changed files with 172 additions and 36 deletions

View File

@ -0,0 +1,24 @@
#lang scheme/base
(provide encode-name)
(define (encode-name s)
(let ([s (if (symbol? s)
(symbol->string s)
s)])
(cond
[(regexp-match #rx"(.*?)([^a-zA-Z0-9_+-]+)(.*)" s)
=> (lambda (m)
(string-append
(cadr m)
(apply
string-append
(map (lambda (c)
(let ([s (format "0~x" c)])
(string-append
"%"
(substring s (- (string-length s) 2)))))
(bytes->list (string->bytes/utf-8 (caddr m)))))
(encode-name (cadddr m))))]
[else s])))

View File

@ -1,6 +1,7 @@
#lang scheme/base
(require "find-version.ss"
"encode-name.ss"
(for-template scheme/base))
(provide parse-import)
@ -37,10 +38,19 @@
(andmap identifier? (syntax->list #'(id2 ...)))
(is-version-reference? #'(vers ...)))
(let-values ([(coll file)
(let ([strs (map (lambda (id)
(let* ([strs (map (lambda (id)
(symbol->string (syntax-e id)))
(syntax->list #'(id1 id2 ...)))])
(if (= 1 (length strs))
(syntax->list #'(id1 id2 ...)))]
[len (length strs)]
[strs (map
encode-name
(if (and (= 2 len) (regexp-match? #rx"^main_*$" (cadr strs)))
;; rename (X main_*) => (X main__*)
(list (car strs)
(string-append (cadr strs) "_"))
;; no rename
strs))])
(if (= 1 len)
(values (list (car strs)) "main")
(values (reverse (cdr (reverse strs)))
(car (reverse strs)))))])

View File

@ -6,7 +6,8 @@
setup/dirs
scheme/port
scheme/file
"private/readtable.ss")
"private/readtable.ss"
"private/encode-name.ss")
(define install-mode (make-parameter #f))
(define compile-mode (make-parameter #f))
@ -123,11 +124,23 @@
(loop (cdr name)))]))))
(define (name->path name)
(let* ([name (if (or (= (length name) 1)
(and (= (length name) 2)
(let* ([name (let ([len (length name)])
(cond
[(or (= len 1)
(and (= len 2)
(not (symbol? (cadr name)))))
(list* (car name) 'main (cdr name))
name)])
;; Add implicit "main":
(list* (car name) 'main (cdr name))]
[(and (or (= len 2)
(and (= len 3)
(not (symbol? (caddr name)))))
(regexp-match #rx"^main_*$" (symbol->string (cadr name))))
;; Rename (X main_*) => (X main__*)
(list* (car name)
(string->symbol
(string-append (symbol->string (cadr name)) "_"))
(cddr name))]
[else name]))])
(apply build-path
(if (install-all-users)
(find-collects-dir)
@ -140,7 +153,7 @@
;; versioned:
(list
(format "~a~a.ss"
(car name)
(encode-name (car name))
(apply
string-append
(map (lambda (v)
@ -148,9 +161,9 @@
(cadr name)))))]
[(null? (cdr name))
;; unversioned:
(list (format "~a.ss" (car name)))]
(list (format "~a.ss" (encode-name (car name))))]
[else
(cons (symbol->string (car name))
(cons (encode-name (car name))
(loop (cdr name)))])))))
;; ----------------------------------------

View File

@ -175,8 +175,9 @@ a name is converted to a PLT Scheme module pathname (see @secref[#:doc
guide-src "module-paths"]) by concatenating the symbols with a
@litchar{/} separator, and then appending the version integers each
with a preceeding @litchar{-}. As a special case, when an @|r6rs| path
contains a single symbol followed by a version, a @schemeidfont{main}
symbol is effectively inserted after the initial symbol.
contains a single symbol (optionally followed by a version), a
@schemeidfont{main} symbol is effectively inserted after the initial
symbol. See below for further encoding considerations.
When an @|r6rs| library or top-level program refers to another
library, it can supply version constraints rather than naming a
@ -190,13 +191,41 @@ search order for file extensions is @filepath{.mzscheme.ss},
resolving version constraints, these extensions are all tried when
looking for matches.
To ensure that all @|r6rs| library names can be converted to a unique
and distinct library module path, the following conversions are
applied to each symbol before concatenating them:
@itemize{
@item{The symbol is encoded using UTF-8, and the resulting bytes are
treated as Latin-1 encoded characters. ASCII letters, digits,
@litchar{+}, @litchar{-}, and @litchar{_} are left as-is; other
characters are replaced by @litchar{%} followed by two lowercase
hexadecimal digits. Note that UTF-8 encodes ASCII letters, digits,
@|etc| as themselves, so typical library names correspond to readable
module paths.}
@item{If the @|r6rs| library reference has two symbol elements and
the second one is @schemeidfont{main} followed by any number of
underscores, then an extra underscore is added to that symbol. This
conversion avoids a collision between an explicit @schemeidfont{main}
and the implicit @schemeidfont{main} when a library path has a single
symbol element.}
}
Examples (assuming a typical PLT Scheme installation):
@schemeblock[
(rnrs io simple (6)) #, @elem{means} (lib "rnrs/io/simple-6.ss")
(rnrs) #, @elem{means} (lib "rnrs/main-6.ss")
(rnrs main) #, @elem{means} (lib "rnrs/main_.ss")
(rnrs (6)) #, @elem{means} (lib "rnrs/main-6.ss")
(scheme base) #, @elem{means} (lib "scheme/base.ss")
(achtung!) #, @elem{means} (lib "achtung%21/main.ss")
(funco new-λ) #, @elem{means} (lib "funco/new-%ce%bb.ss")
]

View File

@ -325,10 +325,20 @@ corresponds to the default @tech{module name resolver}.
@litchar{/} is the path delimiter (multiple adjacent @litchar{/}s are
treated as a single delimiter), @litchar{..} accesses the parent
directory, and @litchar{.} accesses the current directory. The path
cannot be empty or contain a leading or trailing slash, path
elements before than the last one cannot include a file suffix,
cannot be empty or contain a leading or trailing slash, path elements
before than the last one cannot include a file suffix (i.e., a
@litchar{.} in an element other than @litchar{.} or @litchar{..}),
and the only allowed characters are ASCII letters, ASCII digits,
@litchar{-}, @litchar{+}, @litchar{_}, @litchar{.}, and @litchar{/}.}
@litchar{-}, @litchar{+}, @litchar{_}, @litchar{.}, @litchar{/}, and
@litchar{%}. Furthermore, a @litchar{%} is allowed only when followed
by two lowercase hexadecimal digits, and the digits must form a
number that is not the ASCII value of a letter, digit, @litchar{-},
@litchar{+}, or @litchar{_}.
@margin-note{The @litchar{%} provision is intended to support a
one-to-one encoding of arbitrary strings as path elements (after
UTF-8 encoding). Such encodings are not decoded to arrive at a
filename, but instead preserved in the file access.}}
@defsubform[(lib rel-string ...+)]{A path to a module installed into
a @tech{collection} (see @secref["collects"]). The @scheme[rel-string]s in
@ -406,17 +416,19 @@ corresponds to the default @tech{module name resolver}.
]
and where an @nonterm{elem} is a non-empty sequence of characters
that are ASCII letters, ASCII digits, @litchar{-}, @litchar{+}, or
@litchar{_}, and an @nonterm{int} is a non-empty sequence of ASCII
digits. As this shorthand is expended, a @filepath{.plt} extension is
added to @nonterm{pkg}, and a @filepath{.ss} extension is added to
that are ASCII letters, ASCII digits, @litchar{-}, @litchar{+},
@litchar{_}, or @litchar{%} followed by lowercase hexadecimal digits
(that do not encode one of the other allowed characters), and an
@nonterm{int} is a non-empty sequence of ASCII digits. As this
shorthand is expended, a @filepath{.plt} extension is added to
@nonterm{pkg}, and a @filepath{.ss} extension is added to
@scheme{path}; if no @nonterm{path} is included, @filepath{main.ss}
is used in the expansion.
A @scheme[(planet string)] form is like a @scheme[(planet id)] form
with the identifier converted to a string, except that the
@scheme[string] can optionally end with a file extension for a
@nonterm{path}.
@scheme[string] can optionally end with a file extension (i.e., a
@litchar{.}) for a @nonterm{path}.
In the more general last form of a @scheme[planet] module path, the
@scheme[rel-string]s are similar to the @scheme[lib] form, except

View File

@ -307,14 +307,24 @@
(test #t module-path? "hello")
(test #t module-path? "hello.ss")
(test #f module-path? "hello*ss")
(test #t module-path? "hello%2ess")
(test #t module-path? "hello%00ss")
(test #f module-path? "hello%2Ess")
(test #f module-path? "hello%41ss")
(test #f module-path? "hello%4")
(test #f module-path? "hello%")
(test #f module-path? "hello%q0")
(test #f module-path? "hello%0q")
(test #f module-path? "foo.ss/hello")
(test #f module-path? "foo/")
(test #f module-path? "a/foo/")
(test #f module-path? "/foo.ss")
(test #f module-path? "/a/foo.ss")
(test #f module-path? "a/foo.ss/b")
(test #t module-path? "a/foo%2ess/b")
(test #t module-path? "a/_/b")
(test #t module-path? "a/0123456789+-_/b.---")
(test #t module-path? "a/0123456789+-_/b.-%2e")
(test #t module-path? "../foo.ss")
(test #t module-path? "x/../foo.ss")
(test #t module-path? "x/./foo.ss")
@ -324,6 +334,8 @@
(test #t module-path? 'hello)
(test #f module-path? 'hello/)
(test #f module-path? 'hello.ss)
(test #t module-path? 'hello%2ess)
(test #f module-path? 'hello%2Ess)
(test #f module-path? 'hello/a.ss)
(test #f module-path? '/hello/a.ss)
(test #f module-path? '/hello)
@ -356,6 +368,9 @@
(test #f module-path? '(planet))
(test #f module-path? '(planet robby))
(test #t module-path? '(planet robby/redex))
(test #t module-path? '(planet robby%2e/%2eredex))
(test #f module-path? '(planet robby%2/redex))
(test #f module-path? '(planet robby/redex%2))
(test #f module-path? '(planet robby/redex/))
(test #f module-path? '(planet robby/redex/foo/))
(test #f module-path? '(planet /robby/redex/foo))
@ -373,6 +388,7 @@
(test #t module-path? '(planet robby/redex:7:8-9/foo))
(test #t module-path? '(planet robby/redex:7:8-9))
(test #t module-path? '(planet robby/redex:700:800-00900/foo))
(test #t module-path? '(planet robby/redex:700:800-00900/foo%2e))
(test #f module-path? '(planet robby/redex:=7/foo))
(test #f module-path? '(planet robby/redex::8/foo))
(test #f module-path? '(planet robby/redex:7:/foo))
@ -389,6 +405,7 @@
(test #t module-path? '(planet "foo.ss" ("robby" "redex.plt" 7 8)))
(test #t module-path? '(planet "foo.ss" ("robby" "redex.plt" 7 (= 8))))
(test #t module-path? '(planet "foo.ss" ("robby" "redex.plt") "sub" "deeper"))
(test #t module-path? '(planet "foo%2e.ss" ("robby%2e" "redex%2e.plt") "sub%2e" "%2edeeper"))
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

View File

@ -1818,6 +1818,39 @@ static Scheme_Object *namespace_unprotect_module(int argc, Scheme_Object *argv[]
return scheme_void;
}
static int plain_char(int c)
{
return (((c >= 'a') && (c <= 'z'))
|| ((c >= 'A') && (c <= 'Z'))
|| ((c >= '0') && (c <= '9'))
|| (c == '-')
|| (c == '_')
|| (c == '+'));
}
static int ok_hex(int c)
{
return (((c >= 'a') && (c <= 'f'))
|| ((c >= '0') && (c <= '9')));
}
static int ok_escape(int c1, int c2)
{
c1 = (((c1 >= 'a') && (c1 <= 'f'))
? (c1 - 'a' + 10)
: (c1 - '0'));
c2 = (((c2 >= 'a') && (c2 <= 'f'))
? (c2 - 'a' + 10)
: (c2 - '0'));
c1 = (c1 << 4) + c2;
if (plain_char(c1))
return 0;
else
return 1;
}
static int ok_path_string(Scheme_Object *obj, int dir_ok, int just_file_ok, int file_end_ok, int for_planet)
{
mzchar *s = SCHEME_CHAR_STR_VAL(obj);
@ -1929,12 +1962,11 @@ static int ok_path_string(Scheme_Object *obj, int dir_ok, int just_file_ok, int
}
prev_was_slash = 0;
} else {
if (((c >= 'a') && (c <= 'z'))
|| ((c >= 'A') && (c <= 'Z'))
|| ((c >= '0') && (c <= '9'))
|| (c == '-')
|| (c == '_')
|| (c == '+')) {
if (plain_char(c)
|| ((c == '%')
&& ok_hex(s[i+1])
&& ok_hex(s[i+2])
&& ok_escape(s[i+1], s[i+2]))) {
prev_was_slash = 0;
} else if ((i < start_package_pos) || (i >= end_package_pos))
return 0;
@ -1998,13 +2030,12 @@ static int ok_planet_string(Scheme_Object *obj)
while (i--) {
c = s[i];
if (((c >= 'a') && (c <= 'z'))
|| ((c >= 'A') && (c <= 'Z'))
|| ((c >= '0') && (c <= '9'))
|| (c == '.')
|| (c == '-')
|| (c == '_')
|| (c == '+')) {
if ((c == '%')
&& ok_hex(s[i+1])
&& ok_hex(s[i+2])
&& ok_escape(s[i+1], s[i+2])) {
/* ok */
} else if (plain_char(c) || (c == '.')) {
/* ok */
} else
return 0;