URL parsing is closer to the RFC now

svn: r5965
This commit is contained in:
Eli Barzilay 2007-04-17 11:56:20 +00:00
parent cc8e660737
commit e885e79295
2 changed files with 54 additions and 47 deletions

View File

@ -13,8 +13,8 @@
(require (lib "file.ss")
(lib "unit.ss")
(lib "port.ss")
(lib "string.ss")
(lib "list.ss")
(lib "string.ss")
"url-structs.ss"
"uri-codec.ss"
"url-sig.ss"
@ -351,26 +351,37 @@
(if (char=? (string-ref string 0) #\/) "file" "http"))
url)))))
;; string->url : str -> url
;; New implementation, mostly provided by Neil Van Dyke
;; URL parsing regexp
;; this is following the regexp in Appendix B of rfc 3986, except for using
;; `*' instead of `+' for the scheme part (it is checked later anyway, and
;; we don't want to parse it as a path element), and the user@host:port is
;; parsed here.
(define url-rx
(regexp (string-append
"^"
"[ \t\f\r\n]*"
"(?:" ; <A front-opt
"(?:([^:&;?/]*):)?" ; =1 scheme-colon-opt (see below)
"(?:" ; <B slashslash-opt
"//"
"(?:([^:/@;?#]*)@)?" ; =2 user-at-opt
"([^:/@;?#]*)?" ; =3 host-opt
"(?::([0-9]*))?" ; =4 colon-port-opt
")?" ; >B slashslash-opt
")?" ; >A front-opt
"([^?#]*)" ; =5 path
"(?:\\?([^#]*))?" ; =6 question-query-opt
"(?:#(.*))?" ; =7 hash-fragment-opt
"[ \t\f\r\n]*"
"(?:" ; / scheme-colon-opt
"([^:/?#]*)" ; | #1 = scheme-opt
":)?" ; \
"(?://" ; / slash-slash-authority-opt
"(?:" ; | / user-at-opt
"([^/?#@]*)" ; | | #2 = user-opt
"@)?" ; | \
"([^/?#:]*)?" ; | #3 = host-opt
"(?::" ; | / colon-port-opt
"([0-9]*)" ; | | #4 = port-opt
")?" ; | \
")?" ; \
"([^?#]*)" ; #5 = path
"(?:\\?" ; / question-query-opt
"([^#]*)" ; | #6 = query-opt
")?" ; \
"(?:#" ; / hash-fragment-opt
"(.*)" ; | #7 = fragment-opt
")?" ; \
"$")))
;; string->url : str -> url
;; Original version by Neil Van Dyke
(define (string->url str)
(apply
(lambda (scheme user host port path query fragment)
@ -382,21 +393,14 @@
(eq? 'windows url:os-type))
(set! path (string-append host ":" path))
(set! host #f))
(let* ([user (uri-decode/maybe user)]
[port (and port (string->number port))]
[abs? (and (not (= 0 (string-length path)))
(char=? #\/ (string-ref path 0)))]
[path (separate-path-strings
;; If path is "" and the input is an absolute URL
;; with a hostname, then the intended path is "/",
;; but the URL is missing a "/" at the end.
path
#;
(if (and (string=? path "") host) "/" path))]
[query (if query (form-urlencoded->alist query) '())]
(let* ([scheme (and scheme (string-downcase scheme))]
[host (and host (string-downcase host))]
[user (uri-decode/maybe user)]
[port (and port (string->number port))]
[abs? (regexp-match? #rx"^/" path)]
[path (separate-path-strings path)]
[query (if query (form-urlencoded->alist query) '())]
[fragment (uri-decode/maybe fragment)])
(when (string? scheme) (string-lowercase! scheme))
(when (string? host) (string-lowercase! host))
(make-url scheme user host port abs? path query fragment)))
(cdr (or (regexp-match url-rx str)
(url-error "Invalid URL string: ~e" str)))))
@ -405,7 +409,7 @@
;; If #f, and leave unmolested any % that is followed by hex digit
;; if a % is not followed by a hex digit, replace it with %25
;; in an attempt to be "friendly"
(and f (uri-decode (regexp-replace* "%([^0-9a-fA-F])" f "%25\\1"))))
(and f (uri-decode (regexp-replace* #rx"%([^0-9a-fA-F])" f "%25\\1"))))
;; separate-path-strings : string[starting with /] -> (listof path/param)
(define (separate-path-strings str)
@ -545,5 +549,4 @@
(tcp-abandon-port client->server)
server->client))
))

View File

@ -252,12 +252,16 @@
(test-s->u #("a+b-c456.d" #f "www.foo.com" #f #t (#("")) () #f)
"a+b-c456.d://www.foo.com/")
;; a colon can appear in absolute paths
(test-s->u #(#f #f #f #f #t (#("x:y") #("z")) () #f)
"/x:y/z")
;; a colon and other junk (`sub-delims') can appear in usernames
(test #("http" "x:!$&'()*+,;=y" "www.drscheme.org" #f #t (#("a")) () #f)
string->url/vec
"http://x:!$&'()*+,;=y@www.drscheme.org/a")
;; a colon and atsign can appear in absolute paths
(test-s->u #(#f #f #f #f #t (#("x:@y") #("z")) () #f)
"/x:@y/z")
;; and in relative paths as long as it's not in the first element
(test-s->u #(#f #f #f #f #f (#("x") #("y:z")) () #f)
"x/y:z")
(test-s->u #(#f #f #f #f #f (#("x") #("y:@z")) () #f)
"x/y:@z")
;; test bad schemes
(err/rt-test (string->url "://www.foo.com/") url-exception?)