URL parsing is closer to the RFC now
svn: r5965
This commit is contained in:
parent
cc8e660737
commit
e885e79295
|
@ -13,8 +13,8 @@
|
||||||
(require (lib "file.ss")
|
(require (lib "file.ss")
|
||||||
(lib "unit.ss")
|
(lib "unit.ss")
|
||||||
(lib "port.ss")
|
(lib "port.ss")
|
||||||
(lib "string.ss")
|
|
||||||
(lib "list.ss")
|
(lib "list.ss")
|
||||||
|
(lib "string.ss")
|
||||||
"url-structs.ss"
|
"url-structs.ss"
|
||||||
"uri-codec.ss"
|
"uri-codec.ss"
|
||||||
"url-sig.ss"
|
"url-sig.ss"
|
||||||
|
@ -351,26 +351,37 @@
|
||||||
(if (char=? (string-ref string 0) #\/) "file" "http"))
|
(if (char=? (string-ref string 0) #\/) "file" "http"))
|
||||||
url)))))
|
url)))))
|
||||||
|
|
||||||
;; string->url : str -> url
|
;; URL parsing regexp
|
||||||
;; New implementation, mostly provided by Neil Van Dyke
|
;; this is following the regexp in Appendix B of rfc 3986, except for using
|
||||||
|
;; `*' instead of `+' for the scheme part (it is checked later anyway, and
|
||||||
|
;; we don't want to parse it as a path element), and the user@host:port is
|
||||||
|
;; parsed here.
|
||||||
(define url-rx
|
(define url-rx
|
||||||
(regexp (string-append
|
(regexp (string-append
|
||||||
"^"
|
"^"
|
||||||
"[ \t\f\r\n]*"
|
"(?:" ; / scheme-colon-opt
|
||||||
"(?:" ; <A front-opt
|
"([^:/?#]*)" ; | #1 = scheme-opt
|
||||||
"(?:([^:&;?/]*):)?" ; =1 scheme-colon-opt (see below)
|
":)?" ; \
|
||||||
"(?:" ; <B slashslash-opt
|
"(?://" ; / slash-slash-authority-opt
|
||||||
"//"
|
"(?:" ; | / user-at-opt
|
||||||
"(?:([^:/@;?#]*)@)?" ; =2 user-at-opt
|
"([^/?#@]*)" ; | | #2 = user-opt
|
||||||
"([^:/@;?#]*)?" ; =3 host-opt
|
"@)?" ; | \
|
||||||
"(?::([0-9]*))?" ; =4 colon-port-opt
|
"([^/?#:]*)?" ; | #3 = host-opt
|
||||||
")?" ; >B slashslash-opt
|
"(?::" ; | / colon-port-opt
|
||||||
")?" ; >A front-opt
|
"([0-9]*)" ; | | #4 = port-opt
|
||||||
"([^?#]*)" ; =5 path
|
")?" ; | \
|
||||||
"(?:\\?([^#]*))?" ; =6 question-query-opt
|
")?" ; \
|
||||||
"(?:#(.*))?" ; =7 hash-fragment-opt
|
"([^?#]*)" ; #5 = path
|
||||||
"[ \t\f\r\n]*"
|
"(?:\\?" ; / question-query-opt
|
||||||
|
"([^#]*)" ; | #6 = query-opt
|
||||||
|
")?" ; \
|
||||||
|
"(?:#" ; / hash-fragment-opt
|
||||||
|
"(.*)" ; | #7 = fragment-opt
|
||||||
|
")?" ; \
|
||||||
"$")))
|
"$")))
|
||||||
|
|
||||||
|
;; string->url : str -> url
|
||||||
|
;; Original version by Neil Van Dyke
|
||||||
(define (string->url str)
|
(define (string->url str)
|
||||||
(apply
|
(apply
|
||||||
(lambda (scheme user host port path query fragment)
|
(lambda (scheme user host port path query fragment)
|
||||||
|
@ -382,21 +393,14 @@
|
||||||
(eq? 'windows url:os-type))
|
(eq? 'windows url:os-type))
|
||||||
(set! path (string-append host ":" path))
|
(set! path (string-append host ":" path))
|
||||||
(set! host #f))
|
(set! host #f))
|
||||||
(let* ([user (uri-decode/maybe user)]
|
(let* ([scheme (and scheme (string-downcase scheme))]
|
||||||
[port (and port (string->number port))]
|
[host (and host (string-downcase host))]
|
||||||
[abs? (and (not (= 0 (string-length path)))
|
[user (uri-decode/maybe user)]
|
||||||
(char=? #\/ (string-ref path 0)))]
|
[port (and port (string->number port))]
|
||||||
[path (separate-path-strings
|
[abs? (regexp-match? #rx"^/" path)]
|
||||||
;; If path is "" and the input is an absolute URL
|
[path (separate-path-strings path)]
|
||||||
;; with a hostname, then the intended path is "/",
|
[query (if query (form-urlencoded->alist query) '())]
|
||||||
;; but the URL is missing a "/" at the end.
|
|
||||||
path
|
|
||||||
#;
|
|
||||||
(if (and (string=? path "") host) "/" path))]
|
|
||||||
[query (if query (form-urlencoded->alist query) '())]
|
|
||||||
[fragment (uri-decode/maybe fragment)])
|
[fragment (uri-decode/maybe fragment)])
|
||||||
(when (string? scheme) (string-lowercase! scheme))
|
|
||||||
(when (string? host) (string-lowercase! host))
|
|
||||||
(make-url scheme user host port abs? path query fragment)))
|
(make-url scheme user host port abs? path query fragment)))
|
||||||
(cdr (or (regexp-match url-rx str)
|
(cdr (or (regexp-match url-rx str)
|
||||||
(url-error "Invalid URL string: ~e" str)))))
|
(url-error "Invalid URL string: ~e" str)))))
|
||||||
|
@ -405,7 +409,7 @@
|
||||||
;; If #f, and leave unmolested any % that is followed by hex digit
|
;; If #f, and leave unmolested any % that is followed by hex digit
|
||||||
;; if a % is not followed by a hex digit, replace it with %25
|
;; if a % is not followed by a hex digit, replace it with %25
|
||||||
;; in an attempt to be "friendly"
|
;; in an attempt to be "friendly"
|
||||||
(and f (uri-decode (regexp-replace* "%([^0-9a-fA-F])" f "%25\\1"))))
|
(and f (uri-decode (regexp-replace* #rx"%([^0-9a-fA-F])" f "%25\\1"))))
|
||||||
|
|
||||||
;; separate-path-strings : string[starting with /] -> (listof path/param)
|
;; separate-path-strings : string[starting with /] -> (listof path/param)
|
||||||
(define (separate-path-strings str)
|
(define (separate-path-strings str)
|
||||||
|
@ -545,5 +549,4 @@
|
||||||
(tcp-abandon-port client->server)
|
(tcp-abandon-port client->server)
|
||||||
server->client))
|
server->client))
|
||||||
|
|
||||||
|
|
||||||
))
|
))
|
||||||
|
|
|
@ -252,12 +252,16 @@
|
||||||
(test-s->u #("a+b-c456.d" #f "www.foo.com" #f #t (#("")) () #f)
|
(test-s->u #("a+b-c456.d" #f "www.foo.com" #f #t (#("")) () #f)
|
||||||
"a+b-c456.d://www.foo.com/")
|
"a+b-c456.d://www.foo.com/")
|
||||||
|
|
||||||
;; a colon can appear in absolute paths
|
;; a colon and other junk (`sub-delims') can appear in usernames
|
||||||
(test-s->u #(#f #f #f #f #t (#("x:y") #("z")) () #f)
|
(test #("http" "x:!$&'()*+,;=y" "www.drscheme.org" #f #t (#("a")) () #f)
|
||||||
"/x:y/z")
|
string->url/vec
|
||||||
|
"http://x:!$&'()*+,;=y@www.drscheme.org/a")
|
||||||
|
;; a colon and atsign can appear in absolute paths
|
||||||
|
(test-s->u #(#f #f #f #f #t (#("x:@y") #("z")) () #f)
|
||||||
|
"/x:@y/z")
|
||||||
;; and in relative paths as long as it's not in the first element
|
;; and in relative paths as long as it's not in the first element
|
||||||
(test-s->u #(#f #f #f #f #f (#("x") #("y:z")) () #f)
|
(test-s->u #(#f #f #f #f #f (#("x") #("y:@z")) () #f)
|
||||||
"x/y:z")
|
"x/y:@z")
|
||||||
|
|
||||||
;; test bad schemes
|
;; test bad schemes
|
||||||
(err/rt-test (string->url "://www.foo.com/") url-exception?)
|
(err/rt-test (string->url "://www.foo.com/") url-exception?)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user