diff --git a/collects/net/url-sig.ss b/collects/net/url-sig.ss new file mode 100644 index 0000000..a131573 --- /dev/null +++ b/collects/net/url-sig.ss @@ -0,0 +1,18 @@ +(module url-sig mzscheme + (require (lib "unitsig.ss")) + (provide net:url^) + + (define-signature net:url^ + ((struct url (scheme host port path params query fragment)) + (struct mime-header (name value)) + get-pure-port ;; url [x list (str)] -> in-port + get-impure-port ;; url [x list (str)] -> in-port + display-pure-port ;; in-port -> () + purify-port ;; in-port -> list (mime-header) + netscape/string->url ;; (string -> url) + string->url ;; str -> url + url->string + call/input-url ;; url x (url -> in-port) x + ;; (in-port -> T) + ;; [x list (str)] -> T + combine-url/relative))) ;; url x str -> url diff --git a/collects/net/url-unit.ss b/collects/net/url-unit.ss new file mode 100644 index 0000000..e4c83db --- /dev/null +++ b/collects/net/url-unit.ss @@ -0,0 +1,530 @@ +;; To do: +;; Handle HTTP/file errors. +;; Not throw away MIME headers. +;; Determine file type. + +;; ---------------------------------------------------------------------- + +;; Input ports have two statuses: +;; "impure" = they have text waiting +;; "pure" = the MIME headers have been read + +(module url-unit mzscheme + (require (lib "file.ss") + (lib "unitsig.ss") + "url-sig.ss") + (provide url@) + + (define url@ + (unit/sig net:url^ + (import) + + (define-struct (url-exception struct:exn) ()) + + ;; This is commented out; it's here for debugging. + ;; It used to be outside the unit. + + (quote + (begin + (invoke-open-unit/sig mzlib:url@ #f) + (define url:cs (string->url "http://www.cs.rice.edu/")) + (define url:me (string->url "http://www.cs.rice.edu/~shriram/")) + (define (test url) + (call/input-url url + get-pure-port + display-pure-port)))) + + (define url-error + (lambda (fmt . args) + (let ((s (apply format fmt (map (lambda (arg) + (if (url? arg) + (url->string arg) + arg)) + args)))) + (raise (make-url-exception s (current-continuation-marks)))))) + + ;; scheme : str + #f + ;; host : str + #f + ;; port : num + #f + ;; path : str + ;; params : str + #f + ;; query : str + #f + ;; fragment : str + #f + (define-struct url (scheme host port path params query fragment)) + + ;; name : str (all lowercase; not including the colon) + ;; value : str (doesn't have the eol delimiter) + (define-struct mime-header (name value)) + + (define url->string + (lambda (url) + (let ((scheme (url-scheme url)) + (host (url-host url)) + (port (url-port url)) + (path (url-path url)) + (params (url-params url)) + (query (url-query url)) + (fragment (url-fragment url))) + (cond + ((and scheme (string=? scheme "file")) + (string-append "file:" path)) + (else + (let ((sa string-append)) + (sa (if scheme (sa scheme "://") "") + (if host host "") + (if port (sa ":" (number->string port)) "") + ; There used to be a "/" here, but that causes an + ; extra leading slash -- wonder why it ever worked! + path + (if params (sa ";" params) "") + (if query (sa "?" query) "") + (if fragment (sa "#" fragment) "")))))))) + + ;; url->default-port : url -> num + (define url->default-port + (lambda (url) + (let ((scheme (url-scheme url))) + (cond + ((not scheme) 80) + ((string=? scheme "http") 80) + (else + (url-error "Scheme ~a not supported" (url-scheme url))))))) + + ;; make-ports : url -> in-port x out-port + (define make-ports + (lambda (url) + (let ((port-number (or (url-port url) + (url->default-port url)))) + (tcp-connect (url-host url) port-number)))) + + ;; http://get-impure-port : url [x list (str)] -> in-port + (define http://get-impure-port + (case-lambda + [(url) (http://get-impure-port url '())] + [(url strings) + (let-values (((server->client client->server) + (make-ports url))) + (let ((access-string + (url->string + (make-url #f #f #f + (url-path url) (url-params url) + (url-query url) (url-fragment url))))) + (for-each (lambda (s) + (display s client->server) + (newline client->server)) + (cons (format "GET ~a HTTP/1.0" access-string) + (cons (format "Host: ~a" (url-host url)) + strings)))) + (newline client->server) + (close-output-port client->server) + server->client)])) + + ;; file://get-pure-port : url -> in-port + (define file://get-pure-port + (lambda (url) + (when (url-host url) + (url-error "Don't know how to get files from remote hosts")) + (open-input-file (url-path url)))) + + ;; get-impure-port : url [x list (str)] -> in-port + (define get-impure-port + (case-lambda + [(url) (get-impure-port url '())] + [(url strings) + (let ((scheme (url-scheme url))) + (cond + ((not scheme) + (url-error "Scheme unspecified in ~a" url)) + ((string=? scheme "http") + (http://get-impure-port url strings)) + ((string=? scheme "file") + (url-error "There are no impure file: ports")) + (else + (url-error "Scheme ~a unsupported" scheme))))])) + + ;; get-pure-port : url [x list (str)] -> in-port + (define get-pure-port + (case-lambda + [(url) (get-pure-port url '())] + [(url strings) + (let ((scheme (url-scheme url))) + (cond + ((not scheme) + (url-error "Scheme unspecified in ~a" url)) + ((string=? scheme "http") + (let ((port (http://get-impure-port url strings))) + (purify-port port) + port)) + ((string=? scheme "file") + (file://get-pure-port url)) + (else + (url-error "Scheme ~a unsupported" scheme))))])) + + ;; display-pure-port : in-port -> () + (define display-pure-port + (lambda (server->client) + (let loop () + (let ((c (read-char server->client))) + (unless (eof-object? c) + (display c) + (loop)))) + (close-input-port server->client))) + + (define empty-url? + (lambda (url) + (and (not (url-scheme url)) (not (url-params url)) + (not (url-query url)) (not (url-fragment url)) + (andmap (lambda (c) (char=? c #\space)) + (string->list (url-path url)))))) + + ;; file://combine-url/relative : fs-path x s/t/r -> fs-path + + (define file://combine-url/relative + (let ((path-segment-regexp (regexp "([^/]*)/(.*)")) + (translate-dir + (lambda (s) + (cond + [(string=? s "") 'same] ;; handle double slashes + [(string=? s "..") 'up] + [(string=? s ".") 'same] + [else s])))) + (lambda (index offset) + (let*-values ([(simple-index) (simplify-path index)] + [(base name dir?) + (split-path simple-index)]) + (if (string=? "" offset) + (build-path base name) + (build-path + (if (or dir? + (directory-exists? simple-index)) + simple-index + (if (eq? base 'relative) + 'same + base)) + (let loop ((str offset)) + (let ((m (regexp-match path-segment-regexp str))) + (cond + [(not m) str] + [else + (if (string=? "" (caddr m)) + (translate-dir (cadr m)) + (build-path (translate-dir (cadr m)) + (loop (caddr m))))]))))))))) + + ;; combine-url/relative : url x str -> url + (define combine-url/relative + (lambda (base string) + (let ((relative (string->url string))) + (cond + ((empty-url? base) ; Step 1 + relative) + ((empty-url? relative) ; Step 2a + base) + ((url-scheme relative) ; Step 2b + relative) + (else ; Step 2c + (set-url-scheme! relative (url-scheme base)) + (cond + ;; This case is here because the above tests + ;; ensure the relative extension is not really + ;; an absolute path itself, so we need not + ;; examine its contents further. + ((and (url-scheme base) ; Interloper step + (string=? (url-scheme base) "file")) + (set-url-path! relative + (file://combine-url/relative + (url-path base) + (url-path relative))) + relative) + ((url-host relative) ; Step 3 + relative) + (else + (set-url-host! relative (url-host base)) + (set-url-port! relative (url-port base)) ; Unspecified! + (let ((rel-path (url-path relative))) + (cond + ((and rel-path ; Step 4 + (not (string=? "" rel-path)) + (char=? #\/ (string-ref rel-path 0))) + relative) + ((or (not rel-path) ; Step 5 + (string=? rel-path "")) + (set-url-path! relative (url-path base)) + (or (url-params relative) + (set-url-params! relative (url-params base))) + (or (url-query relative) + (set-url-query! relative (url-query base))) + relative) + (else ; Step 6 + (merge-and-normalize + (url-path base) relative))))))))))) + + (define merge-and-normalize + (lambda (base-path relative-url) + (let ((rel-path (url-path relative-url))) + (let ((base-list (string->list base-path)) + (rel-list (string->list rel-path))) + (let* + ((joined-list + (let loop ((base (reverse base-list))) + (if (null? base) + rel-list + (if (char=? #\/ (car base)) + (append (reverse base) rel-list) + (loop (cdr base)))))) + (grouped + (let loop ((joined joined-list) (current '())) + (if (null? joined) + (list (list->string (reverse current))) + (if (char=? #\/ (car joined)) + (cons (list->string + (reverse (cons #\/ current))) + (loop (cdr joined) '())) + (loop (cdr joined) + (cons (car joined) current)))))) + (grouped + (let loop ((grouped grouped)) + (if (null? grouped) '() + (if (string=? "./" (car grouped)) + (loop (cdr grouped)) + (cons (car grouped) (loop (cdr grouped))))))) + (grouped + (let loop ((grouped grouped)) + (if (null? grouped) '() + (if (null? (cdr grouped)) + (if (string=? "." (car grouped)) '() + grouped) + (cons (car grouped) (loop (cdr grouped))))))) + (grouped + (let remove-loop ((grouped grouped)) + (let walk-loop ((r-pre '()) (post grouped)) + (if (null? post) + (reverse r-pre) + (let ((first (car post)) + (rest (cdr post))) + (if (null? rest) + (walk-loop (cons first r-pre) rest) + (let ((second (car rest))) + (if (and (not (string=? first "../")) + (string=? second "../")) + (remove-loop + (append (reverse r-pre) (cddr post))) + (walk-loop (cons first r-pre) rest))))))))) + (grouped + (let loop ((grouped grouped)) + (if (null? grouped) '() + (if (null? (cdr grouped)) grouped + (if (and (null? (cddr grouped)) + (not (string=? (car grouped) "../")) + (string=? (cadr grouped) "..")) + '() + (cons (car grouped) (loop (cdr grouped))))))))) + (set-url-path! relative-url + (apply string-append grouped)) + relative-url))))) + + ;; call/input-url : url x (url -> in-port) x (in-port -> T) + ;; [x list (str)] -> T + (define call/input-url + (let ((handle-port (lambda (server->client handler) + (dynamic-wind (lambda () 'do-nothing) + (lambda () (handler server->client)) + (lambda () (close-input-port server->client)))))) + (case-lambda + ((url getter handler) + (handle-port (getter url) handler)) + ((url getter handler params) + (handle-port (getter url params) handler))))) + + (define empty-line? + (lambda (chars) + (or (null? chars) + (and (memv (car chars) '(#\return #\linefeed #\tab #\space)) + (empty-line? (cdr chars)))))) + + (define extract-mime-headers-as-char-lists + (lambda (port) + (let headers-loop ((headers '())) + (let char-loop ((header '())) + (let ((c (read-char port))) + (if (eof-object? c) + (reverse headers) ; CHECK: INCOMPLETE MIME: SERVER BUG + (if (char=? c #\newline) + (if (empty-line? header) + (reverse headers) + (begin + (headers-loop (cons (reverse header) headers)))) + (char-loop (cons c header))))))))) + + ;; purify-port : in-port -> list (mime-header) + (define purify-port + (lambda (port) + (let ((headers-as-chars (extract-mime-headers-as-char-lists port))) + (let header-loop ((headers headers-as-chars)) + (if (null? headers) + '() + (let ((header (car headers))) + (let char-loop ((pre '()) (post header)) + (if (null? post) + (header-loop (cdr headers)) + (if (char=? #\: (car post)) + (cons (make-mime-header + (list->string (reverse pre)) + (list->string post)) + (header-loop (cdr headers))) + (char-loop (cons (char-downcase (car post)) pre) + (cdr post))))))))))) + + (define character-set-size 256) + + (define marker-list + '(#\: #\; #\? #\#)) + + (define ascii-marker-list + (map char->integer marker-list)) + + (define marker-locations + (make-vector character-set-size)) + + (define first-position-of-marker + (lambda (c) + (vector-ref marker-locations (char->integer c)))) + + ;; netscape/string->url : str -> url + (define netscape/string->url + (lambda (string) + (let ((url (string->url string))) + (if (url-scheme url) + url + (if (string=? string "") + (url-error "Can't resolve empty string as URL") + (begin + (set-url-scheme! url + (if (char=? (string-ref string 0) #\/) + "file" + "http")) + url)))))) + + ;; string->url : str -> url + (define string->url + (lambda (string) + (let loop ((markers ascii-marker-list)) + (unless (null? markers) + (vector-set! marker-locations (car markers) #f) + (loop (cdr markers)))) + (let loop ((chars (string->list string)) (index 0)) + (unless (null? chars) + (let ((first (car chars))) + (when (memq first marker-list) + (let ((posn (char->integer first))) + (unless (vector-ref marker-locations posn) + (vector-set! marker-locations posn index))))) + (loop (cdr chars) (add1 index)))) + (let + ((first-colon (first-position-of-marker #\:)) + (first-semicolon (first-position-of-marker #\;)) + (first-question (first-position-of-marker #\?)) + (first-hash (first-position-of-marker #\#))) + (let + ((scheme-start (and first-colon 0)) + (path-start (if first-colon (add1 first-colon) 0)) + (params-start (and first-semicolon (add1 first-semicolon))) + (query-start (and first-question (add1 first-question))) + (fragment-start (and first-hash (add1 first-hash)))) + (let ((total-length (string-length string))) + (let* + ((scheme-finish (and scheme-start first-colon)) + (path-finish (if first-semicolon first-semicolon + (if first-question first-question + (if first-hash first-hash + total-length)))) + (fragment-finish (and fragment-start total-length)) + (query-finish (and query-start + (if first-hash first-hash + total-length))) + (params-finish (and params-start + (if first-question first-question + (if first-hash first-hash + total-length))))) + (let ((scheme (and scheme-start + (substring string + scheme-start scheme-finish)))) + (if (and scheme + (string=? scheme "file")) + (let ((path (substring string path-start total-length))) + (if (or (relative-path? path) + (absolute-path? path)) + (make-url + scheme + #f ; host + #f ; port + path + #f ; params + #f ; query + #f) ; fragment + (url-error "scheme 'file' path ~s neither relative nor absolute" path))) + (let-values (((host port path) + (parse-host/port/path + string path-start path-finish))) + (make-url + scheme + host + port + path + (and params-start + (substring string params-start params-finish)) + (and query-start + (substring string query-start query-finish)) + (and fragment-start + (substring string fragment-start + fragment-finish)))))))))))) + + ;; parse-host/port/path : str x num x num -> (str + #f) + (num + #f) + str + (define parse-host/port/path + (lambda (path begin-point end-point) + (when (> begin-point end-point) + (url-error "Path ~s contains illegal characters" path)) + (let ((has-host? (and (>= (- end-point begin-point) 2) + (char=? (string-ref path begin-point) #\/) + (char=? (string-ref path (add1 begin-point)) + #\/)))) + (let ((begin-point (if has-host? + (+ begin-point 2) + begin-point))) + (let loop ((index begin-point) + (first-colon #f) + (first-slash #f)) + (cond + ((>= index end-point) + ;; We come here only if the string has not had a / + ;; yet. This can happen in two cases: + ;; 1. The input is a relative URL, and the hostname + ;; will not be specified. In such cases, has-host? + ;; will be false. + ;; 2. The input is an absolute URL with a hostname, + ;; and the intended path is "/", but the URL is missing + ;; a "/" at the end. has-host? must be true. + (let ((host/path (substring path begin-point end-point))) + (if has-host? + (values host/path #f "/") + (values #f #f host/path)))) + ((char=? #\: (string-ref path index)) + (loop (add1 index) (or first-colon index) first-slash)) + ((char=? #\/ (string-ref path index)) + (if first-colon + (values + (substring path begin-point first-colon) + (string->number (substring path (add1 first-colon) + index)) + (substring path index end-point)) + (if has-host? + (values + (substring path begin-point index) + #f + (substring path index end-point)) + (values + #f + #f + (substring path begin-point end-point))))) + (else + (loop (add1 index) first-colon first-slash)))))))))))