
Fix a bug in encoding strings with characters between 128 and 256, avoid extra bindings for conversion vectors; remove calling `noels-tests' from `main' since it's already called from `tests', etc.
241 lines
9.8 KiB
Racket
241 lines
9.8 KiB
Racket
#lang racket/base
|
|
|
|
#|
|
|
People used to wonder why semicolons were the default. We then
|
|
decided to switch the default back to ampersands --
|
|
|
|
http://www.w3.org/TR/html401/appendix/notes.html#h-B.2.2
|
|
|
|
We recommend that HTTP server implementors, and in particular, CGI
|
|
implementors support the use of ";" in place of "&" to save authors
|
|
the trouble of escaping "&" characters in this manner.
|
|
|
|
See more in PR8831.
|
|
|#
|
|
|
|
;;; <uri-codec.rkt> ---- En/Decode URLs and form-urlencoded data
|
|
;;; Time-stamp: <03/04/25 10:31:31 noel>
|
|
;;;
|
|
;;; Copyright (C) 2002 by Noel Welsh.
|
|
;;;
|
|
;;; This file is part of Net.
|
|
|
|
;;; Net is free software; you can redistribute it and/or
|
|
;;; modify it under the terms of the GNU Lesser General Public
|
|
;;; License as published by the Free Software Foundation; either
|
|
;;; version 2.1 of the License, or (at your option) any later version.
|
|
|
|
;;; Net is distributed in the hope that it will be useful,
|
|
;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
;;; Lesser General Public License for more details.
|
|
|
|
;;; You should have received a copy of the GNU Lesser General Public
|
|
;;; License along with Net; if not, write to the Free Software
|
|
;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
|
|
;;; 02110-1301 USA.
|
|
|
|
;;; Author: Noel Welsh <noelwelsh@yahoo.com>
|
|
;;
|
|
;; Commentary:
|
|
|
|
;; The module provides functions to encode and decode strings using
|
|
;; the URI encoding rules given in RFC 2396, and to encode and decode
|
|
;; name/value pairs using the application/x-www-form-urlencoded
|
|
;; mimetype given the in HTML 4.0 specification. There are minor
|
|
;; differences between the two encodings.
|
|
|
|
;; The URI encoding uses allows a few characters to be represented `as
|
|
;; is': a-Z, A-Z, 0-9, -, _, ., !, ~, *, ', ( and ). The remaining
|
|
;; characters are encoded as %xx, where xx is the hex representation
|
|
;; of the integer value of the character (where the mapping
|
|
;; character<->integer is determined by US-ASCII if the integer is
|
|
;; <128).
|
|
|
|
;; The encoding, inline with RFC 2396's recommendation, represents a
|
|
;; character as is, if possible. The decoding allows any characters
|
|
;; to be represented by their hex values, and allows characters to be
|
|
;; incorrectly represented `as is'.
|
|
|
|
;; The rules for the application/x-www-form-urlencoded mimetype given
|
|
;; in the HTML 4.0 spec are:
|
|
|
|
;; 1. Control names and values are escaped. Space characters are
|
|
;; replaced by `+', and then reserved characters are escaped as
|
|
;; described in [RFC1738], section 2.2: Non-alphanumeric characters
|
|
;; are replaced by `%HH', a percent sign and two hexadecimal digits
|
|
;; representing the ASCII code of the character. Line breaks are
|
|
;; represented as "CR LF" pairs (i.e., `%0D%0A').
|
|
|
|
;; 2. The control names/values are listed in the order they appear
|
|
;; in the document. The name is separated from the value by `=' and
|
|
;; name/value pairs are separated from each other by `&'.
|
|
|
|
;; NB: RFC 2396 supersedes RFC 1738.
|
|
|
|
;; This differs slightly from the straight encoding in RFC 2396 in
|
|
;; that `+' is allowed, and represents a space. We follow this
|
|
;; convention, encoding a space as `+' and decoding `+' as a space.
|
|
;; There appear to be some brain-dead decoders on the web, so we also
|
|
;; encode `!', `~', `'', `(' and ')' using their hex representation.
|
|
;; This is the same choice as made by the Java URLEncoder.
|
|
|
|
;; Draws inspiration from encode-decode.scm by Kurt Normark and a code
|
|
;; sample provided by Eli Barzilay
|
|
|
|
(require racket/string racket/list)
|
|
|
|
(provide uri-encode uri-decode
|
|
uri-path-segment-encode uri-path-segment-decode
|
|
uri-userinfo-encode uri-userinfo-decode
|
|
uri-unreserved-encode uri-unreserved-decode
|
|
uri-path-segment-unreserved-encode uri-path-segment-unreserved-decode
|
|
form-urlencoded-encode form-urlencoded-decode
|
|
alist->form-urlencoded form-urlencoded->alist
|
|
current-alist-separator-mode)
|
|
|
|
(define (self-map-chars str) (map (λ (ch) (cons ch ch)) (string->list str)))
|
|
|
|
;; The characters that always map to themselves
|
|
(define alphanumeric-mapping
|
|
(self-map-chars
|
|
"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"))
|
|
|
|
;; Characters that sometimes map to themselves
|
|
;; called 'mark' in RFC 3986
|
|
(define safe-mapping (self-map-chars "-_.!~*'()"))
|
|
|
|
;; The strict URI mapping
|
|
(define uri-mapping `(,@alphanumeric-mapping ,@safe-mapping))
|
|
|
|
;; The uri path segment mapping from RFC 3986
|
|
(define path-segment-extra-mapping (self-map-chars "@+,=$&:"))
|
|
(define uri-path-segment-mapping `(,@uri-mapping ,@path-segment-extra-mapping))
|
|
|
|
;; from RFC 3986
|
|
(define unreserved-mapping `(,@alphanumeric-mapping ,@(self-map-chars "-._~")))
|
|
|
|
;; The uri path segment mapping from RFC 3986
|
|
(define uri-path-segment-unreserved-mapping
|
|
`(,@unreserved-mapping ,@path-segment-extra-mapping))
|
|
|
|
;; from RFC 3986
|
|
(define sub-delims-mapping (self-map-chars "!$&'()*+,;="))
|
|
|
|
;; The uri userinfo mapping from RFC 3986
|
|
(define uri-userinfo-mapping
|
|
`(,@unreserved-mapping ,@sub-delims-mapping ,@(self-map-chars ":")))
|
|
|
|
;; The form-urlencoded mapping
|
|
(define form-urlencoded-mapping
|
|
`(,@(self-map-chars ".-*_") (#\space . #\+) ,@alphanumeric-mapping))
|
|
|
|
(define (number->hex-string number)
|
|
(define (hex n) (string-ref "0123456789ABCDEF" n))
|
|
(string #\% (hex (quotient number 16)) (hex (modulo number 16))))
|
|
|
|
(define ascii-size 128)
|
|
|
|
;; (listof (cons char char)) -> (values (vectorof string) (vectorof int))
|
|
(define (make-codec-tables alist)
|
|
(define encoding-table (build-vector ascii-size number->hex-string))
|
|
(define decoding-table (build-vector ascii-size values))
|
|
(for ([orig+enc (in-list alist)])
|
|
(vector-set! encoding-table
|
|
(char->integer (car orig+enc))
|
|
(string (cdr orig+enc)))
|
|
(vector-set! decoding-table
|
|
(char->integer (cdr orig+enc))
|
|
(char->integer (car orig+enc))))
|
|
(values encoding-table decoding-table))
|
|
|
|
;; vector string -> string
|
|
(define (encode table str)
|
|
;; First, check for an ASCII string with no conversion needed:
|
|
(if (for/and ([char (in-string str)])
|
|
(define v (char->integer char))
|
|
(and (< v ascii-size)
|
|
(let ([s (vector-ref table v)])
|
|
(and (= 1 (string-length s))
|
|
(eq? char (string-ref s 0))))))
|
|
str
|
|
(string-append* (for/list ([byte (in-bytes (string->bytes/utf-8 str))])
|
|
(if (< byte ascii-size)
|
|
(vector-ref table byte)
|
|
(number->hex-string byte))))))
|
|
|
|
;; vector string -> string
|
|
(define (decode table str)
|
|
(define max-ascii (integer->char ascii-size))
|
|
(define (internal-decode l)
|
|
(if (null? l) '()
|
|
(let* ([c (car l)] [l (cdr l)]
|
|
[hex (and (equal? #\% c) (pair? l) (pair? (cdr l))
|
|
(string->number (string (car l) (cadr l)) 16))])
|
|
(if hex (cons hex (internal-decode (cddr l)))
|
|
(cons (if (char<? c max-ascii)
|
|
(vector-ref table (char->integer c))
|
|
;; This should probably error, but strings to be decoded
|
|
;; might come from misbehaving sources; maybe it's better
|
|
;; to add some parameter for a permissive mode
|
|
(bytes->list (string->bytes/utf-8 (string c))))
|
|
(internal-decode l))))))
|
|
(bytes->string/utf-8 (apply bytes (internal-decode (string->list str)))))
|
|
|
|
;; Utility for defining codecs
|
|
(define-syntax-rule (define-codecs [encoder decoder mapping] ...)
|
|
(begin (define-values [encoder decoder]
|
|
(let-values ([(v:en v:de) (make-codec-tables mapping)])
|
|
(define (encoder str) (encode v:en str))
|
|
(define (decoder str) (decode v:de str))
|
|
(values encoder decoder)))
|
|
...))
|
|
|
|
;; All of these are string -> string
|
|
(define-codecs
|
|
[uri-encode uri-decode uri-mapping]
|
|
[uri-path-segment-encode uri-path-segment-decode uri-path-segment-mapping]
|
|
[uri-userinfo-encode uri-userinfo-decode uri-userinfo-mapping]
|
|
[uri-unreserved-encode uri-unreserved-decode unreserved-mapping]
|
|
[uri-path-segment-unreserved-encode uri-path-segment-unreserved-decode
|
|
uri-path-segment-unreserved-mapping]
|
|
[form-urlencoded-encode form-urlencoded-decode form-urlencoded-mapping])
|
|
|
|
;; listof (cons string string) -> string
|
|
;; http://www.w3.org/TR/html401/appendix/notes.html#ampersands-in-uris
|
|
;; listof (cons symbol string) -> string
|
|
(define (alist->form-urlencoded args)
|
|
(string-join
|
|
(for/list ([arg (in-list args)])
|
|
(define name (form-urlencoded-encode (symbol->string (car arg))))
|
|
(define value (and (cdr arg) (form-urlencoded-encode (cdr arg))))
|
|
(if value (string-append name "=" value) name))
|
|
(if (memq (current-alist-separator-mode) '(semi semi-or-amp)) ";" "&")))
|
|
|
|
;; string -> listof (cons string string)
|
|
;; http://www.w3.org/TR/html401/appendix/notes.html#ampersands-in-uris
|
|
(define (form-urlencoded->alist str)
|
|
(define sep-regexp
|
|
(case (current-alist-separator-mode)
|
|
[(semi) #rx"[;]"]
|
|
[(amp) #rx"[&]"]
|
|
[else #rx"[&;]"]))
|
|
(if (equal? "" str) '()
|
|
(for/list ([keyval (in-list (regexp-split sep-regexp str))])
|
|
;; m = #f => no "=..." part
|
|
(define m (regexp-match-positions #rx"=" keyval))
|
|
(cons (string->symbol (form-urlencoded-decode
|
|
(if m (substring keyval 0 (caar m)) keyval)))
|
|
(and m (form-urlencoded-decode (substring keyval (cdar m))))))))
|
|
|
|
(define current-alist-separator-mode
|
|
(make-parameter 'amp-or-semi
|
|
(lambda (s)
|
|
(unless (memq s '(amp semi amp-or-semi semi-or-amp))
|
|
(raise-type-error 'current-alist-separator-mode
|
|
"'amp, 'semi, 'amp-or-semi, or 'semi-or-amp"
|
|
s))
|
|
s)))
|
|
|
|
;;; uri-codec.rkt ends here
|