racket/collects/little-helper/indexer/planet/bit-io.scm
Jens Axel Soegaard ac47e02961 Initial checkin of Little Helper.
Little Helper contains a full text search engine. 
Currently it indexes all html-files in /collects/doc.
A mockup web-interface is present in order to facilitate easy experimentation with searches.
Run run-indexer to generate the index for your documentation dir. 
Run launch to start a web-server with the search interface.
Note: Currently assumes w3m is in your path (used to precompute the 
           preview-snippets shown in the search results.

svn: r8836
2008-03-01 13:26:18 +00:00

572 lines
22 KiB
Scheme

;;; bit-io.scm -- Jens Axel Søgaard -- april 2006
; This file started as a PLT port of Oleg's bit-reader
; <http://okmij.org/ftp/Scheme/binary-io.html#bit-reader>,
; then a bit-writer was added, and finally bit-ports were
; added.
(module bit-io mzscheme
(provide with-input-from-bit-file
with-output-to-bit-file
open-input-bit-file
open-output-bit-file
close-input-bit-port
close-output-bit-port
read-bits
write-bits
current-output-bit-port
current-input-bit-port
flush-bits
bit-file-position
call-with-input-bit-file
call-with-output-bit-file
; LOW LEVEL
;make-bit-reader
;make-bit-writer
;current-bit-reader
;current-bit-writer
;current-bit-flusher
)
; A BIT-PORT consists of an underlying (byte) port and
; the current (bit)position.
(define-struct bit-port (port pos))
; An INPUT-BIT-PORT is a bit-port with a concrete byte-reader
; which from which the corresponding bit-reader read is constructed.
(define-struct (input-bit-port bit-port) (byte-reader read))
; An OUTPUT-BIT-PORT is a bit-port with a concrete byte-writer
; from which the corresponding bit-writer write is constructed.
; Bits aren't written to the underlying byte port until a whole
; byte is received - a flush operations is thus sometimes needed
; at the end of a file.
(define-struct (output-bit-port bit-port) (byte-writer write flush))
; open-input-bit-file : path [symbol ... ] -> input-bit-port
; analog to open-input-file
(define (open-input-bit-file file . options)
(let ([byte-port (apply open-input-file file options)])
(make-input-bit-port byte-port
0
(λ () (read-byte byte-port))
(make-bit-reader (λ () (read-byte byte-port))))))
; open-output-bit-file : path [symbol ...] -> output-bit-port
; analog to open-output-file
(define (open-output-bit-file file . options)
(let ([byte-port (apply open-output-file file options)])
(let-values ([(bit-writer bit-flusher)
(make-bit-writer (λ (b) (write-byte b byte-port)))])
(make-output-bit-port byte-port
0
(λ (b) (write-byte b byte-port))
bit-writer
bit-flusher))))
; current bit-ports
(define current-input-bit-port (make-parameter 'none-yet))
(define current-output-bit-port (make-parameter 'none-yet))
; close-input-bit-port : input-bit-port ->
(define (close-input-bit-port bit-port)
(close-input-port (bit-port-port bit-port)))
; close-output-bit-port : output-bit-port ->
(define (close-output-bit-port bit-port)
((output-bit-port-flush bit-port))
(close-output-port (bit-port-port bit-port)))
; with-input-from-bit-file : path (-> alpha) [symbol ...] -> alpha
; analog to with-input-from-file
(define (with-input-from-bit-file file thunk . options)
(let ([in (apply open-input-bit-file file options)])
(begin0
(parameterize ([current-input-bit-port in])
(thunk))
(close-input-bit-port in))))
; with-output-to-bit-file : path (-> alpha) [symbol ...] -> alpha
; analog to with-to-file
(define (with-output-to-bit-file file thunk . options)
(let ([out (apply open-output-bit-file file options)])
(begin0
(parameterize ([current-output-bit-port out])
(thunk))
(close-output-bit-port out))))
(define (call-with-input-bit-file file proc . options)
(let ([in (apply open-input-bit-file file options)])
(begin0
(proc in))
(close-input-bit-port in)))
(define (call-with-output-bit-file file proc . options)
(let ([out (apply open-output-bit-file file options)])
(begin0
(proc out)
(close-output-bit-port out))))
; write-bits : natural natural [output-bit-port] ->
; write n lower bits from bs to the output-bit-port,
; if no output-bit-port is given use current-output-bit-port
(define write-bits
(case-lambda
[(n bs) (write-bits n bs (current-output-bit-port))]
[(n bs bit-port) (begin
(set-bit-port-pos! bit-port (+ (bit-port-pos bit-port) n))
((output-bit-port-write bit-port) n bs))]))
; read-bits : natural [input-bit-port] -> natural
; read n bits from the input-bit-port,
; if no input-bit-port is given, use current-input-bit-port
(define read-bits
(case-lambda
[(n) (read-bits n (current-input-bit-port))]
[(n bit-port) (begin
(set-bit-port-pos! bit-port (+ (bit-port-pos bit-port) n))
((input-bit-port-read bit-port) n))]))
; flush-bits : [output-bit-port] ->
; flush remaining bits in the cache by append zeros until a
; whole byte can be written
(define flush-bits
(case-lambda
[() (flush-bits (current-output-bit-port))]
[(out-bit-port) ((output-bit-port-flush out-bit-port))]))
; bit-file-position : bit-port -> natural
; return the bit-position of the bit-port,
; the bit-position of an input-bit-port is the number of bits read so far,
; for an output-bit-port it is the number of bits written so far
; bit-file-position : bit-port natural -> natural
; set the bit-position of the bit-port
(define bit-file-position
; todo: this only sets the position on an input port
(case-lambda
[(bit-port) (bit-port-pos bit-port)]
[(bit-port n) (begin
(unless (input-bit-port? bit-port)
(error
(string-append
"(bit-file-position bit-port n) is not implemented "
"for output bit ports.")))
(file-position (bit-port-port bit-port) (quotient n 8))
(read-bits (remainder n 8) bit-port))]))
;;;
;;; LOW LEVEL INTERFACE
;;;
; The following bit reader comes from
; <http://okmij.org/ftp/Scheme/binary-io.html#bit-reader>
; Binary parsing
;----------------------------------------
; Apologia
;
; Binary parsing and unparsing are transformations between primitive or
; composite Scheme values and their external binary representations.
;
; Examples include reading and writing JPEG, TIFF, MP3, ELF file
; formats, communicating with DNS, Kerberos, LDAP, SLP internet
; services, participating in Sun RPC and CORBA/IIOP distributed systems,
; storing and retrieving (arrays of) floating-point numbers in a
; portable and efficient way. This project will propose a set of low- and
; intermediate- level procedures that make binary parsing possible.
; Scheme is a good language to do research in text compression. Text
; compression involves a great deal of building and traversing
; dictionaries, trees and similar data structures, where Scheme
; excels. Performance doesn't matter in research, but the size of
; compressed files does (to figure out the bpc for the common
; benchmarks). Variable-bit i/o is a necessity. It is implemented
; in the present file.
; ASN.1 corresponds to a higher-level parsing (LR parser
; vs. lexer). Information in LDAP responses and X.509 certificates is
; structural and recursive, and so lends itself to be processed in
; Scheme. Variable bit i/o is necessary, and so is a binary lexer for
; a LR parser. Parsing of ASN.1 is a highly profitable enterprise
;----------------------------------------
; The outline of the project
;
; Primitives and streams
;
; - read-byte
; - read-u8vector (cf. read-string)
; - with-input-from-u8vector, with-input-from-encoded-u8vector 'base64,...
; building binary i/o streams from a sequence of bytes. Streams over
; u8vector, u16vector, etc. provide a serial access to memory. See SRFI-4
;
; - read-bit, read-bits via overlayed streams given read-byte
; implemented in the present file.
;
; - mmap-u8vector, munmap-u8vector
;
; Conversions
; - u8vector->integer u8vector endianness,
; u8vector->sinteger u8vector endianness
; These conversion procedures turn a sequence of bytes to an unsigned or
; signed integer, minding the byte order. The u8vector in question can
; have size 1,2,4,8, 3 etc. bytes. These two functions therefore can be
; used to read shorts, longs, extra longs, etc. numbers.
; - u8vector-reverse and other useful u8vector operations
;
; - modf, frexp, ldexp
; The above primitives can be emulated in R5RS, yet they are quite handy
; (for portable FP manipulation) and can be executed very efficiently by
; an FPU.
;
; Higher-level parsing and combinators
; These are combinators that can compose primitives above for more
; complex (possibly iterative) actions.
;
; - skip-bits, next-u8token,...
; - IIOP, RPC/XDR, RMI
; - binary lexer for existing LR/LL-parsers
;
; The composition of primitives and combinators will represent binary
; parsing language in a _full_ notation. This is similar to XPath
; expressions in full notation. Later we need to find out the
; most-frequently used patterns of the binary parsing language and
; design an abbreviated notation. The latter will need a special
; "interpreter". The abbreviated notation may turn out to look like
; Olin's regular expressions.
;;========================================================================
;; Configuration section
;;
; Performance is very important for binary parsing. We have to get all
; help from a particular Scheme system we can get. If a Scheme function
; can support the following primitives faster, we should take
; advantage of that fact.
;; Configuration for PLT
(define-syntax << (syntax-rules () [(_ x n) (arithmetic-shift x n)]))
(define-syntax >> (syntax-rules () [(_ x n) (arithmetic-shift x (- n))]))
(define-syntax <<1 (syntax-rules () [(_ x) (arithmetic-shift x 1)]))
(define-syntax >>1 (syntax-rules () [(_ x) (arithmetic-shift x -1)]))
(define-syntax bit-set? (syntax-rules () [(_ x mask) (not (zero? (bitwise-and x mask)))]))
;; End configuration for PLT
; combine bytes in the MSB order. A byte may be #f
(define (combine-two b1 b2) ; The result is for sure a fixnum
(and b1 b2 (bitwise-ior (<< b1 8) b2)))
(define (combine-three b1 b2 b3) ; The result is for sure a fixnum
(and b1 b2 b3 (bitwise-ior (<< (bitwise-ior (<< b1 8) b2) 8) b3)))
; Here the result may be a BIGNUM
(define (combine-bytes . bytes)
(cond
((null? bytes) 0)
((not (car bytes)) #f)
(else
(let loop ((bytes (cdr bytes)) (result (car bytes)))
(cond
((null? bytes) result)
((not (car bytes)) #f)
(else (loop (cdr bytes) (+ (car bytes) (* 256 result)))))))))
;========================================================================
; Reading a byte
; Read-byte is a fundamental primitive; it needs to be
; added to the standard. Most of the other functions are library
; procedures. The following is an approximation, which clearly doesn't
; hold if the port is a Unicode (especially UTF-8) character stream.
; The mzscheme read-byte is used.
; Return a byte as an exact integer [0,255], or the EOF object
#;(define (read-byte port)
(let ((c (read-char port)))
(if (eof-object? c) c (char->integer c))))
; The same as above, but returns #f on EOF.
(define (read-byte-f port)
(let ([b (read-byte port)])
(and (not (eof-object? b)) b)))
;========================================================================
; Bit stream
; -- Function: make-bit-reader BYTE-READER
; Given a BYTE-READER (a thunk), construct and return a function
; bit-reader N
;
; that reads N bits from a byte-stream represented by the BYTE-READER.
; The BYTE-READER is a function that takes no arguments and returns
; the current byte as an exact integer [0-255]. The byte reader
; should return #f on EOF.
; The bit reader returns N bits as an exact unsigned integer,
; 0 -... (no limit). N must be a positive integer, otherwise the bit reader
; returns #f. There is no upper limit on N -- other than the size of the
; input stream itself and the amount of (virtual) memory an OS is willing
; to give to your process. If you want to read 1M of _bits_, go ahead.
;
; It is assumed that the bit order is the most-significant bit first.
;
; Note the bit reader keeps the following condition true at all times:
; (= current-inport-pos (ceiling (/ no-bits-read 8)))
; That is, no byte is read until the very moment we really need (some of)
; its bits. The bit reader does _not_ "byte read ahead".
; Therefore, it can be used to handle a concatenation of different
; bit/byte streams *STRICTLY* sequentially, _without_ 'backing up a char',
; 'unreading-char' etc. tricks.
; For example, make-bit-reader has been used to read GRIB files of
; meteorological data, which made of several bitstreams with headers and
; tags.
; Thus careful attention to byte-buffering and optimization are the
; features of this bit reader.
;
; Usage example:
; (define bit-reader (make-bit-reader (lambda () #b11000101)))
; (bit-reader 3) ==> 6
; (bit-reader 4) ==> 2
; The test driver below is another example.
;
; Notes on the algorithm.
; The function recognizes and handles the following special cases:
; - the buffer is empty and 8, 16, 24 bits are to be read
; - reading all bits which are currently in the byte-buffer
; (and then maybe more)
; - reading only one bit
; Since the bit reader is going to be called many times, optimization is
; critical. We need all the help from the compiler/interpreter
; we can get.
(define (make-bit-reader byte-reader)
(let ((buffer 0) (mask 0) ; mask = 128 means that the buffer is full and
; the msb bit is the current (yet unread) bit
(bits-in-buffer 0))
; read the byte into the buffer and set up the counters.
; return #f on eof
(define (set-buffer)
(set! buffer (byte-reader))
(and buffer
(begin
(set! bits-in-buffer 8)
(set! mask 128)
#t)))
; Read fewer bits than there are in the buffer
(define (read-few-bits n)
(let ((value (bitwise-and buffer ; all bits in buffer
(sub1 (<<1 mask)))))
(set! bits-in-buffer (- bits-in-buffer n))
(set! mask (>> mask n))
(>> value bits-in-buffer))) ; remove extra bits
; read n bits given an empty buffer, and append them to value, n>=8
(define (add-more-bits value n)
(let loop ((value value) (n n))
(cond
((zero? n) value)
((< n 8)
(let ((rest (read-n-bits n)))
(and rest (+ (* value (<< 1 n)) rest))))
(else
(let ((b (byte-reader)))
(and b (loop (+ (* value 256) b) (- n 8))))))))
; The main module
(define (read-n-bits n)
; Check the most common cases first
(cond
((not (positive? n)) #f)
((zero? bits-in-buffer) ; the bit-buffer is empty
(case n
((8) (byte-reader))
((16)
(let ((b (byte-reader)))
(combine-two b (byte-reader))))
((24)
(let* ((b1 (byte-reader)) (b2 (byte-reader)))
(combine-three b1 b2 (byte-reader))))
(else
(cond
((< n 8)
(and (set-buffer) (read-few-bits n)))
((< n 16)
(let ((b (byte-reader)))
(and (set-buffer)
(bitwise-ior (<< b (- n 8))
(read-few-bits (- n 8))))))
(else
(let ((b (byte-reader)))
(and b (add-more-bits b (- n 8)))))))))
((= n 1) ; read one bit
(let ((value (if (bit-set? buffer mask) 1 0)))
(set! mask (>>1 mask))
(set! bits-in-buffer (sub1 bits-in-buffer))
value))
((>= n bits-in-buffer) ; will empty the buffer
(let ((n-rem (- n bits-in-buffer))
(value (bitwise-and buffer ; for mask=64, it'll be &63
(sub1 (<<1 mask)))))
(set! bits-in-buffer 0)
(cond
((zero? n-rem) value)
((<= n-rem 16)
(let ((rest (read-n-bits n-rem)))
(and rest (bitwise-ior (<< value n-rem) rest))))
(else (add-more-bits value n-rem)))))
(else (read-few-bits n))))
read-n-bits)
)
;;;
;;; BIT WRITER
;;;
; -- Function: make-bit-writer BYTE-WRITER
; Given a BYTE-WRITER (function of one argument), construct and return a function
; bit-writer N B
;
; that writes N bits represented by the integer B to a byte-stream represented
; by the BYTE-WRITER.
; The BYTE-WRITER is a function that takes one argument and writes
; the given byte as an exact integer [0-255].
; It is assumed that the bit order is the most-significant bit first.
;
; Note the bit writer will output bytes as soon as possible. That is
; the maximum number of waiting bits are 7. Call bit-writer with a
; non-number as argument to flush the remainin bits.
(define (make-bit-writer byte-writer)
(let ((buffer 0)
(bits-in-buffer 0))
(define (empty-buffer!)
(set! buffer 0)
(set! bits-in-buffer 0))
(define (low-bits n b)
(bitwise-and b (vector-ref #(0 1 3 7 15 31 63 127 255) n)))
(define (extend-buffer! n b)
(set! buffer (bitwise-ior (<< buffer n) b))
(set! bits-in-buffer (+ bits-in-buffer n)))
(define (set-buffer! n b)
(set! buffer b)
(set! bits-in-buffer n))
(define (integer-length n)
(unless (and (integer? n) (not (negative? n)))
(error "a non-negative integer was expected, got: " n))
(if (<= n 1)
1
(+ 1 (integer-length (arithmetic-shift n -1)))))
(define (flush-buffer)
(unless (zero? bits-in-buffer)
(byte-writer (<< buffer (- 8 bits-in-buffer)))))
(define (write-n-bits n b)
(when (and (number? n) (not (zero? n))
(> (integer-length b) n))
(error "doh!" (list n b)))
; (set! b (low-bits n b))
(cond
((not (positive? n))
#f)
((zero? bits-in-buffer) ; the bit-buffer is empty
(case n
((8) (byte-writer b))
((16) (byte-writer (>> (bitwise-and #b1111111100000000 b) 8))
(byte-writer (bitwise-and #b11111111 b)))
(else
(let ([r (remainder n 8)])
(cond
((zero? r) (for-each byte-writer
(let loop ([n n] [b b] [l '()])
(if (= n 0)
l
(loop (- n 8)
(>> b 8)
(cons (bitwise-and #b11111111 b) l))))))
((< n 8) (set-buffer! n b))
((< n 16) (byte-writer (>> b (- n 8)))
(set-buffer! (- n 8) (bitwise-and b (>> #b11111111 (- 16 n)))))
(else (let ([bits-to-buffer (remainder n 8)]) ; output all whole bytes, and buffer the rest
(write-n-bits (- n bits-to-buffer)
(>> b bits-to-buffer))
(set-buffer! bits-to-buffer
(bitwise-and b (vector-ref #(0 1 3 7 15 31 63 127 255) r))))))))))
((< n (- 8 bits-in-buffer)) ; everything goes to the buffer
(extend-buffer! n b))
(else
(let ([m (- 8 bits-in-buffer)])
;(display (list buffer bits-in-buffer n) (current-error-port))
; (flush-output (current-error-port))
; the buffer and the initial bits make a byte
(byte-writer (bitwise-ior (<< buffer m)
(>> b (- n m))))
(empty-buffer!)
; write the rest
(write-n-bits (- n m) (bitwise-xor
b (<< (>> b (- n m))
(- n m))))))))
(values write-n-bits flush-buffer)))
;;; TEST
#;
(define (test)
(define (naturals n)
(do ([i 0 (+ i 1)]
[l '() (cons i l)])
[(= i n) l]))
; write the numbers 999 ... 1 to "tmp" and read them again
(with-output-to-file "tmp"
(lambda ()
(let-values
([(write flush) (make-bit-writer write-byte)])
(for-each (lambda (n)
(write n n))
(naturals 100))
(flush)))
'replace)
(with-input-from-file "tmp"
(lambda ()
(let ([r (make-bit-reader read-byte)])
(for-each (lambda (n)
(display (r n))
(display " "))
(naturals 100))))))
;;;
;;; PARAMETERS
;;;
(define current-bit-reader (make-parameter (make-bit-reader read-byte)))
(define-values (current-bit-writer current-bit-flusher)
(let-values ([(writer flusher) (make-bit-writer write-byte)])
(values (make-parameter writer)
(make-parameter flusher))))
)