racket/collects/little-helper/indexer/lexicon.scm
Jens Axel Soegaard ac47e02961 Initial checkin of Little Helper.
Little Helper contains a full text search engine. 
Currently it indexes all html-files in /collects/doc.
A mockup web-interface is present in order to facilitate easy experimentation with searches.
Run run-indexer to generate the index for your documentation dir. 
Run launch to start a web-server with the search interface.
Note: Currently assumes w3m is in your path (used to precompute the 
           preview-snippets shown in the search results.

svn: r8836
2008-03-01 13:26:18 +00:00

391 lines
13 KiB
Scheme
Executable File

;;; lexicon.scm
; This file contains the disk-based lexicon.
; It is used for lookups only - i.e. it is not
; used under index generation.
;;; IDEAS OF IMPROVEMENT:
; - replace generic read with a my-read-string
; that simply reads a string "..." -- and measure
; whether it is faster.
; - cache some memory blocks
;;; TERMINOLOGY
; A LEXICON is a collection of term/term-number pairs.
; A TERM is a string. A TERM-NUMBER is a natural number.
; Given a term, it is possible to determine whether
; the term is present in the lexicon, and if so, to
; find the associated term-number.
;;; INTENDED USAGE
; This implementation assume that lexicon creation happens once,
; and that lookups happen often.
; That is: Lookups should be fast and require very little memory.
; For the search engine this lexicon is used by the web server,
; when queries are made. Another lexicon is used during the
; indexing proces.
;;; FILE FORMAT
; All term/term-number pairs follow each other with no seperation.
; The term is enclosed in quotation marks. The term number
; is written (for now) as a normal integer.
; "term1"num1"term2"num2...
; Rationale:
; The total length of all terms in current lexicon
; is 1.700.000 for PLT Source Search, so the total
; lexicon size ought to be around 3-4 megabytes, which
; is fine for the web-server.
;;; PREAMBLE
;(print-struct #t)
(module lexicon mzscheme
(provide create-disk-index
for-each-term-in-disk-lexicon
lookup)
(require (lib "42.ss" "srfi")
(only (lib "43.ss" "srfi") vector-copy)
"planet/file.scm") ; (planet "io.ss" ("dherman" "io.plt"))
; Just in case, the representation is going to change
(define read-term read)
(define read-number read)
(define read-pos read)
(define write-term write)
(define write-number write)
(define write-pos write)
;;; DISK INDICES
; A disk index contains term/file-position pairs in the
; same format a lexicon is stored in. It is used to
; find relevant blocks in a lexicon.
; create-disk-index : filename filename natural ->
; Construct a disk index with term/file-position pairs
; for lexicon-file in index-file.
(define (create-disk-index lexicon-file index-file block-size)
(call-with-input-file lexicon-file
(lambda (in)
(call-with-output-file index-file
(lambda (out)
(let loop ([n 0])
(let ([pos (file-position in)]
[term (read-term in)]
[number (read-number in)])
(unless (or (eof-object? term)
(eof-object? number))
(when (zero? (modulo n block-size))
(write-term term out)
(write-pos pos out))
(loop (+ n 1))))))
'truncate))))
;;; MEMORY INDICES
; A memory index is meant to be kept in-memory.
; It is used to find appropriate blocks in the
; lexicon file.
(define-struct memory-index (terms numbers) (make-inspector))
; where terms is a vector of strings in ascending order,
; and numbers is a vector of numbers.
; It represents a mapping from terms to numbers.
; memory-index-lookup : memory-index term -> natural
; lookup term in the given memory-index:
; return
; numbers[i]
; where i is an index such that
; term[i] <= term < term[i+1]
(define (memory-index-lookup mi term)
(let ([terms (memory-index-terms mi)]
[numbers (memory-index-numbers mi)])
; binary search to find index
(let loop ([from 0]
[to (vector-length terms)])
(if (= (- to from) 1)
(vector-ref numbers from)
(let ([mid (+ from (quotient (- to from) 2))])
(if (string>=? term (vector-ref terms mid))
(loop mid to)
(loop from mid)))))))
; read-disk-index-size : filename -> natural
; return the number of terms in the disk index
; TODO: consider putting the size in the beginning of the index file.
(define (read-disk-index-size filename)
(call-with-input-file filename
(lambda (in)
(let loop ([n 0])
(let ([term (read-term in)]
[number (read-number in)])
(if (or (eof-object? term)
(eof-object? number))
n
(loop (+ n 1))))))))
; read-disk-index-block : filename natural natural -> memory-index
; read the block of size block-size situated at the file position pos
(define (read-disk-index-block index-file block-size pos)
(call-with-input-file index-file
(lambda (port)
; go to start of block
(file-position port pos)
; read block
(let ([terms (make-vector block-size)]
[numbers (make-vector block-size)])
(let loop ([n 0])
(if (= n block-size)
'normal
(let ([term (read-term port)]
[number (read-number port)])
(if (or (eof-object? term)
(eof-object? number))
; early file ending => truncate terms and numbers
(begin
(set! terms (vector-copy terms 0 n))
(set! numbers (vector-copy numbers 0 n)))
; continue
(begin
(vector-set! terms n term)
(vector-set! numbers n number)
(loop (+ n 1)))))))
; return block
(make-memory-index terms numbers)))))
; read-disk-index : filename -> memory-index-block
; read a whole disk-index into a memory-index-block
(define (read-disk-index file)
(read-disk-index-block file (read-disk-index-size file) 0))
;;; MEMORY LEXICON BLOCKS
; A memory lexicon block is an in-memory representation
; of a block of the lexicon file.
(define-struct memory-lexicon-block (terms numbers) (make-inspector))
; where terms is vector of strings
; and numbers is a vector of numbers.
; memory-lexicon-block-lookup : memory-lexicon-block term -> number or #f
; lookup term in the given memory-lexicon-block:
; returns
; numbers[i],
; where i is an index such that
; term[i] = term
; or #f if no such index exists.
(define (memory-lexicon-block-lookup mlb term)
;(display "memory-lexicon-block-lookup: ") (display mlb) (newline)
(let ([terms (memory-lexicon-block-terms mlb)]
[numbers (memory-lexicon-block-numbers mlb)])
; binary search to find index
(let loop ([from 0]
[to (vector-length terms)])
(if (= (- to from) 1)
(if (string=? term (vector-ref terms from))
(vector-ref numbers from)
#f)
(let ([mid (+ from (quotient (- to from) 2))])
(if (string>=? term (vector-ref terms mid))
(loop mid to)
(loop from mid)))))))
; read-lexicon-block : filename natural natural -> memory-lexicon-block
; read the block of size block-size situated at the file position pos
(define (read-lexicon-block file block-size pos)
(call-with-input-file file
(lambda (port)
; go to start of block
(file-position port pos)
; read block
(let ([terms (make-vector block-size)]
[numbers (make-vector block-size)])
(let loop ([n 0])
(if (= n block-size)
'normal
(let ([term (read port)]
[number (read port)])
(if (or (eof-object? term)
(eof-object? number))
; early file ending => truncate terms and numbers
(begin
(set! terms (vector-copy terms 0 n))
(set! numbers (vector-copy numbers 0 n)))
; continue
(begin
(vector-set! terms n term)
(vector-set! numbers n number)
(loop (+ n 1)))))))
; return block
(make-memory-lexicon-block terms numbers)))))
; write-memory-lexicon-block : memory-lexicon-block port ->
; write the lexicon block to port
(define (write-memory-lexicon-block mlb port)
(let ([terms (memory-lexicon-block-terms mlb)]
[numbers (memory-lexicon-block-numbers mlb)])
(do-ec (:parallel
(: t terms)
(: n numbers))
(begin
(write t port)
(write n port)))))
;;; LOOKUP
(define (lookup term lexicon-file index-file block-size)
(let* ([mem-index (get-disk-index index-file)]
[block-pos (memory-index-lookup mem-index term)]
[mem-lexicon-block (get-lexicon-block lexicon-file block-size block-pos)])
;(display (list mem-index block-pos mem-lexicon-block)) (newline)
(memory-lexicon-block-lookup mem-lexicon-block term)))
; get-disk-index : index-file-name -> memory-index-block
; cached version of read-disk-index
(define get-disk-index
(let ([cache (make-hash-table 'equal)])
(lambda (index-file)
(hash-table-get cache index-file
(lambda ()
(let ([mem-index (read-disk-index index-file)])
(hash-table-put! cache index-file mem-index)
mem-index))))))
(define get-lexicon-block
(let ([cache (make-hash-table 'equal 'weak)])
(lambda (lexicon-file block-size block-pos)
(let ([key (list lexicon-file block-size block-pos)])
(hash-table-get cache key
(lambda ()
(let ([mlb (read-lexicon-block lexicon-file block-size block-pos)])
(hash-table-put! cache key mlb)
mlb)))))))
;;; TODO TODO CACHING
;;; - cache memory index
;;; - cache lexicon blocks - even one would be an improvement
;;; SPECIAL PURPOSE
(require "config.scm")
(define (for-each-term-in-disk-lexicon index-name f)
(call-with-input-file (lexicon-path index-name)
(lambda (port)
(let loop ()
(let ([term (read-term port)]
[number (read-number port)])
(if (or (eof-object? term)
(eof-object? number))
(void)
(begin
(f term number)
(loop))))))))
; NOTE: Not used in current design
; If for-each-term-in-disk-lexicon is too slow,
; then something to try is flat-lexicon + grep
(define (create-flat-lexicon index-name)
(call-with-output-file (flat-lexicon-path index-name)
(lambda (out)
(for-each-term-in-disk-lexicon
index-name
(lambda (term number)
(display term out)
(newline out))))))
;;;
;;; TEST
;;;
(define (test-memory-index-lookup)
(define mi
(make-memory-index
(vector "a" "b" "c" "d" "e")
(vector 0 1 2 3 4)))
(equal? (map (lambda (t) (memory-index-lookup mi t))
(list "a" "b" "c" "d" "e" "f"
"aa" "bb" "cc" "dd" "ee" "ff"))
(list 0 1 2 3 4 4
0 1 2 3 4 4)))
(define (test-lexicon-block-lookup)
(define mlb
(make-memory-lexicon-block
(vector "a" "b" "c" "d" "e")
(vector 0 1 2 3 4)))
(equal? (map (lambda (t) (memory-lexicon-block-lookup mlb t))
(list "a" "b" "c" "d" "e" "f"
"aa" "bb" "cc" "dd" "ee" "ff"))
(list 0 1 2 3 4 #f
#f #f #f #f #f #f)))
(define (test-read-lexicon-block)
(define mlb
(make-memory-lexicon-block
(vector "a" "b" "c" "d" "e")
(vector 0 1 2 3 4)))
(with-temporary-file file ()
(call-with-output-file file
(lambda (port)
(write-memory-lexicon-block mlb port))
'truncate)
(let ([pos 0]
[size (vector-length
(memory-lexicon-block-terms mlb))])
(equal? (read-lexicon-block file size pos)
mlb))))
(define (test)
(and (test-memory-index-lookup)
(test-lexicon-block-lookup)
(test-read-lexicon-block)))
;;; GENERATING TEST INPUT
(define (write-alphabet-lexicon file)
;"a"0"b"1"c"2 ...
(call-with-output-file file
(lambda (out)
(do-ec (: i (char->integer (string-ref "a" 0)) (+ 1 (char->integer (string-ref "z" 0))))
(begin
(write (list->string
(list
(integer->char i)))
out)
(write (- i (char->integer (string-ref "a" 0)))
out))))
'truncate))
#;(begin
(write-alphabet-lexicon "alphabet-lexicon.txt")
(create-disk-index "alphabet-lexicon.txt" "alphabet-disk-index.txt" 5)
; (lookup term lexicon-file index-file block-size)
(list-ec (: i
(char->integer (string-ref "a" 0))
(+ 1 (char->integer (string-ref "z" 0))))
(lookup (string (integer->char i))
"alphabet-lexicon.txt" "alphabet-disk-index.txt" 5))
)
)