racket/collects/little-helper/indexer/indexer.scm
Jens Axel Soegaard ac47e02961 Initial checkin of Little Helper.
Little Helper contains a full text search engine.
Currently it indexes all HTML files in /collects/doc.
A mockup web interface is included to make it easy to experiment with searches.
Run run-indexer to generate the index for your documentation dir.
Run launch to start a web server with the search interface.
Note: Currently assumes w3m is in your path (it is used to precompute the
      preview snippets shown in the search results).

svn: r8836
2008-03-01 13:26:18 +00:00


;;; indexer.scm
(module indexer mzscheme
  (provide (struct index (name documents lexicon positions))
           (struct lexicon (terms term-numbers))
           index-files
           delete-index
           load-index
           save-index
           (struct document (path weight))
           lookup-document-path
           term-number->term
           read-frequency
           number-of-documents
           number-of-terms
           read-inverted-list
           token-case-sensitive)
;;;
;;; INDEXER - SORT BASED - WITH COMPRESSION - GAPS VERSION
;;;
; This version stores the term numbers as gaps. This implies
; that we need to write the initial temporary file in sorted
; blocks. The tokenizing and the in-memory sorting phases are
; therefore combined.
;; The sort-based algorithm:
; 1. For each term in each document a record is made in
; a temporary file.
; A record consists of a term number, a document number
; and the frequency of the term in the document.
; 2. Internal sorting is done with MAX-RECORDS-IN-MEMORY
; records at a time. Two records are compared first
; by term number, then by document number.
; 3. Pairwise merging is used to get the whole file sorted.
; 4. The inverted file is output.
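; Illustrative example (the values are invented for exposition): the record
; (12 3 5) says that term number 12 occurs 5 times in document number 3.
; Under the ordering of step 2, (12 3 5) sorts before (12 7 1), which sorts
; before (13 1 2): term number first, document number as tie-breaker.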
(require
 (lib "match.ss")
 (lib "file.ss")
 (lib "list.ss")
 (lib "serialize.ss")
 ; (all-except (planet "42.ss" ("soegaard" "srfi.plt" 1 2)) index)
 (all-except (file "planet/srfi.plt/1/2/42.ss") index)
 (rename (file "planet/srfi.plt/1/2/42.ss") Index index)
 "planet/bit-io.scm"    ; (planet "bit-io.scm" ("soegaard" "bit-io.plt" 2 0))
 "planet/dotimes.scm"   ; (planet "control.scm" ("soegaard" "control.plt")) ; dotimes
 "config.scm"
 "compression.scm"
 "lexer.scm")
;;;
;;; INDEX
;;;
; An INDEX is a
(define-serializable-struct index (name documents lexicon positions) (make-inspector))
; where name is unused,
; and documents is a hash-table from document numbers to document structures,
; and lexicon is a lexicon structure (the in-memory kind used while constructing the index, see below),
; and positions is a vector, which associates a term-number with its bit-position
; in the inverted file.
; NOTE:
(define (save-index path i . options)
  (apply with-output-to-file path
         (λ () (write (serialize i)))
         options))
(define (delete-index path)
  (when (file-exists? path)
    (delete-file path)))
(define (load-index path)
  (cond [(file-exists? path) ; TODO for debug
         (with-input-from-file path
           (λ () (deserialize (read))))]
        [else
         (error (format "the index file '~a' doesn't exist\n" path))]))
(define (number-of-terms index)
  (lexicon-size (index-lexicon index)))
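; A minimal usage sketch (the file name "doc.idx" and the variable my-index
; are hypothetical; the options are passed on to with-output-to-file):
;   (save-index "doc.idx" my-index 'replace)
;   (define i (load-index "doc.idx"))
;   (number-of-terms i)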
;;;
;;; DOCUMENTS
;;;
; A DOCUMENT is a
(define-serializable-struct document (path weight) (make-inspector))
; where
;   path is the file path, and
;   weight is  W_d = sqrt( sum_{t=1..n} w_{d,t}^2 ),  where  w_{d,t} = 1 + ln(f_{d,t})
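; Worked example (illustrative, per the formula above): a document containing
; two distinct terms with frequencies 2 and 3 gets
;   W_d = sqrt( (1 + ln 2)^2 + (1 + ln 3)^2 ) ≈ sqrt(2.87 + 4.40) ≈ 2.70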
(define (make-empty-documents)
  (make-hash-table 'equal))
(define (number-of-documents index)
  (hash-table-count (index-documents index)))
; return the path of document number d, or #f if d is unknown
(define (lookup-document-path index d)
  (let ([doc (hash-table-get (index-documents index) d (λ () #f))])
    (if doc
        (document-path doc)
        #f)))
;;;
;;; LEXICON
;;;
; Each term is associated with a term number (id).
; During tokenizing we need term->term-number lookups.
; In later phases we need term-number->term lookups.
; A LEXICON is a
(define-serializable-struct lexicon (terms term-numbers) (make-inspector))
; where terms is a hash table associating byte strings with term numbers,
; and term-numbers is a vector of terms (byte strings).
(define (make-empty-lexicon)
  (make-lexicon (make-hash-table 'equal)
                #f))
(define (lexicon-size lexicon)
  (hash-table-count (lexicon-terms lexicon)))
(define (lexicon-insert! lexicon term)
  ; return the term-number of term; if term is not
  ; present, insert it first
  (let ([terms (lexicon-terms lexicon)])
    (hash-table-get terms term
                    (λ ()
                      (let ([id (add1 (lexicon-size lexicon))])
                        (hash-table-put! terms term id)
                        id)))))
; lookup-term : lexicon bytes -> (union term-number #f)
(define (lookup-term lexicon term)
  (hash-table-get (lexicon-terms lexicon) term (λ () #f)))
; term-number->term : index natural -> term
; return the term with term-number t
(define (term-number->term index t)
  (vector-ref (lexicon-term-numbers (index-lexicon index)) t))
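; A minimal usage sketch (the byte strings below are invented):
;   (define l (make-empty-lexicon))
;   (lexicon-insert! l #"lambda")   ; => 1  (a new term gets the next number)
;   (lexicon-insert! l #"define")   ; => 2
;   (lexicon-insert! l #"lambda")   ; => 1  (already present)
;   (lookup-term l #"car")          ; => #f
; Note that term-number->term only works after index-files has filled in the
; term-numbers vector (see the "Finish up" step below).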
;;;
;;; RECORDS
;;;
(define (make-record term-number doc-number freq)
  (list term-number doc-number freq))
(define write-term-gap write-number/unary)
(define read-term-gap read-number/unary)
(define write-document-number write-number/delta)
(define read-document-number read-number/delta)
(define write-frequency write-number/gamma)
(define read-frequency read-number/gamma)
(define write-record
  (case-lambda
    [(term-number+doc-number+freq)
     (write-record term-number+doc-number+freq (current-output-bit-port))]
    [(term-number+doc-number+freq out-bit-port)
     (match term-number+doc-number+freq
       [(term-number doc-number freq)
        (write-term-gap term-number out-bit-port)
        (write-document-number doc-number out-bit-port)
        (write-frequency freq out-bit-port)])]))
(define read-record
  (case-lambda
    [() (read-record (current-input-bit-port))]
    [(in-bit-port) (list (read-term-gap in-bit-port)
                         (read-document-number in-bit-port)
                         (read-frequency in-bit-port))]))
(define (skip-records n port)
  (do-ec (:repeat n)
         (read-record port)))
(define (record<? r1 r2)
  (or (< (car r1) (car r2))
      (and (= (car r1) (car r2))
           (< (cadr r1) (cadr r2)))))
(define (write-sorted-records rs)
  (do-ec (:let prev-t 1)
         (:list r rs)
         (:match (t d f) r)
         (:let tg (- t prev-t))
         (begin
           (write-record (list tg d f))
           (set! prev-t t))))
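; Example (an invented record list): with prev-t starting at 1, the sorted records
;   (3 1 2) (3 4 1) (7 2 5)
; are written as the gap-encoded records
;   (2 1 2) (0 4 1) (4 2 5)
; since 3-1=2, 3-3=0 and 7-3=4; document numbers and frequencies are unchanged.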
;;;
;;; UTIL
;;;
(define-syntax swap!
  (syntax-rules ()
    [(swap! x y)
     (let ([t x])
       (set! x y)
       (set! y t))]))
;;;
;;; INDEXING
;;;
(define (merge-two-blocks size1 in1 size2 in2 out)
  (let ([prev-t1 1]
        [prev-t2 1]
        [prev-t  1])
    ; read the next record of the first (resp. second) block, converting the
    ; stored term gap back into an absolute term number
    (define (read1)
      (match (read-record in1)
        [(tg d f)
         (set! prev-t1 (+ prev-t1 tg))
         (list prev-t1 d f)]))
    (define (read2)
      (match (read-record in2)
        [(tg d f)
         (set! prev-t2 (+ prev-t2 tg))
         (list prev-t2 d f)]))
    ; write a record to the merged output, gap-encoding the term number again
    (define (write r)
      (match r
        [(t d f)
         (write-record (list (- t prev-t) d f) out)
         (set! prev-t t)]))
    ; in1 reads the first block; advance in2 past it, so in2 reads the second block
    (skip-records size1 in2)
    (let loop ([r1 (read1)]
               [r2 (read2)]
               [s1 (sub1 size1)]
               [s2 (sub1 size2)])
      (cond
        [(record<? r1 r2)
         (write r1)
         (if (> s1 0)
             (loop (read1) r2
                   (sub1 s1) s2)
             (begin
               (write r2)
               (dotimes (_ s2)
                 (write (read2)))))]
        [else
         (write r2)
         (if (> s2 0)
             (loop r1 (read2)
                   s1 (sub1 s2))
             (begin
               (write r1)
               (dotimes (_ s1)
                 (write (read1)))))]))
    ; advance in1 past the second block, so both ports are positioned at the next block pair
    (skip-records size2 in1)))
;
(define (external-merge-sort number-of-records block-size
                             in-file out-file . options)
  (cond
    [(>= block-size number-of-records)
     (values in-file out-file)]
    [else
     (display "*")
     (let ([number-of-block-pairs
            (quotient number-of-records (* 2 block-size))])
       (let* ([in1 (open-input-bit-file in-file)]
              [in2 (open-input-bit-file in-file)]
              [out (apply open-output-bit-file out-file options)])
         (dotimes (_ number-of-block-pairs)
           (merge-two-blocks block-size in1 block-size in2 out))
         (let ([left (- number-of-records
                        (* 2 number-of-block-pairs block-size))])
           (if (> left block-size)
               (merge-two-blocks block-size in1 (- left block-size) in2 out)
               (do-ec (:repeat left)
                      (write-record (read-record in1) out))))
         (close-input-bit-port in1)
         (close-input-bit-port in2)
         (close-output-bit-port out)
         (apply external-merge-sort
                number-of-records (* 2 block-size)
                out-file in-file options)))]))
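; Example (illustrative sizes): for 10 records written in sorted blocks of 2,
; the recursion merges blocks of 2 into 4, of 4 into 8, and of 8 into 16 >= 10,
; i.e. three passes alternating between the two files before one of them holds
; the fully sorted run.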
(define (index-files files-to-index result-path name lexer)
  ; 1. Initialization
  (display "INITIALIZATION\n")
  (let ([documents (make-empty-documents)]
        [lexicon   (make-empty-lexicon)]
        [tmp       (make-temporary-file "tmp1-~a.data" #f "/tmp/")]
        [tmp2      (make-temporary-file "tmp2-~a.data" #f "/tmp/")])
    ; 2. Process text and write records to a temporary file
    (display "TOKENIZING\n")
    (let ([number-of-records
           (with-output-to-bit-file tmp
             (λ ()
               (letrec ([records '()]
                        [len 0]
                        [add-record!   (λ (r)
                                         (set! len (+ len 1))
                                         (set! records (cons r records)))]
                        [sort-and-save (λ ()
                                         (write-sorted-records (quicksort records record<?))
                                         (set! records '()))])
                 (begin0
                   (sum-ec (:parallel
                            ;(:repeat 10) ; early stopping for test runs
                            (: file-path (Index doc-number) files-to-index))
                           (begin (hash-table-put! documents doc-number (make-document file-path 0)))
                           ; (if (indexable-file? file-path))
                           ;(begin (hash-table-put! documents doc-number (make-document file-path 0)))
                           (: f (calculate-term-frequencies lexicon file-path lexer))
                           (:match (term-number . freq) f)
                           (begin
                             (add-record! (make-record term-number doc-number freq))
                             (if (zero? (remainder len MAX-RECORDS-IN-MEMORY))
                                 (sort-and-save))
                             1))
                   (sort-and-save))))
             'replace)])
      ; 3. Internal sorting of blocks of size MAX-RECORDS-IN-MEMORY
      ;    NOTE: In this version of the indexer, this phase was folded into the previous phase
      ;          - see the book.
      ; (display "INTERNAL SORT\n")
      ; (internal-sort number-of-records MAX-RECORDS-IN-MEMORY
      ;                read-record write-record
      ;                tmp tmp2 'update)
      ; (swap! tmp tmp2)
      ; 4. Merging
      (display "MERGING\n")
      (let-values ([(sorted-file temporary-file)
                    (external-merge-sort number-of-records MAX-RECORDS-IN-MEMORY
                                         tmp tmp2 'replace)])
        (delete-file temporary-file)
        ; 5. i)  Output inverted file.
        ;        Format for each entry:
        ;          t n g1 f1 ... gn fn
        ;        where t is the term number, gi are gaps between document numbers, i.e.
        ;        d1=g1, d2=g1+g2, etc., and fi are frequencies.
        ;    ii) Record the bit position of each term in the inverted file.
        (let* ([number-of-terms (lexicon-size lexicon)]
               [positions       (make-vector (add1 number-of-terms) #f)]
               [ns              (make-vector (add1 number-of-terms) 0)])
          ; pass 1: for each t find n
          (with-input-from-bit-file sorted-file
            (λ ()
              (do-ec (:let t 1)
                     (:repeat number-of-records)
                     (:match (tg d f) (read-record))
                     (begin
                       (set! t (+ t tg))
                       (vector-set! ns t (add1 (vector-ref ns t)))))))
          ; pass 2: output the index
          (with-input-from-bit-file sorted-file
            (λ ()
              (with-output-to-bit-file result-path
                (λ ()
                  (do-ec (:let t 1)
                         (: k 1 (add1 number-of-terms))
                         ; first record
                         (:let n (vector-ref ns k))
                         (:let pos (bit-file-position (current-output-bit-port)))
                         (:match (tg d f) (read-record))
                         (begin (set! t (+ t tg)))
                         (:let dg d)
                         (begin
                           (vector-set! positions t pos)
                           (write-number t)
                           (write-number n)
                           (write-number dg)
                           (write-frequency f))
                         ; remaining n-1 records
                         (:let prev-d d)
                         (:repeat (- n 1))
                         (:match (tg d f) (read-record))
                         (begin (set! t (+ t tg)))
                         (:let dg (- d prev-d))
                         (begin
                           (write-number dg)
                           (write-frequency f)
                           (set! prev-d d))))
                'replace)))
          (delete-file sorted-file)
          ; 6. Finish up
          (let ([term-numbers (make-vector (add1 (lexicon-size lexicon)) #"")])
            ; run through the hash-table and make the term-number to term association
            (hash-table-for-each (lexicon-terms lexicon)
                                 (λ (term term-number)
                                   (vector-set! term-numbers term-number term)))
            (set-lexicon-term-numbers! lexicon term-numbers)
            (calculate-document-weights (make-index name documents lexicon positions) result-path)))))))
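; A minimal usage sketch (the paths and the lexer below are hypothetical; a lexer
; is a procedure that is called as (lexer file-path handle-token) and invokes
; (handle-token term position) once per token -- see calculate-term-frequencies):
;   (define idx
;     (index-files (list "a.html" "b.html")   ; files to index
;                  "/tmp/inverted.data"       ; resulting inverted file
;                  "docs"                     ; index name (currently unused)
;                  my-html-lexer))            ; lexer, e.g. from "lexer.scm"
;   (save-index "/tmp/index.data" idx 'replace)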
; calculate-term-frequencies : lexicon path lexer -> (list (cons term-number frequency))
; return a list of term-number/frequency pairs;
; the terms in the file are added to the lexicon
(define (calculate-term-frequencies lexicon file lexer)
  (let ([frequencies (make-hash-table 'equal)])
    (define (increase-frequency! term-number)
      (let ([f (hash-table-get frequencies term-number
                               (λ ()
                                 (hash-table-put! frequencies term-number 0)
                                 0))])
        (hash-table-put! frequencies term-number (add1 f))))
    (define (insert-and-increase! term)
      (let ([term-number (lexicon-insert! lexicon term)])
        (increase-frequency! term-number)))
    ; for each token in the file:
    (lexer file (λ (term pos) (insert-and-increase! term)))
    (let ([fs '()])
      (hash-table-for-each frequencies
                           (λ (term-number freq)
                             (set! fs (cons (cons term-number freq) fs))))
      fs)))
(define (read-inverted-list)
  ; Format for each entry:
  ;   t n g1 f1 ... gn fn
  ; where t is the term number, gi are gaps between
  ; document numbers, i.e.
  ;   d1=g1, d2=g1+g2, ...
  ; and fi are frequencies.
  (let* ([t (read-number (current-input-bit-port))]
         [n (read-number)])
    (list t n
          (list-ec (:let d 0)
                   (: _ n)
                   (:let dg (read-number))
                   (:let f (read-frequency))
                   (begin
                     (set! d (+ d dg))
                     (list d f))))))
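; Example (illustrative values): an entry with t=5, n=2 and the gap/frequency
; pairs g1=3 f1=1, g2=4 f2=2 decodes to (5 2 ((3 1) (7 2))), i.e. term 5
; occurs in document 3 with frequency 1 and in document 7 with frequency 2.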
(define (calculate-document-weights an-index path)
  (let ([ws (make-vector (add1 (number-of-documents an-index)) 0)]
        [N  (number-of-documents an-index)])
    ; sum the w_{d,t}^2 = log(1 + N/f_{d,t})^2
    (match an-index
      [($ index ignored-path documents lexicon positions)
       (with-input-from-bit-file path
         (λ ()
           (do-ec (:repeat (number-of-terms an-index))
                  (:let il (read-inverted-list))
                  (:match (t n dfs) il)
                  (:list df dfs)
                  (:let d (car df))
                  (:let f (cadr df))
                  (vector-set! ws d (+ (vector-ref ws d)
                                       (expt (log (+ 1.0 (/ N f))) 2))))))])
    ; take square roots
    (do-ec (:vector w (Index i) ws)
           (vector-set! ws i (sqrt w)))
    ; store the results
    (hash-table-map
     (index-documents an-index)
     (λ (d doc)
       (set-document-weight! doc (vector-ref ws d))
       doc))
    ; return index
    an-index))
)