;;; indexer.scm (module indexer mzscheme (provide (struct index (name documents lexicon positions)) (struct lexicon (terms term-numbers)) index-files delete-index load-index save-index (struct document (path weight)) lookup-document-path term-number->term read-frequency number-of-documents number-of-terms read-inverted-list token-case-sensitive) ;;; ;;; INDEXER - SORT BASED - WITH COMPRESSION - GAPS VERSION ;;; ; This version stores the term numbers as gaps. This implies ; that we need to write the initial temporary file in sorted ; blocks. The tokenizing and the in-memory sorting phases are ; therefore combined. ;; The sort-based algorithm: ; 1. For each term in each document a record is made in ; a temporary file. ; A record consists of a term number, a document number ; and the frequency of the term in the document. ; 2. Internal sorting is done with MAX-RECORDS-IN-MEMORY ; records at a time. Two records are compared first ; by term number, then by document number. ; 3. Pairwise merging is used to get the whole file sorted. ; 4. The inverted file is output. (require (lib "match.ss") (lib "file.ss") (lib "list.ss") (lib "serialize.ss") ; (all-except (planet "42.ss" ("soegaard" "srfi.plt" 1 2)) index) (all-except (file "planet/srfi.plt/1/2/42.ss") index) (rename (file "planet/srfi.plt/1/2/42.ss") Index index) "planet/bit-io.scm" ; (planet "bit-io.scm" ("soegaard" "bit-io.plt" 2 0)) "planet/dotimes.scm" ; (planet "control.scm" ("soegaard" "control.plt")) ; dotimes "config.scm" "compression.scm" "lexer.scm") ;;; ;;; INDEX ;;; ; An INDEX is a (define-serializable-struct index (name documents lexicon positions) (make-inspector)) ; where name is unused, ; and documents is a hash-table from document numbers to document structures, ; and lexicon is lexicon structure (the in-memory kind while constructing kind, see below), ; and positions is a vector, which associates a term-number with its bit-position ; in the inverted file. ; NOTE: (define (save-index path i . options) (apply with-output-to-file path (λ () (write (serialize i))) options)) (define (delete-index path) (when (file-exists? path) (delete-file path))) (define (load-index path) (cond [(file-exists? path) ; TODO for debug (with-input-from-file path (λ () (deserialize (read))))] [else (error (format "the index file '~a' doesn't exist\n" path))])) (define (number-of-terms index) (lexicon-size (index-lexicon index))) ;;; ;;; DOCUMENTS ;;; ; A DOCUMENT is a (define-serializable-struct document (path weight) (make-inspector)) ; where ; path is the file path ; n 2 ; weight is W = sqrt( sum w ) , where w = 1 + ln( f ) ; d t=1 d,t d,t d,t (define (make-empty-documents) (make-hash-table 'equal)) (define (number-of-documents index) (hash-table-count (index-documents index))) (define (lookup-document-path index d) (let ([doc (hash-table-get (index-documents index) d)]) (if doc (document-path doc) #f))) ;;; ;;; LEXICON ;;; ; Each term is associated with a term number (id). ; During tokenizing we need term->term-number lookups. ; In later phases we need term-number->term lookups. ; A LEXICON is a (define-serializable-struct lexicon (terms term-numbers) (make-inspector)) ; where terms is a hash table associating byte strings with term numbers, ; and term-numbers is an vector of terms (byte strings). (define (make-empty-lexicon) (make-lexicon (make-hash-table 'equal) #f)) (define (lexicon-size lexicon) (hash-table-count (lexicon-terms lexicon))) (define (lexicon-insert! lexicon term) ; return the term-number of term, if term is not ; present insert it first (let ([terms (lexicon-terms lexicon)]) (hash-table-get terms term (λ () (let ([id (add1 (lexicon-size lexicon))]) (hash-table-put! terms term id) id))))) ; lookup-term : lexicon bytes -> (union term-number #f) (define (lookup-term lexicon term) (hash-table-get (lexicon-terms lexicon) term (λ () #f))) ; term-number->term : index natural -> term ; return the term with term-number t (define (term-number->term index t) (vector-ref (lexicon-term-numbers (index-lexicon index)) t)) ;;; ;;; RECORDS ;;; (define (make-record term-number doc-number freq) (list term-number doc-number freq)) (define write-term-gap write-number/unary) (define read-term-gap read-number/unary) (define write-document-number write-number/delta) (define read-document-number read-number/delta) (define write-frequency write-number/gamma) (define read-frequency read-number/gamma) (define write-record (case-lambda [(term-number+doc-number+freq) (write-record term-number+doc-number+freq (current-output-bit-port))] [(term-number+doc-number+freq out-bit-port) (match term-number+doc-number+freq [(term-number doc-number freq) (write-term-gap term-number out-bit-port) (write-document-number doc-number out-bit-port) (write-frequency freq out-bit-port)])])) (define read-record (case-lambda [() (read-record (current-input-bit-port))] [(in-bit-port) (list (read-term-gap in-bit-port) (read-document-number in-bit-port) (read-frequency in-bit-port))])) (define (skip-records n port) (do-ec (:repeat n) (read-record port))) (define (record s1 0) (loop (read1) r2 (sub1 s1) s2) (begin (write r2) (dotimes (_ s2) (write (read2)))))] [else (write r2) (if (> s2 0) (loop r1 (read2) s1 (sub1 s2)) (begin (write r1) (dotimes (_ s1) (write (read1)))))])) (skip-records size2 in1))) ; (define (external-merge-sort number-of-records block-size in-file out-file . options) (cond [(>= block-size number-of-records) (values in-file out-file)] [else (display "*") (let ([number-of-block-pairs (quotient number-of-records (* 2 block-size))]) (let* ([in1 (open-input-bit-file in-file)] [in2 (open-input-bit-file in-file)] [out (apply open-output-bit-file out-file options)]) (dotimes (_ number-of-block-pairs) (merge-two-blocks block-size in1 block-size in2 out)) (let ([left (- number-of-records (* 2 number-of-block-pairs block-size))]) (if (> left block-size) (merge-two-blocks block-size in1 (- left block-size) in2 out) (do-ec (:repeat left) (write-record (read-record in1) out)))) (close-input-bit-port in1) (close-input-bit-port in2) (close-output-bit-port out) (apply external-merge-sort number-of-records (* 2 block-size) out-file in-file options)))])) (define (index-files files-to-index result-path name lexer) ; 1. Initialization (display "INITIALIZATION\n") (let ([documents (make-empty-documents)] [lexicon (make-empty-lexicon)] [tmp (make-temporary-file "tmp1-~a.data" #f "/tmp/")] [tmp2 (make-temporary-file "tmp2-~a.data" #f "/tmp/")]) ; 2. Process text and write records to a temporary file (display "TOKENIZING\n") (let ([number-of-records (with-output-to-bit-file tmp (λ () (letrec ([records '()] [len 0] [add-record! (λ(r) (set! len (+ len 1)) (set! records (cons r records)))] [sort-and-save (λ () (write-sorted-records (quicksort records record (list (cons term-number frequecy)) ; return a list of term-number/frequecy pairs ; terms in file will be added to the lexicon (define (calculate-term-frequencies lexicon file lexer) (let ([frequencies (make-hash-table 'equal)]) (define (increase-frequency! term-number) (let ([f (hash-table-get frequencies term-number (λ () (hash-table-put! frequencies term-number 0) 0))]) (hash-table-put! frequencies term-number (add1 f)))) (define (insert-and-increase! term) (let ([term-number (lexicon-insert! lexicon term)]) (increase-frequency! term-number))) ; for each token in the file: (lexer file (λ (term pos) (insert-and-increase! term))) (let ([fs '()]) (hash-table-for-each frequencies (λ (term freq) (set! fs (cons (cons term freq) fs)))) fs))) (define (read-inverted-list) ; Format for each entry: ; t n g1 f1 ... gn gn ; where t is the term number, gi are gaps between ; document numbers, i.e. ; d1=g1, d2=g1+g2, ... ; and fi are frequencies. (let* ([t (read-number (current-input-bit-port))] [n (read-number)]) (list t n (list-ec (:let d 0) (: _ n) (:let dg (read-number)) (:let f (read-frequency)) (begin (set! d (+ d dg)) (list d f)))))) (define (calculate-document-weights an-index path) (let ([ws (make-vector (add1 (number-of-documents an-index)) 0)] [N (number-of-documents an-index)]) ; sum the wdt^2 = log(1 + N/fdt)^2 (match an-index [($ index ignored-path documents lexicon positions) (with-input-from-bit-file path (λ () (do-ec (:repeat (number-of-terms an-index)) (:let il (read-inverted-list)) (:match (t n dfs) il) (:list df dfs) (:let d (car df)) (:let f (cadr df)) (vector-set! ws d (+ (vector-ref ws d) (expt (log (+ 1.0 (/ N f))) 2))))))]) ; take square roots (do-ec (:vector w (Index i) ws) (vector-set! ws i (sqrt w))) ; store the results (hash-table-map (index-documents an-index) (λ (d doc) (set-document-weight! doc (vector-ref ws d)) doc)) ; return index an-index)) )