
Little Helper contains a full text search engine. Currently it indexes all html-files in /collects/doc. A mockup web-interface is present in order to facilitate easy experimentation with searches. Run run-indexer to generate the index for your documentation dir. Run launch to start a web-server with the search interface. Note: Currently assumes w3m is in your path (used to precompute the preview-snippets shown in the search results. svn: r8836
255 lines
8.9 KiB
Scheme
255 lines
8.9 KiB
Scheme
(module query mzscheme
|
|
(require (lib "match.ss")
|
|
(lib "list.ss")
|
|
(lib "67.ss" "srfi")
|
|
|
|
"planet/bit-io.scm" ; (planet "bit-io.scm" ("soegaard" "bit-io.plt"))
|
|
; (all-except (planet "42.ss" ("soegaard" "srfi.plt" 1 2)) index)
|
|
(all-except (file "planet/srfi.plt/1/2/42.ss") index)
|
|
|
|
; is = integer set
|
|
; (prefix is: (planet "set.scm" ("soegaard" "galore.plt" 2 2)))
|
|
(prefix is: (file "planet/galore.plt/2/2/set.scm"))
|
|
|
|
(prefix disk: "lexicon.scm")
|
|
"indexer.scm"
|
|
"lexer.scm"
|
|
"config.scm")
|
|
|
|
(provide (all-defined))
|
|
|
|
;;;
|
|
;;; BASIC QUERY
|
|
;;;
|
|
|
|
; The basic query is given a single term (byte-string) and an index,
|
|
; to lookup the term and return the corresponding inverted-list.
|
|
|
|
; The inverted list for a term with term number t is
|
|
; (t n ((d1 f1) ... (dn fn))
|
|
; where t is the term number,
|
|
; n is the number of documents the term occurs in,
|
|
; di is a document number,
|
|
; fi is the frequency of the term in document di
|
|
|
|
(define (position-of-term index-name term)
|
|
(disk:lookup term
|
|
(lexicon-path index-name)
|
|
(lexicon-index-path index-name)
|
|
(current-lexicon-block-size)))
|
|
|
|
; search : byte-string -> inverted-list
|
|
(define (search term)
|
|
(let* ([index-name (current-index-name)]
|
|
[pos (position-of-term index-name term)])
|
|
(if pos
|
|
(with-input-from-bit-file (inverted-path index-name)
|
|
(λ ()
|
|
(bit-file-position (current-input-bit-port) pos)
|
|
(read-inverted-list)))
|
|
#f)))
|
|
|
|
(define (inverted-list->human index il)
|
|
(if (not il)
|
|
#f
|
|
(match il
|
|
[(t n dfs)
|
|
(list ; (term-number->term index t)
|
|
n
|
|
(map (λ (df)
|
|
(match df
|
|
[(d f) (list (document-path (hash-table-get (index-documents index) d))
|
|
f)]))
|
|
dfs))])))
|
|
|
|
;;;
|
|
;;; CONJUCTIVE QUERY
|
|
;;;
|
|
|
|
; search-or : (list terms) -> (list document-number)
|
|
; find documents that contain at least one given term
|
|
(define (search-or list-of-terms ils)
|
|
; inverted-list -> set of document numbers
|
|
(define (il->is:ds il) (is:list->set integer-compare (map first (third il))))
|
|
(cond
|
|
; 0 terms
|
|
[(null? list-of-terms)
|
|
'()]
|
|
; 1 term
|
|
[(null? (cdr list-of-terms))
|
|
(cond
|
|
[(search (car list-of-terms))
|
|
=> inverted-list->document-numbers]
|
|
[else
|
|
'()])]
|
|
[else
|
|
; 2 or more terms
|
|
(let (; ignore terms with no documents
|
|
[ils (filter (λ (x) x) ils)])
|
|
; calculate union
|
|
(is:elements
|
|
(foldl (λ (il cds) (is:union cds (il->is:ds il)))
|
|
(il->is:ds (first ils))
|
|
(rest ils))))]))
|
|
|
|
|
|
; search-and : (list terms) -> (list document-number)
|
|
; find documents containing all given terms
|
|
(define (search-and list-of-terms ils)
|
|
(define (il->is:ds il) (is:list->set integer-compare (map first (third il))))
|
|
(cond
|
|
[(null? list-of-terms)
|
|
'()]
|
|
[(null? (cdr list-of-terms))
|
|
(cond
|
|
[(search (car list-of-terms))
|
|
=> inverted-list->document-numbers]
|
|
[else
|
|
'()])]
|
|
[else
|
|
; for each term find the inverted list
|
|
(cond
|
|
[(not (andmap (λ (x) x) ils))
|
|
'()]
|
|
[else
|
|
; order the inverted list, so the rarest terms are first
|
|
; (this speeds up the intersection operation)
|
|
(let ([ils (quicksort ils (λ (il1 il2) (< (second il1) (second il2))))])
|
|
; find the intersection of all document document numbers
|
|
(is:elements
|
|
(foldl (λ (il cds)
|
|
(is:intersection cds (il->is:ds il)))
|
|
(il->is:ds (first ils))
|
|
(rest ils))))])]))
|
|
|
|
(define (inverted-list->document-numbers il)
|
|
(map first (third il)))
|
|
|
|
(define (bytes->string b)
|
|
(bytes->string/utf-8 b (string-ref " " 0)))
|
|
|
|
(define (split-in-terms query-string sensitivity)
|
|
; use whitespace to split the search string
|
|
(let ([terms '()])
|
|
(parameterize ([current-input-port (open-input-string query-string)]
|
|
[token-case-sensitive sensitivity])
|
|
(for-each-token
|
|
(λ (t) (set! terms (cons (bytes->string (car t)) terms)))))
|
|
(reverse terms)))
|
|
|
|
|
|
; The inverted list for a term with term number t is
|
|
; (t n ((d1 f1) ... (dn fn))
|
|
; where t is the term number,
|
|
; n is the number of documents the term occurs in,
|
|
; di is a document number,
|
|
; fi is the frequency of the term in document di
|
|
|
|
(define (inverted-list-term-number il)
|
|
(first il))
|
|
|
|
(define (inverted-list-number-of-documents il)
|
|
(second il))
|
|
|
|
(define (inverted-list-document-frequencies il)
|
|
(third il))
|
|
|
|
|
|
(define (rank index ils)
|
|
; ils are the inverted lists for all terms t in the query
|
|
; ds is a list of document numbers
|
|
|
|
; Note: This hash-table relies on eq? on fixnums
|
|
(let ([A (make-hash-table)]
|
|
[N (exact->inexact (number-of-documents index))])
|
|
(do-ec (:list il ils)
|
|
(if il) ; il is #f if the term isn't in the index
|
|
(:let t (inverted-list-term-number il))
|
|
(:let ft (inverted-list-number-of-documents il))
|
|
(:let dfs (inverted-list-document-frequencies il))
|
|
(:let wt (+ 1.0 (log (/ N ft))))
|
|
(:list d+f dfs)
|
|
(:let d (car d+f))
|
|
(:let fdt (cadr d+f))
|
|
(hash-table-put! A d (+ (hash-table-get A d (λ () 0.0))
|
|
(* wt (log (+ 1.0 fdt))))))
|
|
|
|
(let ([documents (index-documents index)])
|
|
(hash-table-map A (λ (d score)
|
|
(let ([Wd (document-weight (hash-table-get documents d))])
|
|
(cons d (/ score Wd))))))))
|
|
|
|
|
|
; query-normal : ... -> (list document-number)
|
|
(define (query-normal index query-string sensitive? contain-all-terms)
|
|
; handle and/or queries
|
|
(let* ([terms (split-in-terms query-string sensitive?)]
|
|
[ils (map (λ (term) (search term)) terms)])
|
|
(display terms (current-error-port))
|
|
(newline (current-error-port))
|
|
(let ([dss (mergesort (rank index ils)
|
|
(λ (p1 p2)
|
|
(> (cdr p1) (cdr p2))))])
|
|
(let ([result-ds
|
|
(if contain-all-terms
|
|
; and
|
|
(is:list->set integer-compare
|
|
(search-and terms ils))
|
|
; or
|
|
(is:list->set integer-compare
|
|
(search-or terms ils)))])
|
|
(filter (λ (ds)
|
|
(is:member? (car ds) result-ds))
|
|
dss)))))
|
|
|
|
|
|
; search-regular : (list terms) -> (list document-number)
|
|
; find documents that contain at least one given term, that
|
|
; match the regular expression
|
|
(define (search-regular-expression index a-regexp)
|
|
(let (; 1. Compile the regular expression
|
|
[re (regexp a-regexp)]
|
|
; 2. Find all terms that match the pattern
|
|
[ts '()])
|
|
(display "pattern match: " (current-error-port))
|
|
(display a-regexp (current-error-port))
|
|
(newline (current-error-port))
|
|
(display "search-regular-expression: for-each-term-in-lexicon\n"
|
|
(current-error-port))
|
|
(time (disk:for-each-term-in-disk-lexicon
|
|
(index-name index)
|
|
(lambda (t n)
|
|
(when (regexp-match re t)
|
|
(set! ts (cons t ts))))))
|
|
(set! ts (reverse ts))
|
|
; 3. Convert from terms to document numbers
|
|
(display "search-regular-expression: search for all matches\n"
|
|
(current-error-port))
|
|
(time (rank index
|
|
(foldl (lambda (t ils)
|
|
(let ([il (search t)])
|
|
(if il
|
|
(cons il ils)
|
|
ils)))
|
|
'() ts)))
|
|
))
|
|
|
|
(define (query-regular-expression index a-regexp)
|
|
(search-regular-expression index a-regexp))
|
|
|
|
(define (query index query-string sensitive? contain-all-terms type-normal)
|
|
(parameterize ([current-index-name (index-name index)])
|
|
; type-normal = #t => normal and/or search
|
|
; type-normal = #f => regular expression search
|
|
(let ([terms (split-in-terms query-string #t)])
|
|
(values (cond
|
|
; query is whitespace only
|
|
[(null? terms)
|
|
'()]
|
|
[type-normal
|
|
(query-normal index query-string sensitive? contain-all-terms)]
|
|
[else
|
|
(query-regular-expression index query-string)])
|
|
terms))))
|
|
|
|
) |