
Little Helper contains a full text search engine. Currently it indexes all html-files in /collects/doc. A mockup web-interface is present in order to facilitate easy experimentation with searches. Run run-indexer to generate the index for your documentation dir. Run launch to start a web-server with the search interface. Note: Currently assumes w3m is in your path (used to precompute the preview-snippets shown in the search results. svn: r8836
91 lines
3.0 KiB
Scheme
91 lines
3.0 KiB
Scheme
;;; This lexer is a little tricky.
|
|
;;; HtmlPrag discards source location information, so
|
|
;;; we use HtmlPrag to get the content. Then we break
|
|
;;; the content up into terms. Regular expression
|
|
;;; matching is then used to recover the positions of
|
|
;;; the terms (line and column numbers are lost, but
|
|
;;; they aren't needed).
|
|
|
|
(module html-to-txt mzscheme
|
|
(require scheme/mpair
|
|
scheme/system
|
|
(only scheme regexp-quote)
|
|
(lib "match.ss")
|
|
"planet/file.scm"
|
|
"planet/htmlprag.ss" ; version 1.3
|
|
"planet/intersperse.scm"
|
|
(prefix txt: "lexer.scm"))
|
|
|
|
(define (html-file->txt-file src-file dest-file)
|
|
(system (format "w3m -dump 'a' >'a'" src-file dest-file)))
|
|
|
|
(define (file->shtml file)
|
|
; read file and return parsed file
|
|
(with-input-from-file file
|
|
(lambda ()
|
|
(html->shtml
|
|
(port->string
|
|
(current-input-port))))))
|
|
|
|
(define (shtml->tokens s)
|
|
; extract the terms
|
|
(match s
|
|
[(t ...) (apply append (map shtml->tokens t))]
|
|
[t (if (string? t) (list t) '())]))
|
|
|
|
; freeze : mutable-tree -> immutabale-tree
|
|
; convert mpairs to pairs
|
|
(define (freeze o)
|
|
(if (mlist? o)
|
|
(map freeze (mlist->list o))
|
|
o))
|
|
|
|
(define (html-file->string file)
|
|
(apply string-append
|
|
(intersperse " "
|
|
(shtml->tokens
|
|
(freeze
|
|
(file->shtml file))))))
|
|
|
|
#;(define (html-file->terms file)
|
|
; return a list of terms occuring in file
|
|
(map car
|
|
(txt:port->tokens
|
|
(open-input-string
|
|
(apply string-append
|
|
(intersperse " "
|
|
(shtml->tokens
|
|
(freeze
|
|
(file->shtml file)))))))))
|
|
|
|
#;(define (terms->tokens terms file)
|
|
; use regexp-match-peek-positions to recover the
|
|
; positions of the terms
|
|
(let ([in (open-input-file file)])
|
|
(let loop ([start-pos 0]
|
|
[terms terms]
|
|
[tokens '()])
|
|
(cond [(null? terms)
|
|
(reverse tokens)]
|
|
[else
|
|
(let ([term (car terms)])
|
|
(display (list start-pos term)) (newline)
|
|
(let ([pos (regexp-match-peek-positions (regexp-quote term) in start-pos)])
|
|
(if pos
|
|
(loop (cdr (car pos))
|
|
(cdr terms)
|
|
(cons (list term (list 'line? 'col? (caar pos))) tokens))
|
|
(begin
|
|
(display (format "Skipped '~a'\n" term))
|
|
(loop start-pos (cdr terms) tokens)))))]))))
|
|
|
|
; (define file "/Applications/PLT Scheme v3.99.0.10/doc/reference/begin.html")
|
|
|
|
#;(txt:port->tokens
|
|
(open-input-string
|
|
(apply string-append
|
|
(shtml->tokens
|
|
(freeze
|
|
(file->shtml
|
|
"../test/html-example.html"))))))
|
|
) |