
Little Helper contains a full-text search engine. Currently it indexes all HTML files in /collects/doc. A mock-up web interface is included to make it easy to experiment with searches. Run run-indexer to generate the index for your documentation directory, then run launch to start a web server with the search interface. Note: this currently assumes w3m is on your path (it is used to precompute the preview snippets shown in the search results). svn: r8836
; TODO: How is @foo to be handled?
;       - insert both @foo and foo ?

(module lexer mzscheme
  (provide skip-to-next-token
           read-token
           for-each-token
           for-each-token-in-file
           document->tokens
           token-case-sensitive)

  (require (lib "match.ss"))

  ; When this parameter is #f, tokens are downcased before they are returned.
  (define token-case-sensitive (make-parameter #t))

  ; utf-regexp : string -> byte-regexp
  ; Compile a regexp given as a string into a byte regexp, so that matching
  ; works directly on the raw (UTF-8) bytes of the input port.
  (define (utf-regexp str)
    (byte-regexp (string->bytes/utf-8 str)))

  ; Character classes used to build the delimiter regexp.
  ; Note that brackets has no closing "]": its character class is only
  ; completed when special is appended below, so the combined class also
  ; covers "|" and "[".
  (define whitespace  "[ \n\r\t]")
  (define brackets    "[]\\[\\(\\){}")
  (define punctuation "[,'`;]")
  (define special     "[\\|]")
  (define delimiter   (string-append "(" whitespace "|" punctuation "|"
                                     brackets "|" special "|" "\"" ")"))
  (define delimiter*  (string-append delimiter "*"))

  ; MzScheme's identifier and symbol syntax is considerably more liberal
  ; than the syntax specified by R5RS. When input is scanned for tokens,
  ; the following characters delimit an identifier in addition to whitespace:
  ;     "  ,  '  `  ;  (  )  [  ]  {  }
  (define non-symbol-starter-regexp (utf-regexp delimiter*))

  ; In addition, an identifier cannot start with a hash mark ("#") unless
  ; the hash mark is immediately followed by a percent sign ("%"). The only
  ; other special characters are backslash ("\") and quoting vertical
  ; bars ("|"); any other character is used as part of an identifier.

  ; skip-to-next-token : -> void
  ; Consume delimiter characters from the current input port until it is
  ; positioned at the start of the next token. A "#" is skipped as well,
  ; unless it begins a "#%" identifier.
  (define (skip-to-next-token)
    ; (display (format "> ~a\n" (peek-bytes 20 0)))
    (regexp-match non-symbol-starter-regexp (current-input-port))
    (if (eqv? (peek-char) #\#)
        (unless (equal? (peek-string 2 0) "#%")
          (read-char)
          (skip-to-next-token))))
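
  ; A rough sketch (not part of the original code) of the intended behaviour,
  ; on a made-up input string:
  ;
  ;   > (parameterize ([current-input-port (open-input-string "  '(#t #%app)")])
  ;       (skip-to-next-token)
  ;       (peek-string 4 0))
  ;   "t #%"
  ;
  ; The whitespace, quote and open parenthesis are consumed, and the leading
  ; "#" of "#t" is dropped because it is not followed by "%".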

  (define non-delimiter  "[^]\\[\\(\\){} \n\r\t,'`;\\|\"]")
  (define non-delimiter* (string-append non-delimiter "*"))
  (define non-delimiter*-regexp (utf-regexp non-delimiter*))

  ; bytes-downcase : bytes -> bytes
  ; Downcase a UTF-8 encoded byte string; decoding errors are replaced with
  ; a space. Used when token-case-sensitive is #f.
  (define (bytes-downcase bs)
    (string->bytes/utf-8
     (string-downcase
      (bytes->string/utf-8 bs #\space))))

  ; read-token : [count-lines?] -> (list byte-string position)
  ; Read the token that starts at the current position of the current input
  ; port and return it together with its position. The position is a
  ; (line column position) list when count-lines? is true, and a plain file
  ; position otherwise.
  (define read-token
    (case-lambda
      [()
       (read-token #f)]
      [(count-lines?)
       (let* ([pos (if count-lines?
                       (let-values
                           ([(line col pos) (port-next-location (current-input-port))])
                         (list line col pos))
                       (file-position (current-input-port)))]
              [m (regexp-match non-delimiter*-regexp (current-input-port))])
         (and m
              (match m
                [(token . _)
                 (list (if (token-case-sensitive)
                           token
                           (bytes-downcase token))
                       pos)])))]))
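
  ; An illustrative example (not part of the original module); the input
  ; string is made up:
  ;
  ;   > (parameterize ([current-input-port (open-input-string "define x")])
  ;       (read-token))
  ;   (#"define" 0)
  ;
  ; Tokens are returned as byte strings; with count-lines? the second element
  ; is a (line column position) list instead of a file position.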

  ; for-each-token : (token -> any) [count-lines?] -> void
  ; Apply f to every token read from the current input port, in order.
  (define for-each-token
    (case-lambda
      [(f)
       (for-each-token f #f)]
      [(f count-lines?)
       (unless (eof-object? (peek-char))
         (skip-to-next-token)
         (unless (eof-object? (peek-char))
           (let ([token (read-token count-lines?)])
             (if token
                 (begin
                   (f token)
                   (for-each-token f count-lines?))
                 (error "internal error: token expected after skipping")))))]))
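
  ; An illustrative run (input invented for the example):
  ;
  ;   > (parameterize ([current-input-port (open-input-string "(foo \"bar\" baz)")])
  ;       (for-each-token (lambda (t) (display (car t)) (newline))))
  ;   foo
  ;   bar
  ;   baz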

  (define (for-each-token-in-file file f)
    (with-input-from-file file
      (lambda ()
        (for-each-token f))))

  ; document->tokens : path -> (list (list byte-string position))
  (define (document->tokens file)
    (let ([tokens '()])
      (call-with-input-file file
        (λ (port)
          (port-count-lines! port)
          (parameterize ([current-input-port port])
            (for-each-token (λ (token) (set! tokens (cons token tokens)))
                            #t))))
      (reverse tokens)))
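
  ; Example use (the file name and output are hypothetical, assuming a file
  ; that starts with "<html>" on line 1 and "<head>" on line 2):
  ;
  ;   > (document->tokens "some-page.html")
  ;   ((#"<html>" (1 0 1)) (#"<head>" (2 0 8)) ...)
  ;
  ; Each element pairs a token with a (line column position) list, since line
  ; counting is enabled on the port above.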

  )