racket/collects/little-helper/indexer/lexer.scm
Jens Axel Soegaard ac47e02961 Initial checkin of Little Helper.
Little Helper contains a full text search engine. 
Currently it indexes all html-files in /collects/doc.
A mockup web-interface is present in order to facilitate easy experimentation with searches.
Run run-indexer to generate the index for your documentation dir. 
Run launch to start a web-server with the search interface.
Note: Currently assumes w3m is in your path (used to precompute the
           preview-snippets shown in the search results).

svn: r8836
2008-03-01 13:26:18 +00:00

106 lines
3.6 KiB
Scheme

; TODO: How is @foo to be handled?
; - insert both @foo and foo ?
(module lexer mzscheme
(provide skip-to-next-token
read-token
for-each-token
for-each-token-in-file
document->tokens
token-case-sensitive)
(require (lib "match.ss"))
; When #f, read-token down-cases every token it returns; defaults to
; case-sensitive (#t).
(define token-case-sensitive (make-parameter #t))
; utf-regexp : string -> byte-regexp
; Compiles a regexp over the UTF-8 encoding of str, so matching can be
; done directly on byte ports without decoding.
(define (utf-regexp str)
(byte-regexp (string->bytes/utf-8 str)))
; Regexp fragments (as strings) for the characters that end a token.
(define whitespace "[ \n\r\t]")
; NOTE(review): `brackets` is NOT a complete character class on its own —
; the opening `[` is never closed here.  It only becomes valid after the
; string-append in `delimiter` below, where the following `special`
; fragment ("[\\|]") supplies the closing `]`.  The resulting class
; matches ] [ ( ) { } | — which happens to be the intended set, but be
; careful when editing any of these fragments.
(define brackets "[]\\[\\(\\){}")
(define punctuation "[,'`;]")
(define special "[\\|]")
; One delimiter character: whitespace, punctuation, bracket, bar or ".
(define delimiter (string-append "(" whitespace "|" punctuation "|"
brackets "|" special "|" "\"" ")"))
; Zero or more delimiter characters.
(define delimiter* (string-append delimiter "*"))
; MzScheme's identifier and symbol syntax is considerably more liberal
; than the syntax specified by R5RS. When input is scanned for tokens,
; the following characters delimit an identifier in addition to whitespace:
; " , ' ` ; ( ) [ ] { }
; Matches (and thus consumes) any run of delimiters before a token.
(define non-symbol-starter-regexp (utf-regexp delimiter*))
; In addition, an identifier cannot start with a hash mark (``#'') unless
; the hash mark is immediately followed by a percent sign (``%''). The only
; other special characters are backslash (``\'') and quoting vertical
; bars (``|''); any other character is used as part of an identifier.
; skip-to-next-token : -> void
; Advances (current-input-port) past any run of delimiter characters.
; A `#' that is not immediately followed by `%' cannot begin an
; identifier (only `#%...' names may start with a hash mark), so such a
; `#' is consumed and skipping resumes; a `#%' is left in place for
; read-token to pick up.
(define (skip-to-next-token)
  (regexp-match non-symbol-starter-regexp (current-input-port))
  (when (eqv? (peek-char) #\#)
    (unless (equal? (peek-string 2 0) "#%")
      (read-char)
      (skip-to-next-token))))
; One character that may appear inside a token, i.e. anything except the
; delimiters listed above (the `]' directly after `^' is a literal).
(define non-delimiter "[^]\\[\\(\\){} \n\r\t,'`;\\|\"]")
; A (possibly empty) run of token characters.
(define non-delimiter* (string-append non-delimiter "*"))
; Compiled byte-regexp used by read-token to grab one maximal token.
(define non-delimiter*-regexp (utf-regexp non-delimiter*))
; bytes-downcase : bytes -> bytes
; Lower-cases a byte string by round-tripping it through a UTF-8
; string; bytes that do not decode as UTF-8 are replaced by #\space.
(define (bytes-downcase bs)
  (let ([decoded (bytes->string/utf-8 bs #\space)])
    (string->bytes/utf-8 (string-downcase decoded))))
; read-token : [boolean] -> (list bytes position) | #f
; Reads the next maximal run of non-delimiter bytes from
; (current-input-port) and returns a two-element list of the token and
; the position where reading began; returns #f when the regexp yields
; no match.  The token is down-cased unless (token-case-sensitive).
; With count-lines? true the position is the (line col pos) triple from
; port-next-location; otherwise it is the plain file-position offset.
(define read-token
  (case-lambda
    [() (read-token #f)]
    [(count-lines?)
     (let* ([pos (if count-lines?
                     (let-values ([(line col pos)
                                   (port-next-location (current-input-port))])
                       (list line col pos))
                     (file-position (current-input-port)))]
            [hit (regexp-match non-delimiter*-regexp (current-input-port))])
       (and hit
            ; the regexp has no subgroups, so the whole match is (car hit)
            (let ([tok (car hit)])
              (list (if (token-case-sensitive)
                        tok
                        (bytes-downcase tok))
                    pos))))]))
; for-each-token : (token -> any) [boolean] -> void
; Applies f to every token remaining on (current-input-port), in order.
; count-lines? is passed through to read-token and selects the position
; representation attached to each token (defaults to #f).
(define for-each-token
  (case-lambda
    [(f) (for-each-token f #f)]
    [(f count-lines?)
     (let loop ()
       (unless (eof-object? (peek-char))
         (skip-to-next-token)
         ; skipping may have consumed the rest of the port
         (unless (eof-object? (peek-char))
           (let ([token (read-token count-lines?)])
             (unless token
               (error "internal error: token expected after skipping"))
             (f token)
             (loop)))))]))
; for-each-token-in-file : path (token -> any) [boolean] -> void
; Opens file and applies f to each token in it, in order.  The optional
; count-lines? argument (default #f, preserving the old two-argument
; behaviour) is forwarded to for-each-token; when true, line counting is
; enabled on the port so tokens carry (line col pos) positions — the
; original version always dropped this flag, making it inconsistent
; with document->tokens below.
(define for-each-token-in-file
  (case-lambda
    [(file f)
     (for-each-token-in-file file f #f)]
    [(file f count-lines?)
     (call-with-input-file file
       (lambda (port)
         (when count-lines?
           ; required for port-next-location to report lines/columns
           (port-count-lines! port))
         (parameterize ([current-input-port port])
           (for-each-token f count-lines?))))]))
; document->tokens : path -> (list (list byte-string position))
; Reads file with line counting enabled and returns all of its tokens
; in document order, each paired with its (line col pos) location.
(define (document->tokens file)
  (call-with-input-file file
    (λ (port)
      (port-count-lines! port)
      (parameterize ([current-input-port port])
        (let ([acc '()])
          (for-each-token (λ (token) (set! acc (cons token acc)))
                          #t)
          ; tokens were consed on in reverse; restore document order
          (reverse acc))))))
)