racket/collects/little-helper/indexer/lexer.scm
Jens Axel Soegaard ac47e02961 Initial checkin of Little Helper.
Little Helper contains a full text search engine. 
Currently it indexes all html-files in /collects/doc.
A mockup web-interface is present in order to facilitate easy experimentation with searches.
Run run-indexer to generate the index for your documentation dir. 
Run launch to start a web-server with the search interface.
Note: Currently assumes w3m is in your path (used to precompute the
           preview-snippets shown in the search results).

svn: r8836
2008-03-01 13:26:18 +00:00

106 lines
3.6 KiB
Scheme

; TODO: How is @foo to be handled?
; - insert both @foo and foo ?
(module lexer mzscheme
(provide skip-to-next-token
read-token
for-each-token
for-each-token-in-file
document->tokens
token-case-sensitive)
(require (lib "match.ss"))
; When #f, read-token down-cases every token it returns; defaults to
; case-sensitive (#t).
(define token-case-sensitive (make-parameter #t))
; utf-regexp : string -> byte-regexp
; Compiles a regexp over the UTF-8 encoding of str, so matching can be
; done directly on byte ports without decoding.
(define (utf-regexp str)
(byte-regexp (string->bytes/utf-8 str)))
; Regexp fragments (as strings) for the characters that end a token.
(define whitespace "[ \n\r\t]")
; NOTE(review): `brackets` is NOT a complete character class on its own —
; the opening `[` is never closed here.  It only becomes valid after the
; string-append in `delimiter` below, where the following `special`
; fragment ("[\\|]") supplies the closing `]`.  The resulting class
; matches ] [ ( ) { } | — which happens to be the intended set, but be
; careful when editing any of these fragments.
(define brackets "[]\\[\\(\\){}")
(define punctuation "[,'`;]")
(define special "[\\|]")
; One delimiter character: whitespace, punctuation, bracket, bar or ".
(define delimiter (string-append "(" whitespace "|" punctuation "|"
brackets "|" special "|" "\"" ")"))
; Zero or more delimiter characters.
(define delimiter* (string-append delimiter "*"))
; MzScheme's identifier and symbol syntax is considerably more liberal
; than the syntax specified by R5RS. When input is scanned for tokens,
; the following characters delimit an identifier in addition to whitespace:
; " , ' ` ; ( ) [ ] { }
; Matches (and thus consumes) any run of delimiters before a token.
(define non-symbol-starter-regexp (utf-regexp delimiter*))
; In addition, an identifier cannot start with a hash mark (``#'') unless
; the hash mark is immediately followed by a percent sign (``%''). The only
; other special characters are backslash (``\'') and quoting vertical
; bars (``|''); any other character is used as part of an identifier.
; skip-to-next-token : -> void
; Advances (current-input-port) past any run of delimiter characters.
; A `#' that is not immediately followed by `%' cannot begin an
; identifier (only `#%...' names may start with a hash mark), so such a
; `#' is consumed and skipping resumes; a `#%' is left in place for
; read-token to pick up.
(define (skip-to-next-token)
  (regexp-match non-symbol-starter-regexp (current-input-port))
  (when (eqv? (peek-char) #\#)
    (unless (equal? (peek-string 2 0) "#%")
      (read-char)
      (skip-to-next-token))))
; One character that may appear inside a token, i.e. anything except the
; delimiters listed above (the `]' directly after `^' is a literal).
(define non-delimiter "[^]\\[\\(\\){} \n\r\t,'`;\\|\"]")
; A (possibly empty) run of token characters.
(define non-delimiter* (string-append non-delimiter "*"))
; Compiled byte-regexp used by read-token to grab one maximal token.
(define non-delimiter*-regexp (utf-regexp non-delimiter*))
; bytes-downcase : bytes -> bytes
; Lower-cases a byte string by round-tripping it through a UTF-8
; string; bytes that do not decode as UTF-8 are replaced by #\space.
(define (bytes-downcase bs)
  (let ([decoded (bytes->string/utf-8 bs #\space)])
    (string->bytes/utf-8 (string-downcase decoded))))
; read-token : [boolean] -> (list bytes position) | #f
; Reads the next maximal run of non-delimiter bytes from
; (current-input-port) and returns a two-element list of the token and
; the position where reading began; returns #f when the regexp yields
; no match.  The token is down-cased unless (token-case-sensitive).
; With count-lines? true the position is the (line col pos) triple from
; port-next-location; otherwise it is the plain file-position offset.
(define read-token
  (case-lambda
    [() (read-token #f)]
    [(count-lines?)
     (let* ([pos (if count-lines?
                     (let-values ([(line col pos)
                                   (port-next-location (current-input-port))])
                       (list line col pos))
                     (file-position (current-input-port)))]
            [hit (regexp-match non-delimiter*-regexp (current-input-port))])
       (and hit
            ; the regexp has no subgroups, so the whole match is (car hit)
            (let ([tok (car hit)])
              (list (if (token-case-sensitive)
                        tok
                        (bytes-downcase tok))
                    pos))))]))
; for-each-token : (token -> any) [boolean] -> void
; Applies f to every token remaining on (current-input-port), in order.
; count-lines? is passed through to read-token and selects the position
; representation attached to each token (defaults to #f).
(define for-each-token
  (case-lambda
    [(f) (for-each-token f #f)]
    [(f count-lines?)
     (let loop ()
       (unless (eof-object? (peek-char))
         (skip-to-next-token)
         ; skipping may have consumed the rest of the port
         (unless (eof-object? (peek-char))
           (let ([token (read-token count-lines?)])
             (unless token
               (error "internal error: token expected after skipping"))
             (f token)
             (loop)))))]))
; for-each-token-in-file : path (token -> any) [boolean] -> void
; Opens file and applies f to each token in it, in order.  The optional
; count-lines? argument (default #f, preserving the old two-argument
; behaviour) is forwarded to for-each-token; when true, line counting is
; enabled on the port so tokens carry (line col pos) positions — the
; original version always dropped this flag, making it inconsistent
; with document->tokens below.
(define for-each-token-in-file
  (case-lambda
    [(file f)
     (for-each-token-in-file file f #f)]
    [(file f count-lines?)
     (call-with-input-file file
       (lambda (port)
         (when count-lines?
           ; required for port-next-location to report lines/columns
           (port-count-lines! port))
         (parameterize ([current-input-port port])
           (for-each-token f count-lines?))))]))
; document->tokens : path -> (list (list byte-string position))
; Reads file with line counting enabled and returns all of its tokens
; in document order, each paired with its (line col pos) location.
(define (document->tokens file)
  (call-with-input-file file
    (λ (port)
      (port-count-lines! port)
      (parameterize ([current-input-port port])
        (let ([acc '()])
          (for-each-token (λ (token) (set! acc (cons token acc)))
                          #t)
          ; tokens were consed on in reverse; restore document order
          (reverse acc))))))
)