extended syntax colorer to support lexer-specific backup; fix problems with new color lexers

svn: r15617

original commit: d807421a07e3d86b6ebf0802b4000d34af472372
Matthew Flatt 2009-07-29 03:31:29 +00:00
parent 3e64e5f4fb
commit d6b4085433
3 changed files with 65 additions and 31 deletions

View File

@@ -32,9 +32,13 @@ added get-regions
(define (should-color-type? type)
(not (memq type '(white-space no-color))))
- (define (make-data type mode) (cons type mode))
- (define (data-type data) (car data))
- (define (data-lexer-mode data) (cdr data))
+ (define (make-data type mode backup-delta)
+   (if (zero? backup-delta)
+       (cons type mode)
+       (vector type mode backup-delta)))
+ (define (data-type data) (if (pair? data) (car data) (vector-ref data 0)))
+ (define (data-lexer-mode data) (if (pair? data) (cdr data) (vector-ref data 1)))
+ (define (data-backup-delta data) (if (vector? data) (vector-ref data 2) 0))
(define -text<%>
(interface (text:basic<%>)
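
Most tokens never need to back up, so make-data above keeps the old two-slot cons for the zero-delta case and pays for a three-slot vector only when a backup distance is actually recorded. A standalone sketch of that encoding with a usage check (the #lang scheme wrapper and the sample values are illustrative, not part of the commit):

  #lang scheme
  ;; Same accessors as in the diff above, runnable on their own: a
  ;; token's data is a cons when backup-delta is 0, a vector otherwise.
  (define (make-data type mode backup-delta)
    (if (zero? backup-delta)
        (cons type mode)
        (vector type mode backup-delta)))
  (define (data-type data) (if (pair? data) (car data) (vector-ref data 0)))
  (define (data-lexer-mode data) (if (pair? data) (cdr data) (vector-ref data 1)))
  (define (data-backup-delta data) (if (vector? data) (vector-ref data 2) 0))

  (data-backup-delta (make-data 'string #f 0)) ; => 0 (compact cons)
  (data-backup-delta (make-data 'string #f 6)) ; => 6 (vector)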
@@ -274,11 +278,11 @@ added get-regions
(sync-invalid ls))))
(define/private (re-tokenize ls in in-start-pos in-lexer-mode enable-suspend)
- (let-values ([(lexeme type data new-token-start new-token-end new-lexer-mode)
+ (let-values ([(lexeme type data new-token-start new-token-end backup-delta new-lexer-mode)
(begin
(enable-suspend #f)
(begin0
- (get-token in in-lexer-mode)
+ (get-token in in-start-pos in-lexer-mode)
(enable-suspend #t)))])
(unless (eq? 'eof type)
(enable-suspend #f)
@@ -302,7 +306,8 @@ added get-regions
;; version. In other words, the new greatly outweighs the tree
;; operations.
;;(insert-last! tokens (new token-tree% (length len) (data type)))
- (insert-last-spec! (lexer-state-tokens ls) len (make-data type new-lexer-mode))
+ (insert-last-spec! (lexer-state-tokens ls) len (make-data type new-lexer-mode backup-delta))
+ #; (show-tree (lexer-state-tokens ls))
(send (lexer-state-parens ls) add-token data len)
(cond
((and (not (send (lexer-state-invalid-tokens ls) is-empty?))
@@ -320,6 +325,29 @@ added get-regions
(else
(enable-suspend #t)
(re-tokenize ls in in-start-pos new-lexer-mode enable-suspend)))))))
+ (define/private (show-tree t)
+   (printf "Tree:\n")
+   (send t search-min!)
+   (let loop ([old-s -inf.0])
+     (let ([s (send t get-root-start-position)]
+           [e (send t get-root-end-position)])
+       (unless (= s old-s)
+         (printf " ~s\n" (list s e))
+         (send t search! e)
+         (loop s)))))
+
+ (define/private (split-backward ls valid-tree pos)
+   (let loop ([pos pos][valid-tree valid-tree][old-invalid-tree #f])
+     (let-values (((orig-token-start orig-token-end valid-tree invalid-tree orig-data)
+                   (send valid-tree split/data (- pos (lexer-state-start-pos ls)))))
+       (let ([backup-pos (- pos (data-backup-delta orig-data))]
+             [invalid-tree (or old-invalid-tree invalid-tree)])
+         (if (backup-pos . < . pos)
+             ;; back up more:
+             (loop backup-pos valid-tree invalid-tree)
+             ;; that was far enough:
+             (values orig-token-start orig-token-end valid-tree invalid-tree orig-data))))))
(define/private (do-insert/delete/ls ls edit-start-pos change-length)
(unless (lexer-state-up-to-date? ls)
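
split-backward is the heart of the change: instead of splitting the token tree exactly at the edit position, it repeatedly follows the stored backup delta until it reaches a token that needs no backup. A sketch of the same loop over a hypothetical flat token list rather than a token-tree% (all names and positions below are made up for illustration):

  #lang scheme
  ;; Follow backup deltas from the edit position back to a safe
  ;; re-lex point, mimicking split-backward's loop.
  (define-struct tok (start end backup-delta))

  (define (tok-at toks pos) ; token whose region contains pos
    (ormap (lambda (t)
             (and (<= (tok-start t) pos) (< pos (tok-end t)) t))
           toks))

  (define (backward-split-pos toks pos)
    (let ([delta (tok-backup-delta (tok-at toks pos))])
      (if (zero? delta)
          pos                                        ; that was far enough
          (backward-split-pos toks (- pos delta))))) ; back up more

  (define toks (list (make-tok 0 5 0) (make-tok 5 10 5) (make-tok 10 20 10)))
  (backward-split-pos toks 3)  ; => 3  (no backup needed)
  (backward-split-pos toks 12) ; => 2  (12 - 10, then delta 0 at position 2)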
@@ -327,7 +355,7 @@ added get-regions
(cond
((lexer-state-up-to-date? ls)
(let-values (((orig-token-start orig-token-end valid-tree invalid-tree orig-data)
- (send (lexer-state-tokens ls) split/data (- edit-start-pos (lexer-state-start-pos ls)))))
+ (split-backward ls (lexer-state-tokens ls) edit-start-pos)))
(send (lexer-state-parens ls) split-tree orig-token-start)
(set-lexer-state-invalid-tokens! ls invalid-tree)
(set-lexer-state-tokens! ls valid-tree)
@@ -349,8 +377,7 @@ added get-regions
(queue-callback (λ () (colorer-callback)) #f)))
((>= edit-start-pos (lexer-state-invalid-tokens-start ls))
(let-values (((tok-start tok-end valid-tree invalid-tree orig-data)
- (send (lexer-state-invalid-tokens ls) split/data
-      (- edit-start-pos (lexer-state-start-pos ls)))))
+ (split-backward ls (lexer-state-invalid-tokens ls) edit-start-pos)))
(set-lexer-state-invalid-tokens! ls invalid-tree)
(set-lexer-state-invalid-tokens-start!
ls
@@ -362,8 +389,7 @@ added get-regions
(+ change-length (lexer-state-invalid-tokens-start ls))))
(else
(let-values (((tok-start tok-end valid-tree invalid-tree data)
- (send (lexer-state-tokens ls) split/data
-      (- edit-start-pos (lexer-state-start-pos ls)))))
+ (split-backward ls (lexer-state-tokens ls) edit-start-pos)))
(send (lexer-state-parens ls) truncate tok-start)
(set-lexer-state-tokens! ls valid-tree)
(set-lexer-state-invalid-tokens-start! ls (+ change-length (lexer-state-invalid-tokens-start ls)))
@@ -463,14 +489,14 @@ added get-regions
(reset-tokens)
(set! should-color? (preferences:get 'framework:coloring-active))
(set! token-sym->style token-sym->style-)
- (set! get-token (if (procedure-arity-includes? get-token- 2)
+ (set! get-token (if (procedure-arity-includes? get-token- 3)
;; New interface: thread through a mode:
get-token-
- ;; Old interface: no mode
- (lambda (in mode)
+ ;; Old interface: no offset, backup delta, or mode
+ (lambda (in offset mode)
(let-values ([(lexeme type data new-token-start new-token-end)
(get-token- in)])
- (values lexeme type data new-token-start new-token-end #f)))))
+ (values lexeme type data new-token-start new-token-end 0 #f)))))
(set! pairs pairs-)
(for-each
(lambda (ls)
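
The arity test above is the compatibility shim: a lexer written against the new interface takes three arguments, while an old port-only lexer is wrapped so that its five results are padded with a zero backup delta and a #f mode. A sketch of a trivial old-style lexer run through the same wrapper shape (the lexer itself is hypothetical, not framework code):

  #lang scheme
  ;; One-character-per-token lexer in the old 5-value shape.
  (define (old-lexer in)
    (let-values ([(line col pos) (port-next-location in)])
      (let ([c (read-char in)])
        (cond
          [(eof-object? c) (values c 'eof #f pos pos)]
          [(char-whitespace? c) (values (string c) 'white-space #f pos (add1 pos))]
          [else (values (string c) 'symbol #f pos (add1 pos))]))))

  ;; Same padding as the wrapper above: 0 backup delta, #f mode.
  (define (adapt get-token-)
    (lambda (in offset mode)
      (let-values ([(lexeme type data new-token-start new-token-end)
                    (get-token- in)])
        (values lexeme type data new-token-start new-token-end 0 #f))))

  (define in (open-input-string "a b"))
  (port-count-lines! in)
  ((adapt old-lexer) in 0 #f) ; => "a" 'symbol #f 1 2 0 #f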

View File

@@ -1172,14 +1172,14 @@
(preferences:add-callback
'framework:tabify
(lambda (k v) (set! tabify-pref v)))
- (define/private (scheme-lexer-wrapper in mode)
-   (let-values (((lexeme type paren start end mode) (module-lexer in mode)))
+ (define/private (scheme-lexer-wrapper in offset mode)
+   (let-values (((lexeme type paren start end backup-delta mode) (module-lexer in offset mode)))
(cond
((and (eq? type 'symbol)
(get-keyword-type lexeme tabify-pref))
- (values lexeme 'keyword paren start end mode))
+ (values lexeme 'keyword paren start end backup-delta mode))
(else
- (values lexeme type paren start end mode)))))
+ (values lexeme type paren start end backup-delta mode)))))
(define/override (put-file text sup directory default-name)
(parameterize ([finder:default-extension "ss"]
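
scheme-lexer-wrapper above mostly passes the new offset straight through to module-lexer and threads the extra backup-delta result along unchanged, rewriting only the token type for known keywords. A sketch of driving module-lexer by hand with the same calling convention (the sample program text is arbitrary):

  #lang scheme
  (require syntax-color/module-lexer)
  ;; Tokenize a string port, feeding each returned mode back in,
  ;; exactly as the colorer does between calls.
  (define in (open-input-string "#lang scheme (+ 1 2)"))
  (port-count-lines! in)
  (let loop ([mode #f])
    (let-values ([(lexeme type paren start end backup-delta new-mode)
                  (module-lexer in 0 mode)])
      (unless (eq? type 'eof)
        (printf "~a-~a ~s (backup ~a)\n" start end type backup-delta)
        (loop new-mode))))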
@@ -1188,7 +1188,7 @@
;; don't call the surrogate's super, since it sets the default extension
(sup directory default-name)))
- (super-new (get-token (lambda (in mode) (scheme-lexer-wrapper in mode)))
+ (super-new (get-token (lambda (in offset mode) (scheme-lexer-wrapper in offset mode)))
(token-sym->style short-sym->style-name)
(matches '((|(| |)|)
(|[| |]|)

View File

@@ -16,12 +16,14 @@
exact-nonnegative-integer?
exact-nonnegative-integer?))
(-> input-port?
+ exact-nonnegative-integer?
any/c
(values any/c
symbol?
(or/c false? symbol?)
exact-nonnegative-integer?
exact-nonnegative-integer?
+ exact-nonnegative-integer?
any/c))))
(pairs (listof (list/p symbol? symbol?)))) void))]{
Starts tokenizing the buffer for coloring and parenthesis matching.
@@ -29,7 +31,7 @@
The @scheme[token-sym->style] argument will be passed the first return symbol from @scheme[get-token],
and it should return the style-name that the token should be colored.
- The @scheme[get-token] argument takes an input port and optionally a mode value.
+ The @scheme[get-token] argument takes an input port and optionally an offset and mode value.
When it accepts just an input port, @scheme[get-token] returns the next token as 5 values:
@itemize[
@@ -52,15 +54,21 @@
@item{
The ending position of the token.}]
- When @scheme[get-token] accepts a mode value in addition to an
- input port, it must also return an extra result, which is a new
- mode. When @scheme[get-token] is called for the beginning on a
- stream, the mode argument is @scheme[#f]. Thereafter, the mode
+ When @scheme[get-token] accepts an offset and mode value in addition to an
+ input port, it must also return two extra results, which are a backup
+ distance and a new mode. The offset given to @scheme[get-token] can be added
+ to the position of the input port to obtain absolute coordinates within a
+ text stream. The mode argument allows @scheme[get-token] to communicate information
+ from earlier parsing to later parsing.
+ When @scheme[get-token] is called for the beginning of a
+ stream, the mode argument is @scheme[#f]; thereafter, the mode
returned for the previous token is provided to @scheme[get-token]
for the next token. The mode should not be a mutable value; if
part of the stream is re-tokenized, the mode saved from the
immediately preceding token is given again to the
- @scheme[get-token] function.
+ @scheme[get-token] function. The backup distance returned by @scheme[get-token]
+ indicates the maximum number of characters to back up (counting from the start of the token)
+ for re-parsing after a change to the editor within the token's region.
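
Concretely, a new-style @scheme[get-token] is a procedure of three arguments (port, offset, mode) returning seven values. A minimal sketch, with a hypothetical one-character lexer whose mode just counts tokens and whose backup distance is always zero because no token depends on earlier text:

  #lang scheme
  (define (get-token in offset mode)
    (let-values ([(line col pos) (port-next-location in)])
      (let ([c (read-char in)])
        (if (eof-object? c)
            (values c 'eof #f pos pos 0 mode)
            (values (string c)
                    (if (char-whitespace? c) 'white-space 'symbol)
                    #f          ; no paren info for these tokens
                    pos (add1 pos)
                    0           ; backup distance: nothing to re-lex
                    (add1 (or mode 0))))))) ; mode is #f on the first call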
The @scheme[get-token] function is usually implemented with a lexer using the
@scheme[parser-tools/lex] library. The
@@ -68,7 +76,7 @@
@itemize[
@item{
Every position in the buffer must be accounted for in exactly one
- token.}
+ token, and every token must have a non-zero width.}
@item{
The token returned by @scheme[get-token] must rely only on the contents of the
input port argument plus the mode argument. This constraint means that the
@@ -77,16 +85,16 @@
for tokens).}
@item{
A change to the stream must not change the tokenization of the stream prior
- to the token immediately preceding the change. In the following
- example this invariant does not hold. If the buffer contains
+ to the token immediately preceding the change plus the backup distance. In the following
+ example, this invariant does not hold for a zero backup distance: If the buffer contains
@verbatim[#:indent 2]{" 1 2 3}
and the tokenizer treats the unmatched " as its own token (a string
error token), and separately tokenizes the 1 2 and 3, an edit to make
the buffer look like
@verbatim[#:indent 2]{" 1 2 3"}
would result in a single string token modifying previous tokens. To
- handle these situations, @scheme[get-token] must treat the first line as a
- single token.}]
+ handle these situations, @scheme[get-token] can treat the first line as a
+ single token, or it can precisely track backup distances.}]
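
With precise backup tracking, the buffer can instead stay split into small tokens as long as each one records how far back re-lexing must start. A worked sketch with made-up 0-based positions for the example above:

  #lang scheme
  ;; buffer |" 1 2 3|:  " (0-1)  space (1-2)  1 (2-3)  space (3-4)
  ;;                    2 (4-5)  space (5-6)  3 (6-7)
  ;; Every token after the unmatched " backs up to position 0.
  (define tokens ; (start end backup-delta), delta counted from start
    '((0 1 0) (1 2 1) (2 3 2) (3 4 3) (4 5 4) (5 6 5) (6 7 6)))
  ;; Typing the closing " after the 3 re-lexes from the quote onward:
  (let ([tok (assv 6 tokens)])       ; token containing the edit
    (- (car tok) (caddr tok)))       ; => 0, the re-lex start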
The @scheme[pairs] argument is a list of different kinds of matching parens. The second
value returned by @scheme[get-token] is compared to this list to see how the