fix unstable/2d's lexer to handle eof better

original commit: 87a8e6f677d9538001766910ba119dde8066b400
This commit is contained in:
Robby Findler 2013-02-25 10:36:46 -06:00
parent 1b75e51175
commit b49c680dd5
3 changed files with 182 additions and 149 deletions

View File

@ -156,17 +156,15 @@ todo:
(define the-state (make-state line pos (string-length first-tok-string)))
(setup-state the-state)
;; would like to be able to stop this loop
;; and process only part of the table,
;; but that works only when there are no broken
;; edges of the table that span the place I want to stop.
(define failed
(with-handlers ((exn:fail:read?
(λ (exn) exn)))
(with-handlers ((exn:fail:read? values))
(let loop ([map #f])
(define new-map
;; this might raise a read exception: what then?
(parse-2dcond-one-step peek-port (object-name peek-port) #f #f pos the-state map))
(when new-map
(loop new-map)))))
@ -177,153 +175,181 @@ todo:
;; no matter how long eol-string is, it counts for 1 position only.
(+ pos (string-length first-tok-string) 1)))
(define final-tokens
(cond
[(exn:fail:read? failed)
(define error-pos (- (srcloc-position (car (exn:fail:read-srclocs failed)))
base-position)) ;; account for the newline
(define peek-port2 (peeking-input-port port))
(port-count-lines! peek-port2)
;; pull the newline out of peek-port2
(for ([x (in-range (string-length eol-string))]) (read-char peek-port2))
(define (pull-chars n)
(apply
string
(let loop ([n n])
(cond
[(zero? n) '()]
[else (cons (read-char peek-port2) (loop (- n 1)))]))))
(define before-token (list (pull-chars error-pos)
'no-color
#f
(+ base-position 1)
(+ base-position 1 error-pos)))
(define end-of-table-approx
(let ([peek-port3 (peeking-input-port peek-port2)])
(port-count-lines! peek-port3)
(let loop ()
(define l (read-line peek-port3))
(define-values (line col pos) (port-next-location peek-port3))
(cond
[(and (string? l)
(regexp-match double-barred-chars-regexp l))
(loop)]
[else pos]))))
(define after-token
(list (pull-chars (- end-of-table-approx 1))
'error
#f
(+ base-position 1 error-pos)
(+ base-position 1 error-pos end-of-table-approx -1)))
(list newline-token before-token after-token)]
[else
(define lhses (close-cell-graph cell-connections (length table-column-breaks) (length rows)))
(define scratch-string (make-string (for/sum ([ss (in-list rows)])
(for/sum ([s (in-list ss)])
(string-length s)))
#\space))
(define collected-tokens '())
(define rows-as-vector (apply vector (reverse rows)))
(for ([set-of-indicies (in-list (sort (set->list lhses) compare/xy
#:key smallest-representative))])
(define regions
(fill-scratch-string set-of-indicies
rows-as-vector
scratch-string
table-column-breaks
initial-space-count
#t))
(define port (open-input-string scratch-string))
(port-count-lines! port)
(let loop ([mode (2d-lexer-state-chained-state a-2d-lexer-state)])
(define-values (_1 _2 current-pos) (port-next-location port))
(define-values (tok-str tok paren start end backup new-mode)
(uniform-chained-lexer port (+ pos offset) mode))
(unless (equal? 'eof tok)
(for ([sub-region (in-list (cropped-regions start end regions))])
(define start (- (car sub-region) current-pos))
(define end (- (cdr sub-region) current-pos))
(set! collected-tokens
(cons (list (if (and (string? tok-str)
(< start (string-length tok-str))
(<= end (string-length tok-str)))
(substring tok-str start end)
(list 'strange-token tok-str))
tok
paren
(+ base-position (car sub-region))
(+ base-position (cdr sub-region)))
collected-tokens)))
(loop new-mode))))
(define (collect-double-barred-token pending-start i offset str)
(when pending-start
(set! collected-tokens (cons (list (substring str pending-start i)
'parenthesis
#f
(+ base-position offset pending-start)
(+ base-position offset i))
collected-tokens))))
(for/fold ([offset 1]) ([strs (in-list (reverse (cons (list current-line) rows)))])
(for/fold ([offset offset]) ([str (in-list strs)])
(let loop ([i 0]
[pending-start #f])
(cond
[(< i (string-length str))
(define c (string-ref str i))
(cond
[(exn:fail:read:eof? failed)
;; in this case, the source location for the error
;; should be the beginning of the #2d token,
;; so we just turn the whole thing red in a single token
(define tok-string
(string-append
first-tok-string
(apply string
(let loop ()
(define c (read-char port))
(cond
[(eof-object? c) '()]
[else (cons c (loop))])))))
(values tok-string 'error #f
pos (+ pos (string-length tok-string))
0
#f)]
[else
(define final-tokens
(cond
[(exn:fail:read? failed)
(define error-pos (- (srcloc-position (car (exn:fail:read-srclocs failed)))
base-position)) ;; account for the newline
(when (< error-pos 0)
(error 'unstable/2d/lexer.rkt "got error-pos < 0: ~s ~s\n"
(srcloc-position (car (exn:fail:read-srclocs failed)))
base-position))
(define peek-port2 (peeking-input-port port))
(port-count-lines! peek-port2)
(define (pull-chars n)
(apply
string
(let loop ([n n])
(cond
[(zero? n) '()]
[else (cons (read-char peek-port2) (loop (- n 1)))]))))
(cond
[else
;; pull the newline out of peek-port2
(for ([x (in-range (string-length eol-string))]) (read-char peek-port2))
(define before-token (list (pull-chars error-pos)
'no-color
#f
(+ base-position 1)
(+ base-position 1 error-pos)))
(define end-of-table-approx
(let ([peek-port3 (peeking-input-port peek-port2)])
(port-count-lines! peek-port3)
(let loop ()
(define l (read-line peek-port3))
(define-values (line col pos) (port-next-location peek-port3))
(cond
[(and (string? l)
(regexp-match double-barred-chars-regexp l))
(loop)]
[else pos]))))
(define after-token
(list (pull-chars (- end-of-table-approx 1))
'error
#f
(+ base-position 1 error-pos)
(+ base-position 1 error-pos end-of-table-approx -1)))
(if (zero? error-pos)
(list newline-token after-token)
(list newline-token before-token after-token))])]
[else
(define lhses (close-cell-graph cell-connections (length table-column-breaks) (length rows)))
(define scratch-string (make-string (for/sum ([ss (in-list rows)])
(for/sum ([s (in-list ss)])
(string-length s)))
#\space))
(define collected-tokens '())
(define rows-as-vector (apply vector (reverse rows)))
(for ([set-of-indicies (in-list (sort (set->list lhses) compare/xy
#:key smallest-representative))])
(define regions
(fill-scratch-string set-of-indicies
rows-as-vector
scratch-string
table-column-breaks
initial-space-count
#t))
(define port (open-input-string scratch-string))
(port-count-lines! port)
(let loop ([mode (2d-lexer-state-chained-state a-2d-lexer-state)])
(define-values (_1 _2 current-pos) (port-next-location port))
(define-values (tok-str tok paren start end backup new-mode)
(uniform-chained-lexer port (+ pos offset) mode))
(unless (equal? 'eof tok)
(for ([sub-region (in-list (cropped-regions start end regions))])
(define start (- (car sub-region) current-pos))
(define end (- (cdr sub-region) current-pos))
(set! collected-tokens
(cons (list (if (and (string? tok-str)
(< start (string-length tok-str))
(<= end (string-length tok-str)))
(substring tok-str start end)
(list 'strange-token tok-str))
tok
paren
(+ base-position (car sub-region))
(+ base-position (cdr sub-region)))
collected-tokens)))
(loop new-mode))))
(define (collect-double-barred-token pending-start i offset str)
(when pending-start
(set! collected-tokens (cons (list (substring str pending-start i)
'parenthesis
#f
(+ base-position offset pending-start)
(+ base-position offset i))
collected-tokens))))
(for/fold ([offset 1]) ([strs (in-list (reverse (cons (list current-line) rows)))])
(for/fold ([offset offset]) ([str (in-list strs)])
(let loop ([i 0]
[pending-start #f])
(cond
[(member c double-barred-chars)
(loop (+ i 1)
(if pending-start pending-start i))]
[(< i (string-length str))
(define c (string-ref str i))
(cond
[(member c double-barred-chars)
(loop (+ i 1)
(if pending-start pending-start i))]
[else
(collect-double-barred-token pending-start i offset str)
(loop (+ i 1) #f)])]
[else
(collect-double-barred-token pending-start i offset str)
(loop (+ i 1) #f)])]
[else
(collect-double-barred-token pending-start i offset str)]))
(+ (string-length str) offset)))
(define sorted-tokens (sort collected-tokens <
#:key (λ (x) (list-ref x 3))))
;; there will be gaps that correspond to the places outside of the
;; outermost rectangle (at a minimum, newlines); this fills those
;; in with whitespace tokens
;; NOTE: this code does not deal properly with \r\n newline combinations
(define cracks-filled-in-tokens
(let loop ([fst newline-token]
[tokens sorted-tokens])
(cond
[(null? tokens) (list fst)]
[else
(define snd (car tokens))
(collect-double-barred-token pending-start i offset str)]))
(+ (string-length str) offset)))
(define sorted-tokens (sort collected-tokens <
#:key (λ (x) (list-ref x 3))))
;; there will be gaps that correspond to the places outside of the
;; outermost rectangle (at a minimum, newlines); this fills those
;; in with whitespace tokens
;; NOTE: this code does not deal properly with \r\n newline combinations
(define cracks-filled-in-tokens
(let loop ([fst newline-token]
[tokens sorted-tokens])
(cond
[(= (list-ref fst 4)
(list-ref snd 3))
(cons fst (loop snd (cdr tokens)))]
[(null? tokens) (list fst)]
[else
(define new-start (list-ref fst 4))
(define new-end (list-ref snd 3))
(list* fst
(list
; these are not the real characters ...
(make-string (- new-end new-start) #\space)
'white-space
#f
new-start
new-end)
(loop snd (cdr tokens)))])])))
cracks-filled-in-tokens]))
(define snd (car tokens))
(cond
[(= (list-ref fst 4)
(list-ref snd 3))
(cons fst (loop snd (cdr tokens)))]
[else
(define new-start (list-ref fst 4))
(define new-end (list-ref snd 3))
(list* fst
(list
; these are not the real characters ...
(make-string (- new-end new-start) #\space)
'white-space
#f
new-start
new-end)
(loop snd (cdr tokens)))])])))
cracks-filled-in-tokens]))
(values first-tok-string 'hash-colon-keyword #f
pos (+ pos (string-length first-tok-string))
0
(2d-lexer-state final-tokens
#t
(2d-lexer-state-chained-state a-2d-lexer-state)))]))
(values first-tok-string 'hash-colon-keyword #f
pos (+ pos (string-length first-tok-string))
0
(2d-lexer-state final-tokens
#t
(2d-lexer-state-chained-state a-2d-lexer-state)))])]))
(define (cropped-regions start end regions)
(define result-regions '())

View File

@ -314,9 +314,9 @@ example uses:
(cond
[(eof-object? c)
(raise-read-eof-error
"expected eof; "
"unexpected eof; "
source _line _col _pos
(and _pos (- _pos (+ current-line-start-position chars-read))))]
(and _pos (- (+ current-line-start-position chars-read) _pos)))]
[(equal? c #\return)
(cond
[(equal? #\newline (peek-char port))

View File

@ -31,7 +31,14 @@ example uses:
(case-lambda
[(char port)
(define-values (line col pos) (port-next-location port))
(dispatch-proc char port #f line col pos read/recursive previous-readtable)]
;; the "-2"s here are because the initial line and column
;; are supposed be at the beginning of the thing read, not
;; after the "#2" has been consumed.
(dispatch-proc char port #f line
(and col (- col 2))
(and pos (- pos 2))
read/recursive previous-readtable)]
[(char port source _line _col _pos)
(dispatch-proc char port source _line _col _pos
(λ (a b c) (read-syntax/recursive source a b c))