fix unstable/2d's lexer to handle eof better

original commit: 87a8e6f677d9538001766910ba119dde8066b400
This commit is contained in:
Robby Findler 2013-02-25 10:36:46 -06:00
parent 1b75e51175
commit b49c680dd5
3 changed files with 182 additions and 149 deletions

View File

@ -162,11 +162,9 @@ todo:
;; but that works only when there are no broken ;; but that works only when there are no broken
;; edges of the table that span the place I want to stop. ;; edges of the table that span the place I want to stop.
(define failed (define failed
(with-handlers ((exn:fail:read? (with-handlers ((exn:fail:read? values))
(λ (exn) exn)))
(let loop ([map #f]) (let loop ([map #f])
(define new-map (define new-map
;; this might raise a read exception: what then?
(parse-2dcond-one-step peek-port (object-name peek-port) #f #f pos the-state map)) (parse-2dcond-one-step peek-port (object-name peek-port) #f #f pos the-state map))
(when new-map (when new-map
(loop new-map))))) (loop new-map)))))
@ -177,153 +175,181 @@ todo:
;; no matter how long eol-string is, it counts for 1 position only. ;; no matter how long eol-string is, it counts for 1 position only.
(+ pos (string-length first-tok-string) 1))) (+ pos (string-length first-tok-string) 1)))
(define final-tokens (cond
(cond [(exn:fail:read:eof? failed)
[(exn:fail:read? failed) ;; in this case, the source location for the error
(define error-pos (- (srcloc-position (car (exn:fail:read-srclocs failed))) ;; should be the beginning of the #2d token,
base-position)) ;; account for the newline ;; so we just turn the whole thing red in a single token
(define peek-port2 (peeking-input-port port)) (define tok-string
(port-count-lines! peek-port2) (string-append
first-tok-string
(apply string
(let loop ()
(define c (read-char port))
(cond
[(eof-object? c) '()]
[else (cons c (loop))])))))
(values tok-string 'error #f
pos (+ pos (string-length tok-string))
0
#f)]
[else
(define final-tokens
(cond
[(exn:fail:read? failed)
(define error-pos (- (srcloc-position (car (exn:fail:read-srclocs failed)))
base-position)) ;; account for the newline
(when (< error-pos 0)
(error 'unstable/2d/lexer.rkt "got error-pos < 0: ~s ~s\n"
(srcloc-position (car (exn:fail:read-srclocs failed)))
base-position))
(define peek-port2 (peeking-input-port port))
(port-count-lines! peek-port2)
(define (pull-chars n)
(apply
string
(let loop ([n n])
(cond
[(zero? n) '()]
[else (cons (read-char peek-port2) (loop (- n 1)))]))))
(cond
;; pull the newline out of peek-port2 [else
(for ([x (in-range (string-length eol-string))]) (read-char peek-port2))
(define (pull-chars n) ;; pull the newline out of peek-port2
(apply (for ([x (in-range (string-length eol-string))]) (read-char peek-port2))
string
(let loop ([n n])
(cond
[(zero? n) '()]
[else (cons (read-char peek-port2) (loop (- n 1)))]))))
(define before-token (list (pull-chars error-pos)
'no-color
#f
(+ base-position 1)
(+ base-position 1 error-pos)))
(define end-of-table-approx
(let ([peek-port3 (peeking-input-port peek-port2)])
(port-count-lines! peek-port3)
(let loop ()
(define l (read-line peek-port3))
(define-values (line col pos) (port-next-location peek-port3))
(cond
[(and (string? l)
(regexp-match double-barred-chars-regexp l))
(loop)]
[else pos]))))
(define after-token
(list (pull-chars (- end-of-table-approx 1))
'error
#f
(+ base-position 1 error-pos)
(+ base-position 1 error-pos end-of-table-approx -1)))
(list newline-token before-token after-token)]
[else
(define lhses (close-cell-graph cell-connections (length table-column-breaks) (length rows))) (define before-token (list (pull-chars error-pos)
(define scratch-string (make-string (for/sum ([ss (in-list rows)]) 'no-color
(for/sum ([s (in-list ss)]) #f
(string-length s))) (+ base-position 1)
#\space)) (+ base-position 1 error-pos)))
(define collected-tokens '()) (define end-of-table-approx
(define rows-as-vector (apply vector (reverse rows))) (let ([peek-port3 (peeking-input-port peek-port2)])
(for ([set-of-indicies (in-list (sort (set->list lhses) compare/xy (port-count-lines! peek-port3)
#:key smallest-representative))]) (let loop ()
(define regions (define l (read-line peek-port3))
(fill-scratch-string set-of-indicies (define-values (line col pos) (port-next-location peek-port3))
rows-as-vector (cond
scratch-string [(and (string? l)
table-column-breaks (regexp-match double-barred-chars-regexp l))
initial-space-count (loop)]
#t)) [else pos]))))
(define port (open-input-string scratch-string)) (define after-token
(port-count-lines! port) (list (pull-chars (- end-of-table-approx 1))
(let loop ([mode (2d-lexer-state-chained-state a-2d-lexer-state)]) 'error
(define-values (_1 _2 current-pos) (port-next-location port)) #f
(define-values (tok-str tok paren start end backup new-mode) (+ base-position 1 error-pos)
(uniform-chained-lexer port (+ pos offset) mode)) (+ base-position 1 error-pos end-of-table-approx -1)))
(unless (equal? 'eof tok) (if (zero? error-pos)
(for ([sub-region (in-list (cropped-regions start end regions))]) (list newline-token after-token)
(define start (- (car sub-region) current-pos)) (list newline-token before-token after-token))])]
(define end (- (cdr sub-region) current-pos)) [else
(set! collected-tokens
(cons (list (if (and (string? tok-str)
(< start (string-length tok-str))
(<= end (string-length tok-str)))
(substring tok-str start end)
(list 'strange-token tok-str))
tok
paren
(+ base-position (car sub-region))
(+ base-position (cdr sub-region)))
collected-tokens)))
(loop new-mode))))
(define (collect-double-barred-token pending-start i offset str) (define lhses (close-cell-graph cell-connections (length table-column-breaks) (length rows)))
(when pending-start (define scratch-string (make-string (for/sum ([ss (in-list rows)])
(set! collected-tokens (cons (list (substring str pending-start i) (for/sum ([s (in-list ss)])
'parenthesis (string-length s)))
#f #\space))
(+ base-position offset pending-start) (define collected-tokens '())
(+ base-position offset i)) (define rows-as-vector (apply vector (reverse rows)))
collected-tokens)))) (for ([set-of-indicies (in-list (sort (set->list lhses) compare/xy
#:key smallest-representative))])
(define regions
(fill-scratch-string set-of-indicies
rows-as-vector
scratch-string
table-column-breaks
initial-space-count
#t))
(define port (open-input-string scratch-string))
(port-count-lines! port)
(let loop ([mode (2d-lexer-state-chained-state a-2d-lexer-state)])
(define-values (_1 _2 current-pos) (port-next-location port))
(define-values (tok-str tok paren start end backup new-mode)
(uniform-chained-lexer port (+ pos offset) mode))
(unless (equal? 'eof tok)
(for ([sub-region (in-list (cropped-regions start end regions))])
(define start (- (car sub-region) current-pos))
(define end (- (cdr sub-region) current-pos))
(set! collected-tokens
(cons (list (if (and (string? tok-str)
(< start (string-length tok-str))
(<= end (string-length tok-str)))
(substring tok-str start end)
(list 'strange-token tok-str))
tok
paren
(+ base-position (car sub-region))
(+ base-position (cdr sub-region)))
collected-tokens)))
(loop new-mode))))
(for/fold ([offset 1]) ([strs (in-list (reverse (cons (list current-line) rows)))]) (define (collect-double-barred-token pending-start i offset str)
(for/fold ([offset offset]) ([str (in-list strs)]) (when pending-start
(let loop ([i 0] (set! collected-tokens (cons (list (substring str pending-start i)
[pending-start #f]) 'parenthesis
(cond #f
[(< i (string-length str)) (+ base-position offset pending-start)
(define c (string-ref str i)) (+ base-position offset i))
collected-tokens))))
(for/fold ([offset 1]) ([strs (in-list (reverse (cons (list current-line) rows)))])
(for/fold ([offset offset]) ([str (in-list strs)])
(let loop ([i 0]
[pending-start #f])
(cond (cond
[(member c double-barred-chars) [(< i (string-length str))
(loop (+ i 1) (define c (string-ref str i))
(if pending-start pending-start i))] (cond
[(member c double-barred-chars)
(loop (+ i 1)
(if pending-start pending-start i))]
[else
(collect-double-barred-token pending-start i offset str)
(loop (+ i 1) #f)])]
[else [else
(collect-double-barred-token pending-start i offset str) (collect-double-barred-token pending-start i offset str)]))
(loop (+ i 1) #f)])] (+ (string-length str) offset)))
[else
(collect-double-barred-token pending-start i offset str)]))
(+ (string-length str) offset)))
(define sorted-tokens (sort collected-tokens < (define sorted-tokens (sort collected-tokens <
#:key (λ (x) (list-ref x 3)))) #:key (λ (x) (list-ref x 3))))
;; there will be gaps that correspond to the places outside of the ;; there will be gaps that correspond to the places outside of the
;; outermost rectangle (at a minimum, newlines); this fills those ;; outermost rectangle (at a minimum, newlines); this fills those
;; in with whitespace tokens ;; in with whitespace tokens
;; NOTE: this code does not deal properly with \r\n newline combinations ;; NOTE: this code does not deal properly with \r\n newline combinations
(define cracks-filled-in-tokens (define cracks-filled-in-tokens
(let loop ([fst newline-token] (let loop ([fst newline-token]
[tokens sorted-tokens]) [tokens sorted-tokens])
(cond
[(null? tokens) (list fst)]
[else
(define snd (car tokens))
(cond (cond
[(= (list-ref fst 4) [(null? tokens) (list fst)]
(list-ref snd 3))
(cons fst (loop snd (cdr tokens)))]
[else [else
(define new-start (list-ref fst 4)) (define snd (car tokens))
(define new-end (list-ref snd 3)) (cond
(list* fst [(= (list-ref fst 4)
(list (list-ref snd 3))
; these are not the real characters ... (cons fst (loop snd (cdr tokens)))]
(make-string (- new-end new-start) #\space) [else
'white-space (define new-start (list-ref fst 4))
#f (define new-end (list-ref snd 3))
new-start (list* fst
new-end) (list
(loop snd (cdr tokens)))])]))) ; these are not the real characters ...
cracks-filled-in-tokens])) (make-string (- new-end new-start) #\space)
'white-space
#f
new-start
new-end)
(loop snd (cdr tokens)))])])))
cracks-filled-in-tokens]))
(values first-tok-string 'hash-colon-keyword #f (values first-tok-string 'hash-colon-keyword #f
pos (+ pos (string-length first-tok-string)) pos (+ pos (string-length first-tok-string))
0 0
(2d-lexer-state final-tokens (2d-lexer-state final-tokens
#t #t
(2d-lexer-state-chained-state a-2d-lexer-state)))])) (2d-lexer-state-chained-state a-2d-lexer-state)))])]))
(define (cropped-regions start end regions) (define (cropped-regions start end regions)
(define result-regions '()) (define result-regions '())

View File

@ -314,9 +314,9 @@ example uses:
(cond (cond
[(eof-object? c) [(eof-object? c)
(raise-read-eof-error (raise-read-eof-error
"expected eof; " "unexpected eof; "
source _line _col _pos source _line _col _pos
(and _pos (- _pos (+ current-line-start-position chars-read))))] (and _pos (- (+ current-line-start-position chars-read) _pos)))]
[(equal? c #\return) [(equal? c #\return)
(cond (cond
[(equal? #\newline (peek-char port)) [(equal? #\newline (peek-char port))

View File

@ -31,7 +31,14 @@ example uses:
(case-lambda (case-lambda
[(char port) [(char port)
(define-values (line col pos) (port-next-location port)) (define-values (line col pos) (port-next-location port))
(dispatch-proc char port #f line col pos read/recursive previous-readtable)]
;; the "-2"s here are because the initial line and column
;; are supposed to be at the beginning of the thing read, not
;; after the "#2" has been consumed.
(dispatch-proc char port #f line
(and col (- col 2))
(and pos (- pos 2))
read/recursive previous-readtable)]
[(char port source _line _col _pos) [(char port source _line _col _pos)
(dispatch-proc char port source _line _col _pos (dispatch-proc char port source _line _col _pos
(λ (a b c) (read-syntax/recursive source a b c)) (λ (a b c) (read-syntax/recursive source a b c))