fix unstable/2d's lexer to handle eof better

original commit: 87a8e6f677d9538001766910ba119dde8066b400
This commit is contained in:
Robby Findler 2013-02-25 10:36:46 -06:00
parent 1b75e51175
commit b49c680dd5
3 changed files with 182 additions and 149 deletions

View File

@ -156,17 +156,15 @@ todo:
(define the-state (make-state line pos (string-length first-tok-string))) (define the-state (make-state line pos (string-length first-tok-string)))
(setup-state the-state) (setup-state the-state)
;; would like to be able to stop this loop ;; would like to be able to stop this loop
;; and process only part of the table, ;; and process only part of the table,
;; but that works only when there are no broken ;; but that works only when there are no broken
;; edges of the table that span the place I want to stop. ;; edges of the table that span the place I want to stop.
(define failed (define failed
(with-handlers ((exn:fail:read? (with-handlers ((exn:fail:read? values))
(λ (exn) exn)))
(let loop ([map #f]) (let loop ([map #f])
(define new-map (define new-map
;; this might raise a read exception: what then?
(parse-2dcond-one-step peek-port (object-name peek-port) #f #f pos the-state map)) (parse-2dcond-one-step peek-port (object-name peek-port) #f #f pos the-state map))
(when new-map (when new-map
(loop new-map))))) (loop new-map)))))
@ -177,153 +175,181 @@ todo:
;; no matter how long eol-string is, it counts for 1 position only. ;; no matter how long eol-string is, it counts for 1 position only.
(+ pos (string-length first-tok-string) 1))) (+ pos (string-length first-tok-string) 1)))
(define final-tokens (cond
(cond [(exn:fail:read:eof? failed)
[(exn:fail:read? failed) ;; in this case, the source location for the error
(define error-pos (- (srcloc-position (car (exn:fail:read-srclocs failed))) ;; should be the beginning of the #2d token,
base-position)) ;; account for the newline ;; so we just turn the whole thing red in a single token
(define peek-port2 (peeking-input-port port)) (define tok-string
(port-count-lines! peek-port2) (string-append
first-tok-string
;; pull the newline out of peek-port2 (apply string
(for ([x (in-range (string-length eol-string))]) (read-char peek-port2)) (let loop ()
(define c (read-char port))
(define (pull-chars n) (cond
(apply [(eof-object? c) '()]
string [else (cons c (loop))])))))
(let loop ([n n]) (values tok-string 'error #f
(cond pos (+ pos (string-length tok-string))
[(zero? n) '()] 0
[else (cons (read-char peek-port2) (loop (- n 1)))])))) #f)]
(define before-token (list (pull-chars error-pos) [else
'no-color (define final-tokens
#f (cond
(+ base-position 1) [(exn:fail:read? failed)
(+ base-position 1 error-pos))) (define error-pos (- (srcloc-position (car (exn:fail:read-srclocs failed)))
(define end-of-table-approx base-position)) ;; account for the newline
(let ([peek-port3 (peeking-input-port peek-port2)]) (when (< error-pos 0)
(port-count-lines! peek-port3) (error 'unstable/2d/lexer.rkt "got error-pos < 0: ~s ~s\n"
(let loop () (srcloc-position (car (exn:fail:read-srclocs failed)))
(define l (read-line peek-port3)) base-position))
(define-values (line col pos) (port-next-location peek-port3)) (define peek-port2 (peeking-input-port port))
(cond (port-count-lines! peek-port2)
[(and (string? l) (define (pull-chars n)
(regexp-match double-barred-chars-regexp l)) (apply
(loop)] string
[else pos])))) (let loop ([n n])
(define after-token (cond
(list (pull-chars (- end-of-table-approx 1)) [(zero? n) '()]
'error [else (cons (read-char peek-port2) (loop (- n 1)))]))))
#f (cond
(+ base-position 1 error-pos)
(+ base-position 1 error-pos end-of-table-approx -1))) [else
(list newline-token before-token after-token)]
[else ;; pull the newline out of peek-port2
(for ([x (in-range (string-length eol-string))]) (read-char peek-port2))
(define lhses (close-cell-graph cell-connections (length table-column-breaks) (length rows)))
(define scratch-string (make-string (for/sum ([ss (in-list rows)]) (define before-token (list (pull-chars error-pos)
(for/sum ([s (in-list ss)]) 'no-color
(string-length s))) #f
#\space)) (+ base-position 1)
(define collected-tokens '()) (+ base-position 1 error-pos)))
(define rows-as-vector (apply vector (reverse rows))) (define end-of-table-approx
(for ([set-of-indicies (in-list (sort (set->list lhses) compare/xy (let ([peek-port3 (peeking-input-port peek-port2)])
#:key smallest-representative))]) (port-count-lines! peek-port3)
(define regions (let loop ()
(fill-scratch-string set-of-indicies (define l (read-line peek-port3))
rows-as-vector (define-values (line col pos) (port-next-location peek-port3))
scratch-string (cond
table-column-breaks [(and (string? l)
initial-space-count (regexp-match double-barred-chars-regexp l))
#t)) (loop)]
(define port (open-input-string scratch-string)) [else pos]))))
(port-count-lines! port) (define after-token
(let loop ([mode (2d-lexer-state-chained-state a-2d-lexer-state)]) (list (pull-chars (- end-of-table-approx 1))
(define-values (_1 _2 current-pos) (port-next-location port)) 'error
(define-values (tok-str tok paren start end backup new-mode) #f
(uniform-chained-lexer port (+ pos offset) mode)) (+ base-position 1 error-pos)
(unless (equal? 'eof tok) (+ base-position 1 error-pos end-of-table-approx -1)))
(for ([sub-region (in-list (cropped-regions start end regions))]) (if (zero? error-pos)
(define start (- (car sub-region) current-pos)) (list newline-token after-token)
(define end (- (cdr sub-region) current-pos)) (list newline-token before-token after-token))])]
(set! collected-tokens [else
(cons (list (if (and (string? tok-str)
(< start (string-length tok-str)) (define lhses (close-cell-graph cell-connections (length table-column-breaks) (length rows)))
(<= end (string-length tok-str))) (define scratch-string (make-string (for/sum ([ss (in-list rows)])
(substring tok-str start end) (for/sum ([s (in-list ss)])
(list 'strange-token tok-str)) (string-length s)))
tok #\space))
paren (define collected-tokens '())
(+ base-position (car sub-region)) (define rows-as-vector (apply vector (reverse rows)))
(+ base-position (cdr sub-region))) (for ([set-of-indicies (in-list (sort (set->list lhses) compare/xy
collected-tokens))) #:key smallest-representative))])
(loop new-mode)))) (define regions
(fill-scratch-string set-of-indicies
(define (collect-double-barred-token pending-start i offset str) rows-as-vector
(when pending-start scratch-string
(set! collected-tokens (cons (list (substring str pending-start i) table-column-breaks
'parenthesis initial-space-count
#f #t))
(+ base-position offset pending-start) (define port (open-input-string scratch-string))
(+ base-position offset i)) (port-count-lines! port)
collected-tokens)))) (let loop ([mode (2d-lexer-state-chained-state a-2d-lexer-state)])
(define-values (_1 _2 current-pos) (port-next-location port))
(for/fold ([offset 1]) ([strs (in-list (reverse (cons (list current-line) rows)))]) (define-values (tok-str tok paren start end backup new-mode)
(for/fold ([offset offset]) ([str (in-list strs)]) (uniform-chained-lexer port (+ pos offset) mode))
(let loop ([i 0] (unless (equal? 'eof tok)
[pending-start #f]) (for ([sub-region (in-list (cropped-regions start end regions))])
(cond (define start (- (car sub-region) current-pos))
[(< i (string-length str)) (define end (- (cdr sub-region) current-pos))
(define c (string-ref str i)) (set! collected-tokens
(cons (list (if (and (string? tok-str)
(< start (string-length tok-str))
(<= end (string-length tok-str)))
(substring tok-str start end)
(list 'strange-token tok-str))
tok
paren
(+ base-position (car sub-region))
(+ base-position (cdr sub-region)))
collected-tokens)))
(loop new-mode))))
(define (collect-double-barred-token pending-start i offset str)
(when pending-start
(set! collected-tokens (cons (list (substring str pending-start i)
'parenthesis
#f
(+ base-position offset pending-start)
(+ base-position offset i))
collected-tokens))))
(for/fold ([offset 1]) ([strs (in-list (reverse (cons (list current-line) rows)))])
(for/fold ([offset offset]) ([str (in-list strs)])
(let loop ([i 0]
[pending-start #f])
(cond (cond
[(member c double-barred-chars) [(< i (string-length str))
(loop (+ i 1) (define c (string-ref str i))
(if pending-start pending-start i))] (cond
[(member c double-barred-chars)
(loop (+ i 1)
(if pending-start pending-start i))]
[else
(collect-double-barred-token pending-start i offset str)
(loop (+ i 1) #f)])]
[else [else
(collect-double-barred-token pending-start i offset str) (collect-double-barred-token pending-start i offset str)]))
(loop (+ i 1) #f)])] (+ (string-length str) offset)))
[else
(collect-double-barred-token pending-start i offset str)])) (define sorted-tokens (sort collected-tokens <
(+ (string-length str) offset))) #:key (λ (x) (list-ref x 3))))
(define sorted-tokens (sort collected-tokens < ;; there will be gaps that correspond to the places outside of the
#:key (λ (x) (list-ref x 3)))) ;; outermost rectangle (at a minimum, newlines); this fills those
;; in with whitespace tokens
;; there will be gaps that correspond to the places outside of the ;; NOTE: this code does not deal properly with \r\n newline combinations
;; outermost rectangle (at a minimum, newlines); this fills those (define cracks-filled-in-tokens
;; in with whitespace tokens (let loop ([fst newline-token]
;; NOTE: this code does not deal properly with \r\n newline combinations [tokens sorted-tokens])
(define cracks-filled-in-tokens
(let loop ([fst newline-token]
[tokens sorted-tokens])
(cond
[(null? tokens) (list fst)]
[else
(define snd (car tokens))
(cond (cond
[(= (list-ref fst 4) [(null? tokens) (list fst)]
(list-ref snd 3))
(cons fst (loop snd (cdr tokens)))]
[else [else
(define new-start (list-ref fst 4)) (define snd (car tokens))
(define new-end (list-ref snd 3)) (cond
(list* fst [(= (list-ref fst 4)
(list (list-ref snd 3))
; these are not the real characters ... (cons fst (loop snd (cdr tokens)))]
(make-string (- new-end new-start) #\space) [else
'white-space (define new-start (list-ref fst 4))
#f (define new-end (list-ref snd 3))
new-start (list* fst
new-end) (list
(loop snd (cdr tokens)))])]))) ; these are not the real characters ...
cracks-filled-in-tokens])) (make-string (- new-end new-start) #\space)
'white-space
#f
new-start
new-end)
(loop snd (cdr tokens)))])])))
cracks-filled-in-tokens]))
(values first-tok-string 'hash-colon-keyword #f (values first-tok-string 'hash-colon-keyword #f
pos (+ pos (string-length first-tok-string)) pos (+ pos (string-length first-tok-string))
0 0
(2d-lexer-state final-tokens (2d-lexer-state final-tokens
#t #t
(2d-lexer-state-chained-state a-2d-lexer-state)))])) (2d-lexer-state-chained-state a-2d-lexer-state)))])]))
(define (cropped-regions start end regions) (define (cropped-regions start end regions)
(define result-regions '()) (define result-regions '())

View File

@ -314,9 +314,9 @@ example uses:
(cond (cond
[(eof-object? c) [(eof-object? c)
(raise-read-eof-error (raise-read-eof-error
"expected eof; " "unexpected eof; "
source _line _col _pos source _line _col _pos
(and _pos (- _pos (+ current-line-start-position chars-read))))] (and _pos (- (+ current-line-start-position chars-read) _pos)))]
[(equal? c #\return) [(equal? c #\return)
(cond (cond
[(equal? #\newline (peek-char port)) [(equal? #\newline (peek-char port))

View File

@ -31,7 +31,14 @@ example uses:
(case-lambda (case-lambda
[(char port) [(char port)
(define-values (line col pos) (port-next-location port)) (define-values (line col pos) (port-next-location port))
(dispatch-proc char port #f line col pos read/recursive previous-readtable)]
;; the "-2"s here are because the initial line and column
;; are supposed be at the beginning of the thing read, not
;; after the "#2" has been consumed.
(dispatch-proc char port #f line
(and col (- col 2))
(and pos (- pos 2))
read/recursive previous-readtable)]
[(char port source _line _col _pos) [(char port source _line _col _pos)
(dispatch-proc char port source _line _col _pos (dispatch-proc char port source _line _col _pos
(λ (a b c) (read-syntax/recursive source a b c)) (λ (a b c) (read-syntax/recursive source a b c))