From 2fc1f84f5b6647e22359c57cc695e6dbeed7085a Mon Sep 17 00:00:00 2001
From: Jay McCarthy
Date: Fri, 18 Jan 2013 06:14:40 -0700
Subject: [PATCH] Removing optimizations from sgml-reader that do not work for UTF-8

---
Review notes (commentary only; the text between the three-dash line and
the start of the diff is not part of the commit message or of the diff):

* Adds lex-name*, which peeks one character at a time from the port,
  consumes and accumulates it while name-char? accepts it, and returns
  the accumulated string.
* lex-name now calls string-downcase unconditionally; the removed code
  only downcased when the matched text contained a character in [A-Z].
* lex-name/case-sensitive turns the lex-name* result into a symbol
  without changing its case.
* Both functions previously matched #rx"^[a-zA-Z_:0-9&.-]*" against the
  port and converted the matched bytes with bytes->string/utf-8.
* NOTE(review): name-char? is defined outside this patch. Confirm that it
  returns #f for the eof object that peek-char produces at end of input,
  and check whether it accepts the same characters as the removed regexp
  (which included "&").
* NOTE(review): line breaks and indentation here were reconstructed from
  a whitespace-collapsed copy of the patch; the whitespace of context
  lines may differ from upstream -- verify before applying.

 collects/html/sgml-reader.rkt | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/collects/html/sgml-reader.rkt b/collects/html/sgml-reader.rkt
index d5348e0569..c20289cea9 100644
--- a/collects/html/sgml-reader.rkt
+++ b/collects/html/sgml-reader.rkt
@@ -315,19 +315,24 @@
                    (list->string data))))
 |#
 
-
+(define (lex-name* in)
+  (define os (open-output-string))
+  (let loop ()
+    (define ch (peek-char in))
+    (when (name-char? ch)
+      (read-char in)
+      (display ch os)
+      (loop)))
+  (get-output-string os))
 ;; lex-name : Input-port -> Symbol
 (define (lex-name in)
-  (let ([s (bytes->string/utf-8 (car (regexp-match #rx"^[a-zA-Z_:0-9&.-]*" in)))])
-    (string->symbol
-     ;; Common case: string is already lowercased
-     (if (regexp-match-positions #rx"[A-Z]" s)
-         (string-downcase s)
-         s))))
+  (string->symbol
+   (string-downcase
+    (lex-name* in))))
 ;; lex-name/case-sensitive : Input-port -> Symbol
 (define (lex-name/case-sensitive in)
-  (let ([s (bytes->string/utf-8 (car (regexp-match #rx"^[a-zA-Z_:0-9&.-]*" in)))])
-    (string->symbol s)))
+  (string->symbol
+   (lex-name* in)))
 #|
 (define (lex-name in)
   (string->symbol