Windows: Unicode I/O for console ports

Use WriteConsoleW and ReadConsoleW to support Unicode input and output
for a console independent of its code page.

Closes #2087
This commit is contained in:
Matthew Flatt 2018-05-20 07:46:20 -06:00
parent 4d04a77cb3
commit f5f4e6cf03
2 changed files with 233 additions and 6 deletions

View File

@ -0,0 +1,60 @@
#lang racket/base
;; This test is intended for checking Unicode output to a Windows
;; console --- because that requires yet another decoding/encoding
;; layer within rktio to use WriteConsoleW --- so its output is
;; intended to be inspected directly.
;; Total number of rows in each drawn box, including the top and
;; bottom border rows.
(define H 100)
;; Total number of characters in each row, including the left and
;; right border characters.
(define W 40)
;; Upper bound on the number of bytes written between flushes when
;; the captured UTF-8 output is replayed piecewise.
(define MAX-FLUSH-SPAN 10)
;; go : char -> void
;; Draws a W-by-H box whose border uses double-line box-drawing
;; characters and whose interior is filled with `content-char`.
;; The box is drawn twice: once by displaying characters directly on
;; the current output port, and once by replaying the UTF-8 encoding
;; of the same drawing in small, randomly sized byte chunks with a
;; flush after each chunk.
(define (go content-char)
  ;; Border characters
  (define top-left #\u2554)
  (define horiz #\u2550)
  (define top-right #\u2557)
  (define vert #\u2551)
  (define bottom-left #\u255A)
  (define bottom-right #\u255D)
  (define inner-width (- W 2))

  ;; Emit a single row: left edge, `inner-width` copies of `fill`,
  ;; right edge, and a newline.
  (define (row left fill right)
    (display left)
    (display (make-string inner-width fill))
    (display right)
    (newline))

  ;; Emit the whole box to the current output port.
  (define (draw)
    (row (string top-left) horiz (string top-right))
    (for ([line (in-range (- H 2))])
      (row vert content-char vert))
    (row (string bottom-left) horiz (string bottom-right)))

  ;; ----------------------------------------
  ;; Direct to console:
  (draw)

  ;; ----------------------------------------
  ;; Write UTF-8 byte string piecewise (to check that
  ;; incomplete encodings are buffered)
  (define captured
    (let ([sink (open-output-bytes)])
      (parameterize ([current-output-port sink])
        (draw))
      (get-output-bytes sink)))
  (define total (bytes-length captured))
  (let write-chunks ([start 0])
    (when (< start total)
      ;; Each chunk is 1..MAX-FLUSH-SPAN bytes, clipped to what remains,
      ;; so a chunk boundary can fall inside a multi-byte encoding.
      (let* ([chunk (min (add1 (random MAX-FLUSH-SPAN))
                         (- total start))]
             [end (+ start chunk)])
        (write-bytes captured (current-output-port) start end)
        (flush-output)
        (write-chunks end)))))
(go #\uA9) ; copyright sign, U+00A9 (2 bytes in UTF-8)
(go #\u3BB) ; Greek small letter lambda, U+03BB (2 bytes in UTF-8)
(go #\U24b6) ; circled "A", U+24B6 (3 bytes in UTF-8)
(go #\U1F150) ; negative circled "A", U+1F150 (4 bytes in UTF-8; surrogate pair in UTF-16)

View File

@ -43,6 +43,8 @@ struct rktio_fd_t {
char *buffer; /* shared with reading thread */
int has_pending_byte; /* for text-mode input, may be dropped by a following lf */
int pending_byte; /* for text-mode input, either a CR waiting to decode, or byte that didn't fit */
int leftover_len; /* for bytes that should be written, but that form a UTF-8 encoding prefix */
char leftover[6];
#endif
};
@ -129,6 +131,11 @@ static intptr_t adjust_input_text_for_pending_cr(rktio_fd_t *rfd, char *buffer,
/* Text-mode output helpers; `rktio_write` uses them to adjust the
   buffer before writing and to map the written count back to the
   caller's original buffer. */
static const char *adjust_output_text(const char *buffer, intptr_t *towrite);
static intptr_t recount_output_text(const char *orig_buffer, const char *buffer, intptr_t wrote);
/* Console output helpers: `convert_output_wtext` decodes a UTF-8
   chunk (prefixed by any leftover bytes from a previous write) into a
   freshly allocated wide-character buffer that the caller frees;
   `recount_output_wtext` reports how many UTF-8 bytes correspond to a
   given number of written wide characters. */
static wchar_t *convert_output_wtext(const char *buffer, intptr_t *_towrite,
int *_can_leftover, int *_keep_leftover,
int leftover_len, char *leftover);
static intptr_t recount_output_wtext(wchar_t *w_buffer, intptr_t winwrote);
#endif
/*========================================================================*/
@ -1021,15 +1028,18 @@ static void deinit_read_fd(rktio_t *rktio, rktio_fd_t *rfd, int full_close)
static long WINAPI WindowsFDReader(Win_FD_Input_Thread *th)
{
DWORD toget, got;
int perma_eof = 0;
int perma_eof = 0, ft, is_console = 0;
HANDLE eof_wait = NULL;
if (GetFileType((HANDLE)th->fd) == FILE_TYPE_PIPE) {
ft = GetFileType((HANDLE)th->fd);
if (ft == FILE_TYPE_PIPE) {
/* Reading from a pipe will return early when data is available. */
toget = RKTIO_FD_BUFFSIZE;
} else {
/* Non-pipe: get one char at a time: */
toget = 1;
is_console = (ft == FILE_TYPE_CHAR);
}
while (!perma_eof && !th->err) {
@ -1044,8 +1054,19 @@ static long WINAPI WindowsFDReader(Win_FD_Input_Thread *th)
/* Spurious wake-up? */
ReleaseSemaphore(th->lock_sema, 1, NULL);
} else {
int ok;
ReleaseSemaphore(th->lock_sema, 1, NULL);
if (ReadFile(th->fd, th->buffer, toget, &got, NULL)) {
if (!is_console)
ok = ReadFile(th->fd, th->buffer, toget, &got, NULL);
else {
# define CONSOLE_BUFFER_IN_SIZE 16
wchar_t w_buffer[CONSOLE_BUFFER_IN_SIZE];
ok = ReadConsoleW(th->fd, w_buffer, CONSOLE_BUFFER_IN_SIZE, &got, NULL);
if (ok) {
got = WideCharToMultiByte(CP_UTF8, 0, w_buffer, got, th->buffer, RKTIO_FD_BUFFSIZE, NULL, 0);
}
}
if (ok) {
WaitForSingleObject(th->lock_sema, INFINITE);
th->avail = got;
th->offset = 0;
@ -1172,19 +1193,42 @@ intptr_t rktio_write(rktio_t *rktio, rktio_fd_t *rfd, const char *buffer, intptr
/* If we try to write too much at once, the result
is ERROR_NOT_ENOUGH_MEMORY (as opposed to a partial write). */
int ok;
int ok, to_console, can_leftover = 0, keep_leftover = 0;
intptr_t towrite = len;
const char *orig_buffer = buffer;
wchar_t *w_buffer = NULL;
DWORD max_winwrote;
int err;
if (rfd->modes & RKTIO_OPEN_TEXT)
buffer = adjust_output_text(buffer, &towrite);
max_winwrote = towrite;
to_console = rktio_fd_is_terminal(rktio, rfd);
if (to_console) {
/* Decode UTF-8 and write a chunk on a character boundary. */
w_buffer = convert_output_wtext(buffer, &towrite,
&can_leftover, &keep_leftover,
rfd->leftover_len, rfd->leftover);
}
while (1) {
ok = WriteFile((HANDLE)rfd->fd, buffer, towrite, &winwrote, NULL);
if (!to_console)
ok = WriteFile((HANDLE)rfd->fd, buffer, towrite, &winwrote, NULL);
else {
if (towrite)
ok = WriteConsoleW((HANDLE)rfd->fd, w_buffer, towrite, &winwrote, NULL);
else {
/* can happen if can_leftover is > 0 */
ok = 1;
winwrote = 0;
}
}
if (!ok)
err = GetLastError();
if (!ok && (err == ERROR_NOT_ENOUGH_MEMORY)) {
towrite = towrite >> 1;
if (towrite && (buffer != orig_buffer)) {
@ -1205,6 +1249,27 @@ intptr_t rktio_write(rktio_t *rktio, rktio_fd_t *rfd, const char *buffer, intptr
return RKTIO_WRITE_ERROR;
}
if (to_console) {
/* Convert wchar count to byte count, taking into account leftovers */
int wrote_all = (winwrote == towrite);
if (winwrote) {
/* Recounting only works right if the output was well-formed
UTF-8. Weird things happen otherwise... but we guard against
external inconsistency with the `max_winwrote` check below. */
winwrote = recount_output_wtext(w_buffer, winwrote);
winwrote -= rfd->leftover_len;
rfd->leftover_len = 0;
}
if (wrote_all && can_leftover) {
memcpy(rfd->leftover + keep_leftover, buffer + winwrote, can_leftover);
rfd->leftover_len = can_leftover + keep_leftover;
winwrote += can_leftover;
}
free(w_buffer);
if (winwrote > max_winwrote)
winwrote = max_winwrote;
}
if (buffer != orig_buffer) {
/* Convert converted count back to original count: */
winwrote = recount_output_text(orig_buffer, buffer, winwrote);
@ -1484,6 +1549,108 @@ static intptr_t recount_output_text(const char *orig_buffer, const char *buffer,
return i;
}
/* Decode a chunk of UTF-8 output into wide characters for the console.

   buffer          bytes that the caller wants to write
   _towrite        in: number of bytes available in `buffer`;
                   out: number of wide characters stored in the result
   _can_leftover   out: number of trailing bytes of `buffer` that form
                   an incomplete UTF-8 prefix and should be saved by
                   the caller instead of being written (0 if none)
   _keep_leftover  out: number of existing leftover bytes that the
                   caller must keep in front of the newly saved bytes
                   (non-zero only when the incomplete sequence started
                   in `leftover` and `buffer` did not complete it)
   leftover_len,
   leftover        bytes saved by a previous call; they are treated as
                   a prefix on `buffer`

   Returns a `malloc`ed wide-character buffer that the caller must
   `free`. If an allocation fails, the result is NULL and all three
   outputs are set to 0, so the caller sees a write that made no
   progress and leaves its saved leftover bytes untouched.

   To avoid quadratic behavior overall, at most about 1024 wide
   characters are converted per call. */
static wchar_t *convert_output_wtext(const char *buffer, intptr_t *_towrite,
                                     int *_can_leftover, int *_keep_leftover,
                                     int leftover_len, char *leftover)
{
  intptr_t i, count, src_len, len = *_towrite;
  const unsigned char *bytes = (const unsigned char *)buffer;
  const char *src_buffer;
  char *joined = NULL;
  wchar_t *dest_buffer;
  int want, span = 0;

  if (leftover_len) {
    /* Assume that leftover is a valid prefix; its first byte tells
       how long the whole sequence is: */
    int v = ((unsigned char *)leftover)[0];
    if ((v & 0xF8) == 0xF0)
      span = 4;
    else if ((v & 0xF0) == 0xE0)
      span = 3;
    else
      span = 2;
  }
  /* Number of continuation bytes still needed to finish the sequence: */
  want = span - leftover_len;

  /* Scan for a character boundary, estimating the number of wide
     characters in `count`: */
  for (i = 0, count = 0; (i < len) && (count < 1024); i++) {
    int v = bytes[i];
    if (want) {
      if ((v & 0xC0) == 0x80) {
        /* valid continuation byte */
        want--;
        if (!want) {
          count++;
          if (span == 4)
            count++; /* surrogate pair */
        }
      } else {
        /* not a valid continuation byte: the broken sequence counts
           as one character, and this byte starts over */
        count++;
        want = 0;
        --i; /* retry byte */
      }
    } else if (!(v & 0x80)) {
      count++;
    } else if ((v & 0xF8) == 0xF0) {
      span = 4;
      want = 3;
    } else if ((v & 0xF0) == 0xE0) {
      span = 3;
      want = 2;
    } else {
      span = 2;
      want = 1;
    }
  }

  if ((i == len) && (want > 0)) {
    /* consuming all input, so set leftover */
    int keep = span - want; /* bytes seen so far of the unfinished sequence */
    if (i >= keep) {
      /* the unfinished sequence lies entirely within `buffer` */
      *_can_leftover = keep;
      i -= keep;
      *_keep_leftover = 0;
    } else {
      /* the unfinished sequence started in `leftover`, and everything
         in `buffer` extends it */
      *_can_leftover = (keep - leftover_len);
      *_keep_leftover = leftover_len;
      i = 0;
    }
  } else {
    *_can_leftover = 0;
    *_keep_leftover = 0;
  }

  src_len = i + leftover_len;

  if (leftover_len) {
    /* Prepend the saved bytes so that the converter sees whole sequences: */
    joined = malloc(src_len);
    if (!joined) {
      *_towrite = 0;
      *_can_leftover = 0;
      *_keep_leftover = 0;
      return NULL;
    }
    memcpy(joined, leftover, leftover_len);
    memcpy(joined + leftover_len, buffer, i);
    src_buffer = joined;
  } else
    src_buffer = buffer;

  /* Size the destination by the number of source bytes instead of by
     `count`: decoding never produces more UTF-16 units than there are
     UTF-8 bytes, while `count` can be too small when the input is
     malformed, in which case the conversion would fail for lack of
     space and no output would ever be made. */
  dest_buffer = malloc(sizeof(wchar_t) * (src_len ? src_len : 1));
  if (!dest_buffer) {
    free(joined);
    *_towrite = 0;
    *_can_leftover = 0;
    *_keep_leftover = 0;
    return NULL;
  }

  if (count > 0)
    count = MultiByteToWideChar(CP_UTF8, 0, src_buffer, (int)src_len,
                                dest_buffer, (int)src_len);

  *_towrite = count;

  free(joined);

  return dest_buffer;
}
/* Reports how many bytes the first `winwrote` wide characters of
   `w_buffer` occupy when encoded as UTF-8. */
static intptr_t recount_output_wtext(wchar_t *w_buffer, intptr_t winwrote)
{
  intptr_t utf8_len;

  /* With no destination buffer (NULL, size 0), the call only
     computes the required byte count: */
  utf8_len = WideCharToMultiByte(CP_UTF8, 0, w_buffer, winwrote, NULL, 0, NULL, 0);

  return utf8_len;
}
static void deinit_write_fd(rktio_t *rktio, rktio_fd_t *rfd, int full_close)
{
if (rfd->oth) {