From e7bb4def7180f8aba17254b4b8472a2defc33189 Mon Sep 17 00:00:00 2001 From: Neal Alexander Date: Thu, 9 Jan 2020 19:34:58 +0100 Subject: [PATCH] added unicode support to windows console i/o original commit: e7e638e871ac4b46a84149dda93aae8741683e0a --- c/expeditor.c | 64 +++++++++++++++++----- c/externs.h | 8 +++ c/new-io.c | 77 ++++++++++++++++++++------- c/windows.c | 144 ++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 259 insertions(+), 34 deletions(-) diff --git a/c/expeditor.c b/c/expeditor.c index c16865bc1e..b8dabb4c70 100644 --- a/c/expeditor.c +++ b/c/expeditor.c @@ -68,6 +68,11 @@ static IBOOL s_ee_init_term(void) { return init_status; } + +static int utf16_is_surrogate(WORD uc) { + return (uc - 0xd800u) < 2048u; +} + /* returns char, eof, #t (winched), or #f (nothing ready), the latter only if blockp is false */ static ptr s_ee_read_char(IBOOL blockp) { @@ -104,13 +109,13 @@ static ptr s_ee_read_char(IBOOL blockp) { tc = get_thread_context(); if (DISABLECOUNT(tc) == FIX(0)) { deactivate_thread(tc); - succ = ReadConsoleInput(hStdin, irInBuf, 1, &cNumRead); + succ = ReadConsoleInputW(hStdin, irInBuf, 1, &cNumRead); reactivate_thread(tc); } else { - succ = ReadConsoleInput(hStdin, irInBuf, 1, &cNumRead); + succ = ReadConsoleInputW(hStdin, irInBuf, 1, &cNumRead); } #else /* PTHREADS */ - succ = ReadConsoleInput(hStdin, irInBuf, 1, &cNumRead); + succ = ReadConsoleInputW(hStdin, irInBuf, 1, &cNumRead); #endif /* PTHREADS */ @@ -125,15 +130,22 @@ static ptr s_ee_read_char(IBOOL blockp) { KEY_EVENT_RECORD ker = irInBuf[0].Event.KeyEvent; rptcnt = ker.wRepeatCount; if (ker.bKeyDown) { - char c; + WCHAR c; - if (c = ker.uChar.AsciiChar) { + if (c = ker.uChar.UnicodeChar) { /* translate ^@ 2) and ^ to nul */ - if (c == 0x20 && (ker.dwControlKeyState & (LEFT_CTRL_PRESSED|RIGHT_CTRL_PRESSED))) + if (c == 0x20 && (ker.dwControlKeyState + & (LEFT_CTRL_PRESSED|RIGHT_CTRL_PRESSED))) { buf[0] = 0; - else - buf[0] = c; buflen = 1; + + } else if 
(utf16_is_surrogate(c)) { + return Schar('\0'); + + } else { + return Schar(c); + } + } else { switch (ker.wVirtualKeyCode) { case VK_DELETE: @@ -508,11 +520,35 @@ static ptr s_ee_get_clipboard(void) { ptr x = S_G.null_string; if (OpenClipboard((HWND)0)) { - HANDLE h = GetClipboardData(CF_TEXT); + HANDLE h = GetClipboardData(CF_UNICODETEXT); if (h != (HANDLE *)0) { - char *s = (char *)GlobalLock(h); - if (s != (char *)0) x = Sstring(s); + wchar_t *s = (wchar_t *)GlobalLock(h); + + if (s != NULL) { + int sz8 = WideCharToMultiByte(CP_UTF8, + WC_ERR_INVALID_CHARS, + s, -1, + NULL, + 0, NULL, NULL); + if (sz8 > 0) { + unsigned char *buf = (unsigned char*) malloc(sz8); + + if (buf != NULL) { + if (WideCharToMultiByte(CP_UTF8, + WC_ERR_INVALID_CHARS, + s, -1, + buf, + sz8, NULL, NULL)) { + + x = Sstring_utf8(buf, sz8 - 1); + } + + free(buf); + } + } + } + GlobalUnlock(h); } CloseClipboard(); @@ -521,9 +557,9 @@ static ptr s_ee_get_clipboard(void) { return x; } -static void s_ee_write_char(wchar_t c) { - if (c > 255) c = '?'; - putchar(c); +static void s_ee_write_char(wchar_t c) { // TODO: utf-32 chars? 
+ DWORD n; + WriteConsoleW(hStdout, &c, 1, &n, NULL); } #else /* WIN32 */ diff --git a/c/externs.h b/c/externs.h index 50f0f5a54a..ae4d124b2b 100644 --- a/c/externs.h +++ b/c/externs.h @@ -396,6 +396,14 @@ extern int S_windows_stat64(const char *pathname, struct STATBUF *buffer); extern int S_windows_system(const char *command); extern int S_windows_unlink(const char *pathname); extern char *S_windows_getcwd(char *buffer, int maxlen); + +extern int S_windows_stdin_read(unsigned char *buf, int size); +extern int S_windows_stdout_write(unsigned char *buf, int size); +extern int S_windows_stderr_write(unsigned char *buf, int size); +extern int S_windows_console_write_utf8(HANDLE h, unsigned char *b_ptr, int b_n, int *written); +extern int S_windows_console_read_utf8(HANDLE h, unsigned char *output, int want, int *got); + + #endif /* WIN32 */ #ifdef FEATURE_EXPEDITOR diff --git a/c/new-io.c b/c/new-io.c index 67fef8fefa..cd1a4ec4aa 100644 --- a/c/new-io.c +++ b/c/new-io.c @@ -417,6 +417,40 @@ ptr S_close_fd(ptr file, IBOOL gzflag) { #define IO_SIZE_T size_t #endif /* WIN32 */ +int os_read(ptr tc, INT fd, unsigned char *buf, IO_SIZE_T size, int *m) { +#ifdef WIN32 + if (fd == 0) { + + // There may be a race condition or other problem with this signal mask. + // Originally it was placed deeper down, right next to the win32 API + // call, but this failed to mask the signal for some confusing reason. 
+ + if (!SetConsoleCtrlHandler(NULL, TRUE)) { + *m = -1; + return 1; + } + + *m = S_windows_stdin_read(buf, size); + + if (!SetConsoleCtrlHandler(NULL, FALSE)) + return 1; + + return 0; + + } else +#endif /* WIN32 */ + { + int flag = 0; + + FD_EINTR_GUARD(*m >= 0 || Sboolean_value(KEYBOARDINTERRUPTPENDING(tc)), + flag, + *m = READ(fd, buf, size)); + + return flag; + } +} + + /* Returns string on error, #!eof on end-of-file and integer-count otherwise */ ptr S_bytevector_read(ptr file, ptr bv, iptr start, iptr count, IBOOL gzflag) { INT saved_errno = 0; @@ -433,33 +467,20 @@ ptr S_bytevector_read(ptr file, ptr bv, iptr start, iptr count, IBOOL gzflag) { #endif LOCKandDEACTIVATE(tc, bv) -#ifdef WIN32 - if (!gzflag && fd == 0) { - DWORD error_code; - SetConsoleCtrlHandler(NULL, TRUE); - SetLastError(0); - m = _read(0, &BVIT(bv,start), (IO_SIZE_T)count); - error_code = GetLastError(); - SetConsoleCtrlHandler(NULL, FALSE); - if (m == 0 && error_code == 0x3e3) { - KEYBOARDINTERRUPTPENDING(tc) = Strue; - SOMETHINGPENDING(tc) = Strue; - } - } else -#endif /* WIN32 */ { if (!gzflag) { - FD_EINTR_GUARD( - m >= 0 || Sboolean_value(KEYBOARDINTERRUPTPENDING(tc)), flag, - m = READ(fd,&BVIT(bv,start),(IO_SIZE_T)count)); + int len = 0; + flag = os_read(tc, fd, &BVIT(bv, start), (IO_SIZE_T)count, &len); + m = len; } else { GZ_EINTR_GUARD( 1, m >= 0 || Sboolean_value(KEYBOARDINTERRUPTPENDING(tc)), flag, gzfile, m = S_glzread(gzfile, &BVIT(bv,start), (GZ_IO_SIZE_T)count)); } - } + saved_errno = errno; + } REACTIVATEandUNLOCK(tc, bv) if (Sboolean_value(KEYBOARDINTERRUPTPENDING(tc))) { @@ -535,6 +556,21 @@ ptr S_bytevector_read_nb(ptr file, ptr bv, iptr start, iptr count, IBOOL gzflag) #endif /* WIN32 */ } + +int os_write(INT fd, unsigned char *buf, IO_SIZE_T size) { +#ifdef WIN32 + + if (fd == 1) + return S_windows_stdout_write(buf, size); + + else if (fd == 2) + return S_windows_stderr_write(buf, size); +#endif + + return WRITE(fd, buf, size); +} + + ptr S_bytevector_write(ptr file, 
ptr bv, iptr start, iptr count, IBOOL gzflag) { iptr i, s, c; ptr tc = get_thread_context(); @@ -552,6 +588,7 @@ ptr S_bytevector_write(ptr file, ptr bv, iptr start, iptr count, IBOOL gzflag) { /* if we could know that fd is nonblocking, we wouldn't need to deactivate. we could test ioctl, but some other thread could change it before we actually get around to writing. */ + LOCKandDEACTIVATE(tc, bv) if (gzflag) { /* strangely, gzwrite returns 0 on error */ @@ -561,7 +598,7 @@ ptr S_bytevector_write(ptr file, ptr bv, iptr start, iptr count, IBOOL gzflag) { i = S_glzwrite(gzfile, &BVIT(bv,s), (GZ_IO_SIZE_T)cx)); } else { FD_EINTR_GUARD(i >= 0 || Sboolean_value(KEYBOARDINTERRUPTPENDING(tc)), - flag, i = WRITE(fd, &BVIT(bv,s), (IO_SIZE_T)cx)); + flag, i = os_write(fd, &BVIT(bv,s), (IO_SIZE_T)cx)); } saved_errno = errno; REACTIVATEandUNLOCK(tc, bv) @@ -616,7 +653,7 @@ ptr S_put_byte(ptr file, INT byte, IBOOL gzflag) { i = S_glzwrite(gzfile, buf, 1)); } else { FD_EINTR_GUARD(i >= 0 || Sboolean_value(KEYBOARDINTERRUPTPENDING(tc)), - flag, i = WRITE(fd, buf, 1)); + flag, i = os_write(fd, buf, 1)); } saved_errno = errno; REACTIVATE(tc) diff --git a/c/windows.c b/c/windows.c index 1dac846529..a695628da8 100644 --- a/c/windows.c +++ b/c/windows.c @@ -33,6 +33,7 @@ void S_machine_init() { Sregister_symbol("(windows)PutRegistry", (void *)s_PutRegistry); Sregister_symbol("(windows)RemoveRegistry", (void *)s_RemoveRegistry); Sregister_symbol("(windows)ErrorString", (void *)s_ErrorString); + SetConsoleOutputCP(CP_UTF8); } INT S_getpagesize() { @@ -495,3 +496,146 @@ char *Sgetenv(const char *name) { } } } + +// Console + +#define WINDOWS_CONSOLE_STATIC_INPUT_LEN 1024 +#define UTF8_MAX_CODEPOINT 4 +#define WINDOWS_CONSOLE_ERROR -1 +#define WINDOWS_CONSOLE_EOF -2 +#define WINDOWS_CONSOLE_SUCCESS 0 + + +static int _windows_console_read_utf8_chunk(HANDLE h, unsigned char *buf, + int size, int *got) { + // 'utf16_buf' should be locked by the caller, and 'buf' must have the + // correct 
size relative to 'utf16_buf'. + + static wchar_t utf16_buf[WINDOWS_CONSOLE_STATIC_INPUT_LEN]; + + if (h == INVALID_HANDLE_VALUE || got == NULL || buf == NULL) + return WINDOWS_CONSOLE_ERROR; + + *got = 0; + + if (size < (WINDOWS_CONSOLE_STATIC_INPUT_LEN * UTF8_MAX_CODEPOINT)) + return WINDOWS_CONSOLE_ERROR; + + + DWORD utf16_len = 0; + /* ReadConsoleW takes the buffer capacity in WCHARs, not bytes. */ + if (!ReadConsoleW(h, utf16_buf, WINDOWS_CONSOLE_STATIC_INPUT_LEN, &utf16_len, NULL)) + return WINDOWS_CONSOLE_ERROR; + + else if (utf16_len == 0) + return WINDOWS_CONSOLE_EOF; + + else if (utf16_len > INT_MAX) + return WINDOWS_CONSOLE_ERROR; + + int utf8_len = WideCharToMultiByte(CP_UTF8, + WC_ERR_INVALID_CHARS, + utf16_buf, utf16_len, + buf, + size, NULL, NULL); + if (utf8_len <= 0) + return WINDOWS_CONSOLE_ERROR; + + *got = utf8_len; + return WINDOWS_CONSOLE_SUCCESS; +} + + +/* Copies up to 'want' bytes of UTF-8 console input into 'output' and stores the count in '*got'; returns WINDOWS_CONSOLE_SUCCESS, WINDOWS_CONSOLE_EOF or WINDOWS_CONSOLE_ERROR. */ +int S_windows_console_read_utf8(HANDLE h, unsigned char *output, int want, + int *got) { + + // Regardless of the size of 'want', the maximum which can be read per call + // is WINDOWS_CONSOLE_STATIC_INPUT_LEN * UTF8_MAX_CODEPOINT + + static SRWLOCK lock = SRWLOCK_INIT; + + if (h == INVALID_HANDLE_VALUE || got == NULL || output == NULL) + return WINDOWS_CONSOLE_ERROR; + + *got = 0; + + if (want <= 0) + return WINDOWS_CONSOLE_SUCCESS; + + AcquireSRWLockExclusive(&lock); + { + static int surplus = 0; + static unsigned char surplus_buf[WINDOWS_CONSOLE_STATIC_INPUT_LEN + * UTF8_MAX_CODEPOINT]; + + + if (surplus <= 0) { + const int e = _windows_console_read_utf8_chunk(h, surplus_buf, + sizeof surplus_buf, + &surplus); + + if (e != WINDOWS_CONSOLE_SUCCESS) { + ReleaseSRWLockExclusive(&lock); + return e; + } + + } + + const int n = max(0,min(surplus, want)); + memcpy(output, surplus_buf, n); + + surplus -= n; + *got = n; + + if (surplus > 0) + memmove(surplus_buf, surplus_buf + n, surplus); + } + ReleaseSRWLockExclusive(&lock); + + return WINDOWS_CONSOLE_SUCCESS; +} + +// Having previously set the code page to UTF-8, we can output directly. 
+ +int S_windows_stderr_write(unsigned char *buf, int size) { + return _write(2, buf, size); +} + + +int S_windows_stdout_write(unsigned char *buf, int size) { + return _write(1, buf, size); +} + +// Despite having set the code page to UTF-8, we can't use it to read due to +// windows bugs. We fall back to using the UTF-16 api, buffering the stream +// to avoid splitting codepoints. + +int S_windows_stdin_read(unsigned char *buf, int size) { + const HANDLE h = GetStdHandle(STD_INPUT_HANDLE); + + if (h == INVALID_HANDLE_VALUE || size <= 0) + return WINDOWS_CONSOLE_ERROR; + + const int type = GetFileType(h); + + switch (type) { + case FILE_TYPE_CHAR: { + int n = 0; + DWORD mode = 0; + /* NUL and other non-console character devices also report FILE_TYPE_CHAR, but ReadConsoleW needs a real console handle. */ + if (!GetConsoleMode(h, &mode)) return _read(0, buf, size); + const int ret = S_windows_console_read_utf8(h, buf, size, &n); + switch (ret) { + case WINDOWS_CONSOLE_EOF : return 0; + case WINDOWS_CONSOLE_SUCCESS : return n; + default : return -1; + } + } + case FILE_TYPE_DISK: + case FILE_TYPE_PIPE: + return _read(0, buf, size); + default: + return -1; + } +}