Convert duktape's cesu-8 strings to utf-8 in js_print (95c3f705) · Commits · milliways / mw

src/client/js-duk.c

+64 −2

Original line number	Diff line number	Diff line
		@@ -79,14 +79,76 @@ static void start_timeout(void)
		timeout_event = alarm_after(3, 0, NULL, &timeout);
		}

		/* Return non-zero when the six bytes at p form a CESU-8 surrogate pair:
		   ED A0..AF 80..BF (high surrogate, U+D800..U+DBFF) immediately followed by
		   ED B0..BF 80..BF (low surrogate, U+DC00..U+DFFF).
		   The caller must guarantee that six bytes are readable at p. */
		static int cesu8_is_surrogate_pair(const unsigned char *p)
		{
			return p[0] == 0xED && (p[1] & 0xF0) == 0xA0 && (p[2] & 0xC0) == 0x80 &&
			       p[3] == 0xED && (p[4] & 0xF0) == 0xB0 && (p[5] & 0xC0) == 0x80;
		}

		/* Duktape uses a CESU-8 encoding, which allows UTF-16 surrogate pairs
		   (themselves encoded in UTF-8), in order to be kinda-sorta compatible with
		   ecmascript's UTF-16 requirements. This function copies the CESU-8 string,
		   converting every adjacent high+low surrogate pair it finds into the
		   equivalent 4-byte UTF-8 sequence.

		   Every other byte is copied verbatim. That includes lone (unpaired)
		   surrogates and malformed or truncated sequences, so no input data is
		   dropped and the scan never steps past the terminating NUL.

		   Returns a newly allocated NUL-terminated string that the caller must
		   free(), or NULL if cesu8 is NULL or the allocation fails. */
		static char *cesu8_to_utf8(const char *cesu8)
		{
			if (cesu8 == NULL)
				return NULL;

			const unsigned char *in = (const unsigned char *)cesu8;
			const size_t len = strlen(cesu8);

			/* The output is never longer than the input: a 6-byte surrogate
			   pair shrinks to 4 bytes and everything else is copied 1:1. */
			char *utf8 = calloc(1, len + 1);
			if (utf8 == NULL)
				return NULL;

			char *out = utf8;
			size_t i = 0;

			while (i < len) {
				if (len - i >= 6 && cesu8_is_surrogate_pair(in + i)) {
					/* Each surrogate carries 10 payload bits: the low nibble
					   of its second byte and the low 6 bits of its third. */
					uint32_t hi = ((uint32_t)(in[i + 1] & 0x0F) << 6) |
					              (uint32_t)(in[i + 2] & 0x3F);
					uint32_t lo = ((uint32_t)(in[i + 4] & 0x0F) << 6) |
					              (uint32_t)(in[i + 5] & 0x3F);
					uint32_t u = 0x10000 + ((hi << 10) | lo);

					*out++ = (char)(0xF0 | (u >> 18));
					*out++ = (char)(0x80 | ((u >> 12) & 0x3F));
					*out++ = (char)(0x80 | ((u >> 6) & 0x3F));
					*out++ = (char)(0x80 | (u & 0x3F));
					i += 6;
				} else {
					*out++ = (char)in[i++];
				}
			}
			*out = '\0';
			return utf8;
		}

		/* Script-callable print: displays every argument on the Duktape value
		   stack, in order, via display_message().

		   Each argument is coerced to a string with duk_to_string() and run
		   through cesu8_to_utf8() before display, so that surrogate pairs in
		   Duktape's CESU-8 strings are shown as real UTF-8 characters.

		   Returns 0 in all cases (presumably "no return value" in Duktape's
		   native-function convention; confirm against the Duktape API). */
		static duk_ret_t js_print(duk_context *cx)
		{
			int argc = duk_get_top(cx);

			if (argc < 1)
				return 0;
			for (int i = 0; i < argc; i++) {
				/* Negative index: i - argc counts back from the stack top,
				   so this walks the arguments first to last. */
				const char *cesu8 = duk_to_string(cx, i - argc);
				char *utf8 = cesu8_to_utf8(cesu8);

				/* If the conversion yields NULL, show the raw string
				   rather than handing NULL to display_message(). */
				display_message(utf8 != NULL ? utf8 : cesu8, 0, 1);
				free(utf8);
			}
			return 0;
		}