From ffb95c2e0ced9c6f9ca9109e37ea039a67797118 Mon Sep 17 00:00:00 2001
From: Petr Skocik <pskocik@gmail.com>
Date: Sun, 17 Jan 2021 22:21:07 +0100
Subject: [PATCH] Better handling of UCNs in strings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As the standard requires, take 4 hex digits after the \u opener of a
Universal Character Name, or take 8 hex digits after \U, but reject
smaller counts and don't consume more (https://port70.net/~nsz/c/c11/n1570.html#6.4.3,
https://port70.net/~nsz/c/c99/n1256.html#6.4.3).

The Unicode code point used to get truncated to 1 byte. Now it gets expanded into UTF-8,
matching gcc & clang behavior on Linux.

TODO: Universal character names should also be supported in identifiers,
as in, e.g., char \u010dau_sv\u011bte[]="čau_světe";
---
 tccpp.c                               | 44 +++++++++++++++++++++++----
 tests/tests2/97_utf8_string_literal.c |  9 +++++-
 2 files changed, 46 insertions(+), 7 deletions(-)

diff --git a/tccpp.c b/tccpp.c
index cd9bd684..bd84c574 100644
--- a/tccpp.c
+++ b/tccpp.c
@@ -345,6 +345,31 @@ ST_INLN void cstr_ccat(CString *cstr, int ch)
     cstr->size = size;
 }
 
+/* UTF-8-encode Uc at b (up to 4 bytes), return the new end; surrogates and values > 0x10FFFF write nothing */
+static char *unicode_to_utf8 (char *b, uint32_t Uc)
+{
+    if (Uc<0x80) *b++=Uc;
+    else if (Uc<0x800) *b++=192+Uc/64, *b++=128+Uc%64;
+    else if (Uc<0x10000) { if (Uc-0xd800u>=0x800) *b++=224+Uc/4096, *b++=128+Uc/64%64, *b++=128+Uc%64; }
+    else if (Uc<0x110000) *b++=240+Uc/262144, *b++=128+Uc/4096%64, *b++=128+Uc/64%64, *b++=128+Uc%64;
+    return b;
+}
+
+/* add a unicode character expanded into utf8 (nothing is added for
+   surrogates and values above 0x10FFFF, for which no bytes are produced) */
+static void cstr_u8cat(CString *cstr, int ch)
+{
+    char buf[4], *b = buf; /* char, not unsigned char: matches unicode_to_utf8's char * parameter */
+    int add = (int)(unicode_to_utf8(buf, (uint32_t)ch) - buf); /* number of bytes produced, 0..4 */
+    int size = cstr->size + add;
+    char *p;
+    if (size > cstr->size_allocated)
+        cstr_realloc(cstr, size);
+    for (p = (char *)cstr->data + (size - add); add; add--) *p++ = *b++;
+    cstr->size = size;
+}
+
+
 ST_FUNC void cstr_cat(CString *cstr, const char *str, int len)
 {
     int size;
@@ -2100,12 +2125,13 @@ static void parse_escape_string(CString *outstr, const uint8_t *buf, int is_long
                 }
                 c = n;
                 goto add_char_nonext;
-            case 'x':
-            case 'u':
-            case 'U':
+            case 'x': { unsigned ucn_chars_nr = -1u; goto parse_hex_or_ucn;
+            case 'u': ucn_chars_nr = 4; goto parse_hex_or_ucn;
+            case 'U': ucn_chars_nr = 8; goto parse_hex_or_ucn;
+                parse_hex_or_ucn:;
                 p++;
                 n = 0;
-                for(;;) {
+                for(unsigned i=1;i<=ucn_chars_nr;i++) {
                     c = *p;
                     if (c >= 'a' && c <= 'f')
                         c = c - 'a' + 10;
@@ -2113,13 +2139,19 @@ static void parse_escape_string(CString *outstr, const uint8_t *buf, int is_long
                         c = c - 'A' + 10;
                     else if (isnum(c))
                         c = c - '0';
-                    else
+                    else{
+                        if (ucn_chars_nr!=-1)
+                            tcc_error("%u hex digits expected in universal-character-name\n", ucn_chars_nr);
                         break;
+                    }
                     n = n * 16 + c;
                     p++;
                 }
                 c = n;
-                goto add_char_nonext;
+                if(ucn_chars_nr==-1) goto add_char_nonext;
+                cstr_u8cat(outstr, c);
+                continue;
+             }
             case 'a':
                 c = '\a';
                 break;
diff --git a/tests/tests2/97_utf8_string_literal.c b/tests/tests2/97_utf8_string_literal.c
index 96fbab0d..c7e89989 100644
--- a/tests/tests2/97_utf8_string_literal.c
+++ b/tests/tests2/97_utf8_string_literal.c
@@ -1,12 +1,19 @@
 // this file contains BMP chars encoded in UTF-8
 #include <stdio.h>
 #include <wchar.h>
+#include <stdlib.h>
+#include <string.h>
 
 int main()
 {
+	char hello_world_in_czech[] = "čau, světe"; /* non-ASCII letters given directly as UTF-8 source bytes */
+	char hello_world_in_czech_ucn[] = "\u010dau, sv\u011bte"; /* same text, non-ASCII letters spelled as UCNs */
+	if (sizeof(hello_world_in_czech) != sizeof(hello_world_in_czech_ucn) /* both spellings must yield identical bytes */
+		|| strcmp(hello_world_in_czech, hello_world_in_czech_ucn))
+		abort();
+
     wchar_t s[] = L"hello$$你好¢¢世界€€world";
     wchar_t *p;
     for (p = s; *p; p++) printf("%04X ", (unsigned) *p);
-    printf("\n");
     return 0;
 }