mirror of
git://repo.or.cz/tinycc.git
synced 2026-06-28 23:58:41 +08:00
Better handling of UCNs in strings
As the standard requires, take exactly 4 hex digits after the \u opener of a Universal Character Name, or exactly 8 hex digits after \U; reject smaller counts and don't consume more (https://port70.net/~nsz/c/c11/n1570.html#6.4.3, https://port70.net/~nsz/c/c99/n1256.html#6.4.3). The Unicode code point used to get truncated to 1 byte. Now it gets expanded into UTF-8, matching gcc & clang behavior on Linux. TODO: Universal character names should also be supported in identifiers, e.g., char \u010dau_sv\u011bte[]="čau_světe";
This commit is contained in:
parent
6b614c4deb
commit
ffb95c2e0c
44
tccpp.c
44
tccpp.c
@ -345,6 +345,31 @@ ST_INLN void cstr_ccat(CString *cstr, int ch)
|
|||||||
cstr->size = size;
|
cstr->size = size;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Encode the code point Uc as UTF-8 into the buffer at b.
 *
 * Writes between 0 and 4 bytes and returns a pointer just past the last
 * byte written, so the caller gets the encoded length as (result - b).
 * The buffer must have room for 4 bytes.
 *
 * Nothing is written (b is returned unchanged) for the surrogate range
 * 0xD800..0xDFFF and for values of 0x110000 and above.
 */
char *unicode_to_utf8 (char *b, uint32_t Uc)
{
    /* 1 byte: 0xxxxxxx */
    if (Uc < 0x80) {
        b[0] = (char)Uc;
        return b + 1;
    }

    /* 2 bytes: 110xxxxx 10xxxxxx */
    if (Uc < 0x800) {
        b[0] = (char)(0xC0 | (Uc >> 6));
        b[1] = (char)(0x80 | (Uc & 0x3F));
        return b + 2;
    }

    /* surrogate halves are not encoded: emit nothing */
    if (Uc >= 0xD800 && Uc <= 0xDFFF)
        return b;

    /* 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx */
    if (Uc < 0x10000) {
        b[0] = (char)(0xE0 | (Uc >> 12));
        b[1] = (char)(0x80 | ((Uc >> 6) & 0x3F));
        b[2] = (char)(0x80 | (Uc & 0x3F));
        return b + 3;
    }

    /* 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
    if (Uc < 0x110000) {
        b[0] = (char)(0xF0 | (Uc >> 18));
        b[1] = (char)(0x80 | ((Uc >> 12) & 0x3F));
        b[2] = (char)(0x80 | ((Uc >> 6) & 0x3F));
        b[3] = (char)(0x80 | (Uc & 0x3F));
        return b + 4;
    }

    /* beyond the last code point handled here: emit nothing */
    return b;
}
|
||||||
|
|
||||||
|
/* add a unicode character expanded into utf8 */
|
||||||
|
void cstr_u8cat(CString *cstr, int ch)
|
||||||
|
{
|
||||||
|
unsigned char buf[4];
|
||||||
|
int size;
|
||||||
|
int add = (int)((unsigned char*)unicode_to_utf8(&buf[0],(uint32_t)ch) - &buf[0]);
|
||||||
|
unsigned char *p,*b=buf;
|
||||||
|
size = cstr->size + add;
|
||||||
|
if (size > cstr->size_allocated)
|
||||||
|
cstr_realloc(cstr, size);
|
||||||
|
for(p = (unsigned char*)cstr->data + (size - add); add; add--) *p++=*b++;
|
||||||
|
cstr->size = size;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
ST_FUNC void cstr_cat(CString *cstr, const char *str, int len)
|
ST_FUNC void cstr_cat(CString *cstr, const char *str, int len)
|
||||||
{
|
{
|
||||||
int size;
|
int size;
|
||||||
@ -2100,12 +2125,13 @@ static void parse_escape_string(CString *outstr, const uint8_t *buf, int is_long
|
|||||||
}
|
}
|
||||||
c = n;
|
c = n;
|
||||||
goto add_char_nonext;
|
goto add_char_nonext;
|
||||||
case 'x':
|
case 'x': { unsigned ucn_chars_nr = -1u; goto parse_hex_or_ucn;
|
||||||
case 'u':
|
case 'u': ucn_chars_nr = 4; goto parse_hex_or_ucn;
|
||||||
case 'U':
|
case 'U': ucn_chars_nr = 8; goto parse_hex_or_ucn;
|
||||||
|
parse_hex_or_ucn:;
|
||||||
p++;
|
p++;
|
||||||
n = 0;
|
n = 0;
|
||||||
for(;;) {
|
for(unsigned i=1;i<=ucn_chars_nr;i++) {
|
||||||
c = *p;
|
c = *p;
|
||||||
if (c >= 'a' && c <= 'f')
|
if (c >= 'a' && c <= 'f')
|
||||||
c = c - 'a' + 10;
|
c = c - 'a' + 10;
|
||||||
@ -2113,13 +2139,19 @@ static void parse_escape_string(CString *outstr, const uint8_t *buf, int is_long
|
|||||||
c = c - 'A' + 10;
|
c = c - 'A' + 10;
|
||||||
else if (isnum(c))
|
else if (isnum(c))
|
||||||
c = c - '0';
|
c = c - '0';
|
||||||
else
|
else{
|
||||||
|
if (ucn_chars_nr!=-1)
|
||||||
|
tcc_error("%u hex digits expected in universal-character-name\n", ucn_chars_nr);
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
n = n * 16 + c;
|
n = n * 16 + c;
|
||||||
p++;
|
p++;
|
||||||
}
|
}
|
||||||
c = n;
|
c = n;
|
||||||
goto add_char_nonext;
|
if(ucn_chars_nr==-1) goto add_char_nonext;
|
||||||
|
cstr_u8cat(outstr, c);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
case 'a':
|
case 'a':
|
||||||
c = '\a';
|
c = '\a';
|
||||||
break;
|
break;
|
||||||
|
|||||||
@ -1,12 +1,19 @@
|
|||||||
// this file contains BMP chars encoded in UTF-8
|
// this file contains BMP chars encoded in UTF-8
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <wchar.h>
|
#include <wchar.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
int main()
|
int main()
|
||||||
{
|
{
|
||||||
|
char hello_world_in_czech[] = "čau, světe";
|
||||||
|
char hello_world_in_czech_ucn[] = "\u010dau, sv\u011bte";
|
||||||
|
if (sizeof(hello_world_in_czech) != sizeof(hello_world_in_czech_ucn)
|
||||||
|
|| strcmp(hello_world_in_czech, hello_world_in_czech_ucn))
|
||||||
|
abort();
|
||||||
|
|
||||||
wchar_t s[] = L"hello$$你好¢¢世界€€world";
|
wchar_t s[] = L"hello$$你好¢¢世界€€world";
|
||||||
wchar_t *p;
|
wchar_t *p;
|
||||||
for (p = s; *p; p++) printf("%04X ", (unsigned) *p);
|
for (p = s; *p; p++) printf("%04X ", (unsigned) *p);
|
||||||
printf("\n");
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user