diff --git a/.gitignore b/.gitignore
index 8fe7959d..6870b8d3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -62,7 +62,6 @@ tests/*-cc*
 tests/*-tcc*
 tests/libtcc_test
 tests/libtcc_test_mt
-tests/libtcc_test_xor_rex
 tests/asm-c-connect
 tests/asm-c-connect-sep
 tests/vla_test
diff --git a/arm64-gen.c b/arm64-gen.c
index 6997082c..feb08719 100644
--- a/arm64-gen.c
+++ b/arm64-gen.c
@@ -1848,39 +1848,26 @@ ST_FUNC void gen_opl(int op)
 ST_FUNC void gen_opf(int op)
 {
     uint32_t x, a, b, dbl;
+    int bt = vtop[0].type.t & VT_BTYPE;
 
     if (op == TOK_NEG) {
-        switch (vtop[0].type.t & VT_BTYPE) {
-        case VT_LDOUBLE:
+        if (bt == VT_LDOUBLE) {
             vpush_helper_func(TOK___negtf2);
             vrott(2);
             gfunc_call(1);
             vpushi(0);
-            vtop->type.t = VT_LDOUBLE;
+            vtop->type.t = bt;
             vtop->r = REG_FRET;
-            break;
-
-        case VT_FLOAT:
-        case VT_DOUBLE:
+        } else {
             gv(RC_FLOAT);
-            dbl = (vtop[0].type.t & VT_BTYPE) == VT_DOUBLE;
-
+            dbl = bt == VT_DOUBLE;
             a = fltr(vtop[0].r);
-            vtop--;
-            x = get_reg(RC_FLOAT);
-            vtop++;
-            vtop[0].r = x;
-            x = fltr(x);
-
-            o(0x1e214000 | dbl << 22 | x | a << 5);
-            break;
-        default:
-            assert(0);
+            o(0x1e214000 | dbl << 22 | a | a << 5);
         }
         return;
     }
 
-    if (vtop[0].type.t == VT_LDOUBLE) {
+    if (bt == VT_LDOUBLE) {
         CType type = vtop[0].type;
         int func = 0;
         int cond = -1;
@@ -1912,7 +1899,7 @@ ST_FUNC void gen_opf(int op)
         return;
     }
 
-    dbl = vtop[0].type.t != VT_FLOAT;
+    dbl = bt != VT_FLOAT;
     gv2(RC_FLOAT, RC_FLOAT);
     assert(vtop[-1].r < VT_CONST && vtop[0].r < VT_CONST);
     a = fltr(vtop[-1].r);
@@ -2075,12 +2062,7 @@ ST_FUNC void gen_cvt_ftof(int t)
         gv(RC_FLOAT);
         assert(vtop[0].r < VT_CONST);
         a = fltr(vtop[0].r);
-        --vtop;
-        x = get_reg(RC_FLOAT);
-        ++vtop;
-        vtop[0].r = x;
-        x = fltr(x);
-
+        x = a;
         if (f == VT_FLOAT)
             o(0x1e22c000 | x | a << 5); // fcvt d(x),s(a)
         else
diff --git a/include/tccdefs.h b/include/tccdefs.h
index d7596ac6..45aa2c0b 100644
--- a/include/tccdefs.h
+++ b/include/tccdefs.h
@@ -191,10 +191,7 @@
 #if defined __x86_64__
 #if !defined _WIN32
     /* GCC compatible definition of va_list. */
-
-    enum __va_arg_type {
-        __va_gen_reg, __va_float_reg, __va_stack
-    };
+    /* This should be in sync with the declaration in our lib/va_list.c */
     typedef struct {
         unsigned gp_offset, fp_offset;
         union {
@@ -204,43 +201,7 @@
         char *reg_save_area;
     } __builtin_va_list[1];
 
-    static inline void *__va_arg(__builtin_va_list ap, int arg_type,
-                                 int size, int align)
-    {
-        size = (size + 7) & ~7;
-        align = (align + 7) & ~7;
-        switch ((enum __va_arg_type)arg_type) {
-        case __va_gen_reg:
-            if (ap->gp_offset + size <= 48) {
-                ap->gp_offset += size;
-                return ap->reg_save_area + ap->gp_offset - size;
-            }
-            goto use_overflow_area;
-        case __va_float_reg:
-            if (ap->fp_offset < 128 + 48) {
-                ap->fp_offset += 16;
-                if (size == 8)
-                    return ap->reg_save_area + ap->fp_offset - 16;
-                if (ap->fp_offset < 128 + 48) {
-                    double *p = (double *)(ap->reg_save_area + ap->fp_offset);
-                    p[-1] = p[0];
-                    ap->fp_offset += 16;
-                    return ap->reg_save_area + ap->fp_offset - 32;
-                }
-            }
-            goto use_overflow_area;
-        case __va_stack:
-        use_overflow_area:
-            ap->overflow_arg_area += size;
-            ap->overflow_arg_area =
-	      (char*)((long long)(ap->overflow_arg_area + align - 1) & -align);
-            return ap->overflow_arg_area - size;
-        default: /* should never happen */
-            char *a = (char *)0; *a = 0; // abort
-            return 0;
-        }
-    }
-
+    void *__va_arg(__builtin_va_list ap, int arg_type, int size, int align);
     #define __builtin_va_start(ap, last) \
        (*(ap) = *(__builtin_va_list)((char*)__builtin_frame_address(0) - 24))
     #define __builtin_va_arg(ap, t)   \
diff --git a/lib/Makefile b/lib/Makefile
index 5357e25f..9eddc526 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -59,8 +59,8 @@ $(Cbc)COMMON_O += bcheck.o
 EXTRA_O = runmain.o bt-exe.o bt-dll.o bt-log.o bcheck.o
 
 OBJ-i386 = $(I386_O) pic86.o $(LIN_O)
-OBJ-x86_64 = $(X86_64_O) $(LIN_O)
-OBJ-x86_64-osx = $(X86_64_O) $(OSX_O)
+OBJ-x86_64 = $(X86_64_O) va_list.o $(LIN_O)
+OBJ-x86_64-osx = $(X86_64_O) va_list.o $(OSX_O)
 OBJ-i386-win32 = $(I386_O) chkstk.o $(WIN_O)
 OBJ-x86_64-win32 = $(X86_64_O) chkstk.o $(WIN_O)
 OBJ-arm64 = $(ARM64_O) $(LIN_O)
diff --git a/lib/lib-arm64.c b/lib/lib-arm64.c
index 0d6bddd3..5637358e 100644
--- a/lib/lib-arm64.c
+++ b/lib/lib-arm64.c
@@ -18,14 +18,6 @@ typedef int int32_t;
 typedef unsigned uint32_t;
 typedef long long int64_t;
 typedef unsigned long long uint64_t;
-static void *memcpy(void* d, void* s, __SIZE_TYPE__ c) {
-    char *d_, *s_;
-    d_ = d; s_ = s;
-    for (__SIZE_TYPE__ i = 0; i < c; ++i) {
-        d_[i] = s_[i];
-    }
-    return d;
-}
 #else
 #include <stdint.h>
 #include <string.h>
@@ -38,29 +30,35 @@ void __clear_cache(void *beg, void *end)
 }
 #endif
 
-typedef struct {
-    uint64_t x0, x1;
+typedef union {
+    struct { uint64_t x0, x1; };
+    long double f;
 } u128_t;
 
+typedef union {
+    uint64_t x;
+    double f;
+} u64_t;
+
+typedef union {
+    uint32_t x;
+    float f;
+} u32_t;
+
 static long double f3_zero(int sgn)
 {
-    long double f;
     u128_t x = { 0, (uint64_t)sgn << 63 };
-    memcpy(&f, &x, 16);
-    return f;
+    return x.f;
 }
 
 static long double f3_infinity(int sgn)
 {
-    long double f;
     u128_t x = { 0, (uint64_t)sgn << 63 | 0x7fff000000000000 };
-    memcpy(&f, &x, 16);
-    return f;
+    return x.f;
 }
 
 static long double f3_NaN(void)
 {
-    long double f;
 #if 0
     // ARM's default NaN usually has just the top fraction bit set:
     u128_t x = {  0, 0x7fff800000000000 };
@@ -68,28 +66,31 @@ static long double f3_NaN(void)
     // GCC's library sets all fraction bits:
     u128_t x = { -1, 0x7fffffffffffffff };
 #endif
-    memcpy(&f, &x, 16);
-    return f;
+    return x.f;
 }
 
-static int fp3_convert_NaN(long double *f, int sgn, u128_t mnt)
+static int fp3_convert_NaN(long double *f, int sgn, u128_t *mnt)
 {
-    u128_t x = { mnt.x0,
-                 mnt.x1 | 0x7fff800000000000 | (uint64_t)sgn << 63 };
-    memcpy(f, &x, 16);
+    u128_t x = { mnt->x0,
+                 mnt->x1 | 0x7fff800000000000 | (uint64_t)sgn << 63 };
+    *f = x.f;
     return 1;
+#define fp3_convert_NaN(a,b,c) fp3_convert_NaN(a,b,&c)
 }
 
 static int fp3_detect_NaNs(long double *f,
-                           int a_sgn, int a_exp, u128_t a,
-                           int b_sgn, int b_exp, u128_t b)
+                           int a_sgn, int a_exp, u128_t *a,
+                           int b_sgn, int b_exp, u128_t *b)
+#define a (*a)
+#define b (*b)
 {
+#if 0
     // Detect signalling NaNs:
     if (a_exp == 32767 && (a.x0 | a.x1 << 16) && !(a.x1 >> 47 & 1))
         return fp3_convert_NaN(f, a_sgn, a);
     if (b_exp == 32767 && (b.x0 | b.x1 << 16) && !(b.x1 >> 47 & 1))
         return fp3_convert_NaN(f, b_sgn, b);
-
+#endif
     // Detect quiet NaNs:
     if (a_exp == 32767 && (a.x0 | a.x1 << 16))
         return fp3_convert_NaN(f, a_sgn, a);
@@ -97,12 +98,16 @@ static int fp3_detect_NaNs(long double *f,
         return fp3_convert_NaN(f, b_sgn, b);
 
     return 0;
+#undef a
+#undef b
+#define fp3_detect_NaNs(a,b,c,d,e,f,g) fp3_detect_NaNs(a,b,c,&d,e,f,&g)
 }
 
 static void f3_unpack(int *sgn, int32_t *exp, u128_t *mnt, long double f)
 {
     u128_t x;
-    memcpy(&x, &f, 16);
+
+    x.f = f;
     *sgn = x.x1 >> 63;
     *exp = x.x1 >> 48 & 32767;
     x.x1 = x.x1 << 16 >> 16;
@@ -110,7 +115,7 @@ static void f3_unpack(int *sgn, int32_t *exp, u128_t *mnt, long double f)
         x.x1 |= (uint64_t)1 << 48;
     else
         *exp = 1;
-    memcpy(mnt, &x, 16);
+    mnt->f = x.f;
 }
 
 static void f3_normalise(int32_t *exp, u128_t *mnt)
@@ -184,8 +189,7 @@ static long double f3_round(int sgn, int32_t exp, u128_t *x)
         return f3_infinity(sgn);
 
     x->x1 = x->x1 << 16 >> 16 | (uint64_t)exp << 48 | (uint64_t)sgn << 63;
-    memcpy(&f, x, 16);
-    return f;
+    return x->f;
 }
 
 static long double f3_add(long double fa, long double fb, int neg)
@@ -380,23 +384,20 @@ long double __divtf3(long double fa, long double fb)
 
 long double __negtf2(long double f)
 {
-    u128_t a;
-
-    memcpy(&a, &f, 16);
-    a.x1 ^= 1UL << 63;
-    memcpy(&f, &a, 16);
-
+    ((u128_t*)&f)->x1 ^= 1UL << 63;
     return f;
 }
 
 long double __extendsftf2(float f)
 {
-    long double fx;
     u128_t x;
+    u32_t u;
     uint32_t a;
     uint64_t aa;
-    memcpy(&a, &f, 4);
+
+    u.f = f, a = u.x;
     aa = a;
+
     x.x0 = 0;
     if (!(a << 1))
         x.x1 = aa << 32;
@@ -411,16 +412,17 @@ long double __extendsftf2(float f)
     } else
         x.x1 = (aa >> 31 << 63 | ((aa >> 23 & 255) + 16256) << 48 |
                 aa << 41 >> 16);
-    memcpy(&fx, &x, 16);
-    return fx;
+    return x.f;
 }
 
 long double __extenddftf2(double f)
 {
-    long double fx;
     u128_t x;
+    u64_t u;
     uint64_t a;
-    memcpy(&a, &f, 8);
+
+    u.f = f, a = u.x;
+
     x.x0 = a << 60;
     if (!(a << 1))
         x.x1 = a;
@@ -435,8 +437,7 @@ long double __extenddftf2(double f)
         x.x1 = a >> 63 << 63 | (15360 - adj + 1) << 48 | a << adj << 12 >> 16;
     } else
         x.x1 = a >> 63 << 63 | ((a >> 52 & 2047) + 15360) << 48 | a << 12 >> 16;
-    memcpy(&fx, &x, 16);
-    return fx;
+    return x.f;
 }
 
 float __trunctfsf2(long double f)
@@ -444,11 +445,10 @@ float __trunctfsf2(long double f)
     u128_t mnt;
     int32_t exp;
     int sgn;
-    uint32_t x;
-    float fx;
+    u32_t x;
+#define x x.x
 
     f3_unpack(&sgn, &exp, &mnt, f);
-
     if (exp == 32767 && (mnt.x0 | mnt.x1 << 16))
         x = 0x7fc00000 | (uint32_t)sgn << 31 | (mnt.x1 >> 25 & 0x007fffff);
     else if (exp > 16510)
@@ -466,8 +466,8 @@ float __trunctfsf2(long double f)
             x += 4;
         x = ((x >> 2) + (exp << 23)) | (uint32_t)sgn << 31;
     }
-    memcpy(&fx, &x, 4);
-    return fx;
+#undef x
+    return x.f;
 }
 
 double __trunctfdf2(long double f)
@@ -475,11 +475,10 @@ double __trunctfdf2(long double f)
     u128_t mnt;
     int32_t exp;
     int sgn;
-    uint64_t x;
-    double fx;
+    u64_t x;
+#define x x.x
 
     f3_unpack(&sgn, &exp, &mnt, f);
-
     if (exp == 32767 && (mnt.x0 | mnt.x1 << 16))
         x = (0x7ff8000000000000 | (uint64_t)sgn << 63 |
              mnt.x1 << 16 >> 12 | mnt.x0 >> 60);
@@ -498,8 +497,8 @@ double __trunctfdf2(long double f)
             x += 4;
         x = ((x >> 2) + ((uint64_t)exp << 52)) | (uint64_t)sgn << 63;
     }
-    memcpy(&fx, &x, 8);
-    return fx;
+#undef x
+    return x.f;
 }
 
 int32_t __fixtfsi(long double fa)
@@ -564,7 +563,6 @@ long double __floatsitf(int32_t a)
     int exp = 16414;
     uint32_t mnt = a;
     u128_t x = { 0, 0 };
-    long double f;
     int i;
     if (a) {
         if (a < 0) {
@@ -579,8 +577,7 @@ long double __floatsitf(int32_t a)
         x.x1 = ((uint64_t)sgn << 63 | (uint64_t)exp << 48 |
                 (uint64_t)(mnt << 1) << 16);
     }
-    memcpy(&f, &x, 16);
-    return f;
+    return x.f;
 }
 
 long double __floatditf(int64_t a)
@@ -589,7 +586,6 @@ long double __floatditf(int64_t a)
     int exp = 16446;
     uint64_t mnt = a;
     u128_t x = { 0, 0 };
-    long double f;
     int i;
     if (a) {
         if (a < 0) {
@@ -604,8 +600,7 @@ long double __floatditf(int64_t a)
         x.x0 = mnt << 49;
         x.x1 = (uint64_t)sgn << 63 | (uint64_t)exp << 48 | mnt << 1 >> 16;
     }
-    memcpy(&f, &x, 16);
-    return f;
+    return x.f;
 }
 
 long double __floatunsitf(uint32_t a)
@@ -613,7 +608,6 @@ long double __floatunsitf(uint32_t a)
     int exp = 16414;
     uint32_t mnt = a;
     u128_t x = { 0, 0 };
-    long double f;
     int i;
     if (a) {
         for (i = 16; i; i >>= 1)
@@ -623,8 +617,7 @@ long double __floatunsitf(uint32_t a)
             }
         x.x1 = (uint64_t)exp << 48 | (uint64_t)(mnt << 1) << 16;
     }
-    memcpy(&f, &x, 16);
-    return f;
+    return x.f;
 }
 
 long double __floatunditf(uint64_t a)
@@ -643,15 +636,14 @@ long double __floatunditf(uint64_t a)
         x.x0 = mnt << 49;
         x.x1 = (uint64_t)exp << 48 | mnt << 1 >> 16;
     }
-    memcpy(&f, &x, 16);
-    return f;
+    return x.f;
 }
 
 static int f3_cmp(long double fa, long double fb)
 {
     u128_t a, b;
-    memcpy(&a, &fa, 16);
-    memcpy(&b, &fb, 16);
+    a.f = fa;
+    b.f = fb;
     return (!(a.x0 | a.x1 << 1 | b.x0 | b.x1 << 1) ? 0 :
             ((a.x1 << 1 >> 49 == 0x7fff && (a.x0 | a.x1 << 16)) ||
              (b.x1 << 1 >> 49 == 0x7fff && (b.x0 | b.x1 << 16))) ? 2 :
diff --git a/lib/va_list.c b/lib/va_list.c
new file mode 100644
index 00000000..1fb55127
--- /dev/null
+++ b/lib/va_list.c
@@ -0,0 +1,67 @@
+/* va_list.c - tinycc support for va_list on X86_64 */
+
+#if defined __x86_64__
+
+/* Avoid include files, they may not be available when cross compiling */
+extern void abort(void);
+
+/* This should be in sync with our include/stdarg.h */
+enum __va_arg_type {
+    __va_gen_reg, __va_float_reg, __va_stack
+};
+
+/* GCC compatible definition of va_list. */
+/*predefined by TCC (tcc_predefs.h):
+typedef struct {
+    unsigned int gp_offset;
+    unsigned int fp_offset;
+    union {
+        unsigned int overflow_offset;
+        char *overflow_arg_area;
+    };
+    char *reg_save_area;
+} __builtin_va_list[1];
+*/
+
+extern void *memcpy(void *dest, const void *src, unsigned long n);
+
+void *__va_arg(__builtin_va_list ap,
+               int arg_type,
+               int size, int align)
+{
+    size = (size + 7) & ~7;
+    align = (align + 7) & ~7;
+    switch ((enum __va_arg_type)arg_type) {
+    case __va_gen_reg:
+        if (ap->gp_offset + size <= 48) {
+            ap->gp_offset += size;
+            return ap->reg_save_area + ap->gp_offset - size;
+        }
+        goto use_overflow_area;
+
+    case __va_float_reg:
+        if (ap->fp_offset < 128 + 48) {
+            ap->fp_offset += 16;
+            if (size == 8)
+                return ap->reg_save_area + ap->fp_offset - 16;
+            if (ap->fp_offset < 128 + 48) {
+                memcpy(ap->reg_save_area + ap->fp_offset - 8,
+                       ap->reg_save_area + ap->fp_offset, 8);
+                ap->fp_offset += 16;
+                return ap->reg_save_area + ap->fp_offset - 32;
+            }
+        }
+        goto use_overflow_area;
+
+    case __va_stack:
+    use_overflow_area:
+        ap->overflow_arg_area += size;
+        ap->overflow_arg_area = (char*)((long long)(ap->overflow_arg_area + align - 1) & -align);
+        return ap->overflow_arg_area - size;
+
+    default: /* should never happen */
+        abort();
+        return 0;
+    }
+}
+#endif
diff --git a/libtcc.c b/libtcc.c
index 58942e47..ce3a9796 100644
--- a/libtcc.c
+++ b/libtcc.c
@@ -791,7 +791,7 @@ ST_FUNC int tcc_open(TCCState *s1, const char *filename)
 }
 
 /* compile the file opened in 'file'. Return non zero if errors. */
-static int tcc_compile(TCCState *s1, int filetype, const char *str, int fd, const char *filename)
+static int tcc_compile(TCCState *s1, int filetype, const char *str, int fd)
 {
     /* Here we enter the code section where we use the global variables for
        parsing and code generation (tccpp.c, tccgen.c, <target>-gen.c).
@@ -807,16 +807,8 @@ static int tcc_compile(TCCState *s1, int filetype, const char *str, int fd, cons
 
         if (fd == -1) {
             int len = strlen(str);
-            tcc_open_bf(s1, filename ? filename : "<string>", len);
+            tcc_open_bf(s1, "<string>", len);
             memcpy(file->buffer, str, len);
-	    if (s1->do_debug && filename) {
-		FILE *fp = fopen(filename, "w");
-
-		if (fp) {
-		    fputs(str, fp);
-		    fclose(fp);
-		}
-	    }
         } else {
             tcc_open_bf(s1, str, 0);
             file->fd = fd;
@@ -846,12 +838,7 @@ static int tcc_compile(TCCState *s1, int filetype, const char *str, int fd, cons
 
 LIBTCCAPI int tcc_compile_string(TCCState *s, const char *str)
 {
-    return tcc_compile(s, s->filetype, str, -1, NULL);
-}
-
-LIBTCCAPI int tcc_compile_string_file(TCCState *s, const char *str, const char *filename)
-{
-    return tcc_compile(s, s->filetype, str, -1, filename);
+    return tcc_compile(s, s->filetype, str, -1);
 }
 
 /* define a preprocessor symbol. value can be NULL, sym can be "sym=val" */
@@ -1246,7 +1233,7 @@ ST_FUNC int tcc_add_file_internal(TCCState *s1, const char *filename, int flags)
         return tcc_add_binary(s1, flags, filename, fd);
 
     dynarray_add(&s1->target_deps, &s1->nb_target_deps, tcc_strdup(filename));
-    return tcc_compile(s1, flags, filename, fd, NULL);
+    return tcc_compile(s1, flags, filename, fd);
 }
 
 LIBTCCAPI int tcc_add_file(TCCState *s, const char *filename)
@@ -1585,30 +1572,6 @@ enum {
 #define TCC_OPTION_HAS_ARG 0x0001
 #define TCC_OPTION_NOSEP   0x0002 /* cannot have space before option and arg */
 
-/*
- * in tcc_options, if opt-string A is a prefix of opt-string B,
- * it's un-ambiguous if and only if option A is without TCC_OPTION_HAS_ARG.
- * otherwise (A with HAS_ARG), if, for instance, A is FOO and B is FOOBAR,
- * then "-FOOBAR" is either A with arg BAR, or B (-FOOBARX too, if B HAS_ARG).
- *
- * tcc_parse_args searches tcc_options in order, so if ambiguous:
- * - if the shorter (A) is earlier: the longer (B) is completely unreachable.
- * - else B wins, and A can't be used with adjacent arg if it also matches B.
- *
- * there are few clashes currently, and the longer is always earlier/reachable.
- * when it's ambiguous, shorter-concat-arg is not useful currently.
- * the sh(1) script 'optclash' can identifiy clashes (tcc root dir, try "-h").
- * at the time of writing, running './optclash' prints this:
-
-    -Wl,... (1642) overrides -W... (1644)
-    -Wp,... (1643) overrides -W... (1644)
-    -dumpmachine (1630) overrides -d... (1632)
-    -dumpversion (1631) overrides -d... (1632)
-    -dynamiclib (1623) overrides -d... (1632)
-    -flat_namespace (1624) overrides -f... (1650)
-    -mfloat-abi... (1647) overrides -m... (1649)
-
- */
 static const TCCOption tcc_options[] = {
     { "h", TCC_OPTION_HELP, 0 },
     { "-help", TCC_OPTION_HELP, 0 },
@@ -1848,27 +1811,6 @@ static void args_parser_add_file(TCCState *s, const char* filename, int filetype
         ++s->nb_libraries;
 }
 
-/*  parsing is between getopt(3) and getopt_long(3), and permuting-like:
- *  - an option is 1 or more chars.
- *  - at most 1 option per arg in argv.
- *  - an option in argv is "-OPT[...]" (few are --OPT, if OPT is "-...").
- *  - optarg is next arg, or adjacent non-empty (no '='. -std=.. is arg "=..").
- *  - supports also adjacent-only optarg (typically optional).
- *  - supports mixed options and operands ("--" is ignored, except with -run).
- *  - -OPT[...] can be ambiguous, which is resolved using tcc_options's order.
- *    (see tcc_options for details)
- *
- *  specifically, per arg of argv, in order:
- *  - if arg begins with '@' and is not exactly "@": process as @listfile.
- *  - elif arg is exactly "-" or doesn't begin with '-': process as input file.
- *    - if -run... is already set: also stop, arg... become argv of run_main.
- *  - elif arg is "--":
- *    - if -run... is already set: stop, arg... become argv of run_main.
- *    - else ignore it.
- *  - else ("-STRING") try to apply it as option, maybe with next (opt)arg.
- *
- *  after all args, if -run... but no "stop": run_main gets our argv (tcc ...)
- */
 /* using * to argc/argv to let "tcc -ar" benefit from @listfile expansion */
 PUB_FUNC int tcc_parse_args(TCCState *s, int *pargc, char ***pargv)
 {
@@ -2217,7 +2159,7 @@ unsupported_option:
     if (run) {
         if (*run && tcc_set_options(s, run) < 0)
             return -1;
-        x = 0;
+        x = 0, r = 0;
         goto extra_action;
     }
     if (!empty)
diff --git a/libtcc.h b/libtcc.h
index 11cd96cd..5949c807 100644
--- a/libtcc.h
+++ b/libtcc.h
@@ -105,18 +105,6 @@ LIBTCCAPI void tcc_list_symbols(TCCState *s, void *ctx,
 LIBTCCAPI void *_tcc_setjmp(TCCState *s1, void *jmp_buf, void *top_func, void *longjmp);
 #define tcc_setjmp(s1,jb,f) setjmp(_tcc_setjmp(s1, jb, f, longjmp))
 
-/* debugging */
-/* For debugging to work you have to enable it with tcc_set_options */
-
-/* compile a string containing a C source. Return -1 if error.
-   Write the string to file filename if debug is set. */
-LIBTCCAPI int tcc_compile_string_file(TCCState *s, const char *buf, const char *filename);
-
-/* Output object file. This must be done after tcc_relocate.
-   It only generates the file if debug is set.
-   The filename can be loaded with gdb command add-symbol-file */
-LIBTCCAPI int elf_output_obj(TCCState *s1, const char *filename);
-
 /* custom error printer for runtime exceptions. Returning 0 stops backtrace */
 typedef int TCCBtFunc(void *udata, void *pc, const char *file, int line, const char* func, const char *msg);
 LIBTCCAPI void tcc_set_backtrace_func(TCCState *s1, void* userdata, TCCBtFunc*);
diff --git a/optclash b/optclash
deleted file mode 100755
index 92015fbc..00000000
--- a/optclash
+++ /dev/null
@@ -1,38 +0,0 @@
-#!/bin/sh
-
-export LC_ALL=C
-
-defname=libtcc.c
-
-# $1 is the line number, $2... is the actual source line
-extract_opts() { awk '/tcc_options\[\]/ {x=1}; x; x && $2~/^}/ {exit}'; }
-
-case $1 in -h|--help)
-    echo "Usage: $0 [INFILE]"
-    echo "Detect tcc_options[] clashes in $defname-like INFILE."
-    echo "If INFILE is missing, use $defname at the same dir as this script."
-    echo "Clashes are reported as longer-overrides, or longer-unreachable."
-    exit
-esac
-
-f=${1-$(dirname "$0")/$defname}
-[ -r "$f" ] || { >&2 echo "$0: can't read -- $f"; exit 1; }
-
-nl -b a <"$f" | extract_opts | tr \" ' ' | awk '$2=="{"' | sort -b -k 3 |
-# "<line-num> { <unquoted-opt> <rest-of-line>"  sorted-up by opt
-# unavoidable O(N^2). the sort simplifies the awk code - only test prior opts.
-awk '
-    {
-        n=$1; opt=$3; h=/HAS_ARG/
-        for (pn in prevs) {  # pn: line-num
-            po = prevs[pn]   # po: opt-with-has-arg
-            if (index(opt,po) == 1) {
-                clash=1
-                printf("-%s%s (%d) %s -%s... (%s)\n", opt,h?"...":"",n,
-                       n>pn? "is not reachable! by":"overrides", po,pn)
-            }
-        }
-    }
-    h   {prevs[n] = opt}
-    END {if (clash) exit 1; print "no clashes"}
-'
diff --git a/riscv64-link.c b/riscv64-link.c
index b25321f5..edcf9b54 100644
--- a/riscv64-link.c
+++ b/riscv64-link.c
@@ -175,37 +175,23 @@ ST_FUNC void relocate_plt(TCCState *s1)
 
 static void riscv64_record_pcrel_hi(TCCState *s1, addr_t addr, addr_t val)
 {
-    int n = s1->nb_pcrel_hi_entries;
-    if (n >= s1->alloc_pcrel_hi_entries) {
-        int new_alloc = s1->alloc_pcrel_hi_entries ? s1->alloc_pcrel_hi_entries * 2 : 64;
-        s1->pcrel_hi_entries = tcc_realloc(s1->pcrel_hi_entries,
-            new_alloc * sizeof(*s1->pcrel_hi_entries));
-        s1->alloc_pcrel_hi_entries = new_alloc;
-    }
-    s1->pcrel_hi_entries[n].addr = addr;
-    s1->pcrel_hi_entries[n].val = val;
-    s1->nb_pcrel_hi_entries = n + 1;
-    last_hi.addr = addr;
-    last_hi.val = val;
+    struct pcrel_hi *entry = tcc_malloc(sizeof *entry);
+    entry->addr = addr;
+    entry->val = val;
+    dynarray_add(&s1->pcrel_hi_entries, &s1->nb_pcrel_hi_entries, entry);
 }
 
 static int riscv64_lookup_pcrel_hi(TCCState *s1, addr_t hi_addr, addr_t *hi_val)
 {
     int i;
-    struct pcrel_hi *entry;
-    if (s1->nb_pcrel_hi_entries && hi_addr == last_hi.addr) {
-        *hi_val = last_hi.val;
-        return 1;
-    }
-    for (i = s1->nb_pcrel_hi_entries - 1; i >= 0; --i) {
-        entry = &s1->pcrel_hi_entries[i];
+    for (i = s1->nb_pcrel_hi_entries; i > 0; ) {
+        struct pcrel_hi *entry = s1->pcrel_hi_entries[--i];
         if (entry->addr == hi_addr) {
-            last_hi = *entry;
             *hi_val = entry->val;
-            return 1;
+            return 0;
         }
     }
-    return 0;
+    return tcc_error_noabort("unsupported hi/lo pcrel reloc scheme");
 }
 
 ST_FUNC void relocate(TCCState *s1, ElfW_Rel *rel, int type, unsigned char *ptr,
@@ -279,15 +265,13 @@ ST_FUNC void relocate(TCCState *s1, ElfW_Rel *rel, int type, unsigned char *ptr,
         printf("PCREL_LO12_I: val=%lx addr=%lx\n", (long)val, (long)addr);
 #endif
         addr = val;
-        if (!riscv64_lookup_pcrel_hi(s1, addr, &val))
-          tcc_error_noabort("unsupported hi/lo pcrel reloc scheme");
+        riscv64_lookup_pcrel_hi(s1, addr, &val);
         write32le(ptr, (read32le(ptr) & 0xfffff)
                        | (((val - addr) & 0xfff) << 20));
         return;
     case R_RISCV_PCREL_LO12_S:
         addr = val;
-        if (!riscv64_lookup_pcrel_hi(s1, addr, &val))
-          tcc_error_noabort("unsupported hi/lo pcrel reloc scheme");
+        riscv64_lookup_pcrel_hi(s1, addr, &val);
         off32 = val - addr;
         write32le(ptr, (read32le(ptr) & ~0xfe000f80)
                        | ((off32 & 0xfe0) << 20)
diff --git a/tcc.h b/tcc.h
index e7a2f1e2..226bfa9b 100644
--- a/tcc.h
+++ b/tcc.h
@@ -938,11 +938,8 @@ struct TCCState {
     #define qrel s1->qrel
 
 #ifdef TCC_TARGET_RISCV64
-    struct pcrel_hi { addr_t addr, val; } last_hi;
-    struct pcrel_hi *pcrel_hi_entries;
+    struct pcrel_hi { addr_t addr, val; } **pcrel_hi_entries;
     int nb_pcrel_hi_entries;
-    int alloc_pcrel_hi_entries;
-    #define last_hi s1->last_hi
 #endif
 
 #ifdef TCC_TARGET_PE
diff --git a/tccelf.c b/tccelf.c
index b71c6f2b..122295d1 100644
--- a/tccelf.c
+++ b/tccelf.c
@@ -145,9 +145,6 @@ ST_FUNC void tccelf_delete(TCCState *s1)
     dynarray_reset(&s1->priv_sections, &s1->nb_priv_sections);
 
     tcc_free(s1->sym_attrs);
-#ifdef TCC_TARGET_RISCV64
-    tcc_free(s1->pcrel_hi_entries);
-#endif
     symtab_section = NULL; /* for tccrun.c:rt_printline() */
 }
 
@@ -1130,10 +1127,6 @@ static void relocate_section(TCCState *s1, Section *s, Section *sr)
     addr_t tgt, addr;
     int is_dwarf = s->sh_num >= s1->dwlo && s->sh_num < s1->dwhi;
 
-#ifdef TCC_TARGET_RISCV64
-    s1->nb_pcrel_hi_entries = 0;
-#endif
-
     qrel = (ElfW_Rel *)sr->data;
     for_each_elem(sr, 0, rel, ElfW_Rel) {
 	if (s->data == NULL) /* bss */
@@ -1155,6 +1148,7 @@ static void relocate_section(TCCState *s1, Section *s, Section *sr)
         addr = s->sh_addr + rel->r_offset;
         relocate(s1, rel, type, ptr, addr, tgt);
     }
+
 #ifndef ELF_OBJ_ONLY
     /* if the relocation is allocated, we change its symbol table */
     if (sr->sh_flags & SHF_ALLOC) {
@@ -1172,6 +1166,10 @@ static void relocate_section(TCCState *s1, Section *s, Section *sr)
         }
     }
 #endif
+
+#ifdef TCC_TARGET_RISCV64
+    dynarray_reset(&s1->pcrel_hi_entries, &s1->nb_pcrel_hi_entries);
+#endif
 }
 
 /* relocate all sections */
@@ -3103,7 +3101,7 @@ static void alloc_sec_names(TCCState *s1, int is_obj)
 }
 
 /* Output an elf .o file */
-LIBTCCAPI int elf_output_obj(TCCState *s1, const char *filename)
+static int elf_output_obj(TCCState *s1, const char *filename)
 {
     Section *s;
     int i, ret, file_offset;
diff --git a/tccgen.c b/tccgen.c
index 50802edf..75be059a 100644
--- a/tccgen.c
+++ b/tccgen.c
@@ -2526,6 +2526,14 @@ void gen_negf(int op)
 
     size = type_size(&vtop->type, &align);
     bt = vtop->type.t & VT_BTYPE;
+#if defined TCC_TARGET_X86_64 || defined TCC_TARGET_I386
+    /* sizeof long double is 12 or 16 here, but it's really the 80bit
+       extended float format.  */
+    if (bt == VT_LDOUBLE)
+        size = 10;
+#endif
+    if (nocode_wanted) /* save_reg() won't work */
+        goto gv2;
     save_reg(gv(RC_TYPE(bt)));
     vdup();
     incr_bf_adr(size - 1);
@@ -2534,6 +2542,8 @@ void gen_negf(int op)
     gen_op('^');
     vstore();
     vpop();
+gv2:
+    gv(RC_TYPE(bt)); /* -n is not an lvalue */
 }
 #endif
 
@@ -3097,9 +3107,7 @@ op_err:
 #endif
             type1 = vtop[-1].type;
             vpush_type_size(pointed_type(&vtop[-1].type), &align);
-            if (!(vtop[-1].type.t & VT_UNSIGNED)) {
-                gen_cast_s(VT_PTRDIFF_T);
-            }
+            vtop->type.t &= ~VT_UNSIGNED;
             gen_op('*');
 #ifdef CONFIG_TCC_BCHECK
             if (tcc_state->do_bounds_check && !CONST_WANTED) {
diff --git a/tccpp.c b/tccpp.c
index 269b3b41..4dd163fe 100644
--- a/tccpp.c
+++ b/tccpp.c
@@ -2475,7 +2475,7 @@ static void parse_number(const char *p)
             }
         }
     } else {
-        unsigned long long n = 0, n1 = 0;
+        unsigned long long n, n1;
         int lcount, ucount, ov = 0;
         const char *p1;
 
@@ -2486,6 +2486,7 @@ static void parse_number(const char *p)
             b = 8;
             q++;
         }
+        n = 0;
         while(1) {
             t = *q++;
             /* no need for checks except for base 10 / 8 errors */
@@ -2499,14 +2500,11 @@ static void parse_number(const char *p)
                 t = t - '0';
             if (t >= b)
                 tcc_error("invalid digit");
+            n1 = n;
             n = n * b + t;
-            if (!ov) {
-                /* detect overflow */
-                if (n1 >= 0x1000000000000000ULL && n / b != n1)
-                    ov = 1;
-                else
-                    n1 = n;
-	    }
+            /* detect overflow */
+            if (n1 >= 0x1000000000000000ULL && n / b != n1)
+                ov = 1;
         }
 
         /* Determine the characteristics (unsigned and/or 64bit) the type of
@@ -2556,26 +2554,7 @@ static void parse_number(const char *p)
         }
 
         if (ov)
-            /* Give a warning with values in case of an overflow. This helps to
-               spot the 0 too much in 0x8000'0000'0000'0000'0. It may even be
-               better to use a 0x8000'0000'0000'0000 from n1 instead of a 0 from
-               n after the overflow. This is at least undefined behavior.
-               The C99 to C23 standards state:
-               "Each constant shall have a type and the value of a constant
-               shall be in the range of representable values for its type."
-               "If an integer constant cannot be represented by any type ...,
-               then the integer constant has no type."
-               The C++ standards state:
-               "A program is ill-formed if one of its translation units contains
-               an integer-literal that cannot be represented by any of the
-               allowed types." */
-            tcc_warning(
-                b == 8
-                ? "integer constant overflow, using %#llo; did you mean %#llo?"
-                : b == 10
-                ? "integer constant overflow, using %llu, did you mean %llu?"
-                : "integer constant overflow, using %#llx; did you mean %#llx?",
-                n, n1);
+            tcc_warning("integer constant overflow");
 
         tok = TOK_CINT;
 	if (lcount) {
diff --git a/tccrun.c b/tccrun.c
index 71b34b1a..a01faee7 100644
--- a/tccrun.c
+++ b/tccrun.c
@@ -260,56 +260,22 @@ LIBTCCAPI int tcc_run(TCCState *s1, int argc, char **argv)
 }
 
 /* ------------------------------------------------------------- */
-/* remove all STB_LOCAL symbols. When do_debug is set, cleanup_sections()
-   keeps the relocation sections alive for a later elf_output_obj() call;
-   the r_info indices in those rela entries refer to the pre-cleanup
-   symbol numbering, so we must build an old->new map and rewrite them.
-   Without that, sort_syms() in elf_output_obj() allocates an
-   old_to_new_syms[] array sized for the post-cleanup (smaller) symtab
-   and indexes it with the stale (larger) sym_index values, reading off
-   the end of the array. */
+/* remove all STB_LOCAL symbols: reset the symtab, then re-add every non-local symbol */
 static void cleanup_symbols(TCCState *s1)
 {
     Section *s = s1->symtab;
     int sym_index, end_sym = s->data_offset / sizeof (ElfSym);
-    int *old_to_new = s1->do_debug
-        ? tcc_mallocz(end_sym * sizeof(int))
-        : NULL;
     /* reset symtab */
     s->data_offset = s->link->data_offset = s->hash->data_offset = 0;
     init_symtab(s);
-    /* add global symbols again, recording the new index of each */
+    /* add global symbols again */
     for (sym_index = 1; sym_index < end_sym; ++sym_index) {
         ElfW(Sym) *sym = &((ElfW(Sym) *)s->data)[sym_index];
         const char *name = (char *)s->link->data + sym->st_name;
-        int new_idx;
         if (ELFW(ST_BIND)(sym->st_info) == STB_LOCAL)
             continue;
         //printf("sym %s\n", name);
-        new_idx = put_elf_sym(s, sym->st_value, sym->st_size,
-            sym->st_info, sym->st_other, sym->st_shndx, name);
-        if (old_to_new)
-            old_to_new[sym_index] = new_idx;
-    }
-    if (old_to_new) {
-        int i;
-        for (i = 1; i < s1->nb_sections; i++) {
-            Section *sr = s1->sections[i];
-            ElfW_Rel *rel;
-            if (sr->sh_type != SHT_RELX || sr->link != s)
-                continue;
-            for_each_elem(sr, 0, rel, ElfW_Rel) {
-                int old = ELFW(R_SYM)(rel->r_info);
-                int type = ELFW(R_TYPE)(rel->r_info);
-                /* Locals (and the undef sym at 0) map to 0 by
-                   tcc_mallocz; relocations against dropped locals now
-                   refer to SHN_UNDEF, which is the best we can do
-                   without preserving the locals themselves. */
-                int new_idx = (old > 0 && old < end_sym) ? old_to_new[old] : 0;
-                rel->r_info = ELFW(R_INFO)(new_idx, type);
-            }
-        }
-        tcc_free(old_to_new);
+        put_elf_sym(s, sym->st_value, sym->st_size, sym->st_info, sym->st_other, sym->st_shndx, name);
     }
 }
 
@@ -321,7 +287,7 @@ static void cleanup_sections(TCCState *s1)
     do {
         for (i = --f; i < p->nb_secs; i++) {
             Section *s = p->secs[i];
-            if (s1->do_debug || s == s1->symtab || s == s1->symtab->link || s == s1->symtab->hash) {
+            if (s == s1->symtab || s == s1->symtab->link || s == s1->symtab->hash) {
                 s->data = tcc_realloc(s->data, s->data_allocated = s->data_offset);
             } else {
                 free_section(s), tcc_free(s), p->secs[i] = NULL;
@@ -333,11 +299,10 @@ static void cleanup_sections(TCCState *s1)
 /* ------------------------------------------------------------- */
 /* 0 = .text rwx  other rw (memory >= 2 pages a 4096 bytes) */
 /* 1 = .text rx   other rw (memory >= 3 pages) */
-/* 2 = .debug    .debug ro (optional) */
-/* 3 = .text rx  .rdata ro  .data/.bss rw (memory >= 4 pages) */
+/* 2 = .text rx  .rdata ro  .data/.bss rw (memory >= 4 pages) */
 
 /* Some targets implement secutiry options that do not allow write in
-   executable code. These targets need CONFIG_RUNMEM_RO=2.
+   executable code. These targets need CONFIG_RUNMEM_RO=1.
    The disadvantage of this is that it requires a little bit more memory. */
 
 #ifndef CONFIG_RUNMEM_RO
@@ -378,13 +343,12 @@ redo:
     if (copy == 3)
         return 0;
 
-    for (k = 0; k < 4; ++k) { /* 0:rx, 1:ro, 2:ro debug , 3:rw sections */
+    for (k = 0; k < 3; ++k) { /* 0:rx, 1:ro, 2:rw sections */
         n = 0; addr = 0;
         for(i = 1; i < s1->nb_sections; i++) {
             static const char shf[] = {
-                SHF_ALLOC|SHF_EXECINSTR, SHF_ALLOC, 0, SHF_ALLOC|SHF_WRITE
+                SHF_ALLOC|SHF_EXECINSTR, SHF_ALLOC, SHF_ALLOC|SHF_WRITE
                 };
-	    if (k == 2 && s1->do_debug == 0) continue;
             s = s1->sections[i];
             if (shf[k] != (s->sh_flags & (SHF_ALLOC|SHF_WRITE|SHF_EXECINSTR)))
                 continue;
diff --git a/tests/Makefile b/tests/Makefile
index 4611e11c..6e0f3bd6 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -13,7 +13,6 @@ TESTS = \
  hello-run \
  libtest \
  libtest_mt \
- libtest_xor_rex \
  test3 \
  abitest \
  asm-c-connect-test \
@@ -41,9 +40,6 @@ endif
 ifeq (,$(filter i386 x86_64,$(ARCH)))
  TESTS := $(filter-out asm-c-connect-test,$(TESTS))
 endif
-ifeq (,$(filter x86_64,$(ARCH)))
- TESTS := $(filter-out libtest_xor_rex,$(TESTS))
-endif
 ifeq ($(OS),Windows_NT) # for libtcc_test to find libtcc.dll
  PATH := $(CURDIR)/$(TOP)$(if $(findstring ;,$(PATH)),;,:)$(PATH)
 endif
@@ -107,9 +103,6 @@ libtcc_test$(EXESUF): libtcc_test.c
 libtcc_test_mt$(EXESUF): libtcc_test_mt.c
 	$(CC) -o $@ $< $(CFLAGS) $(-LTCC) $(LIBS)
 
-libtcc_test_xor_rex$(EXESUF): libtcc_test_xor_rex.c
-	$(CC) -o $@ $< $(CFLAGS) $(-LTCC) $(LIBS)
-
 %-dir:
 	@echo ------------ $@ ------------
 	$(MAKE) -k -C $*
diff --git a/tests/libtcc_debug.c b/tests/libtcc_debug.c
deleted file mode 100644
index eff6d2fe..00000000
--- a/tests/libtcc_debug.c
+++ /dev/null
@@ -1,58 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include "libtcc.h"
-
-static const char program[] =
-"#include <stdio.h>\n"
-"int fib(int n)\n"
-"{\n"
-"    if (n <= 2)\n"
-"        return 1;\n"
-"    else\n"
-"        return fib(n-1) + fib(n-2);\n"
-"}\n"
-"int tst(void)\n"
-"{\n"
-"   int i;\n"
-"   for (i = 2; i < 20; i++)\n"
-"     printf(\"%d \", fib(i));\n"
-"   printf(\"\\n\");\n"
-"   return 0;\n"
-"}\n";
-
-void handle_error(void *opaque, const char *msg)
-{
-    fprintf(opaque, "%s\n", msg);
-}
-
-int
-main(void)
-{
-    int (*func)(void);
-    TCCState *s = tcc_new();
-
-    if (!s) {
-        fprintf(stderr, __FILE__ ": could not create tcc state\n");
-        return 1;
-    }
-#if 1
-    /* If -g option is not set the debugging files tst.c en tst.o will
-       not be created. */
-    tcc_set_options(s, "-g");
-#endif
-    tcc_set_error_func(s, stdout, handle_error);
-    tcc_set_output_type(s, TCC_OUTPUT_MEMORY);
-    if (tcc_compile_string_file(s, program, "tst.c") == -1)
-	return 1;
-    if (tcc_relocate(s) < 0)
-        return 1;
-    elf_output_obj(s, "tst.o");
-    /* set breakpoint on next line. and load symbol file with
-       gdb command add-symbol-file.
-       Then set breakpoint on tst and continue. */
-    if ((func = tcc_get_symbol(s, "tst")))
-        func();
-    tcc_delete(s);
-    return 0;
-}
diff --git a/tests/libtcc_test_xor_rex.c b/tests/libtcc_test_xor_rex.c
deleted file mode 100644
index 89bbb1a3..00000000
--- a/tests/libtcc_test_xor_rex.c
+++ /dev/null
@@ -1,128 +0,0 @@
-/*
- * Test for x86_64 xor REX prefix bug in load() -- x86_64-gen.c
- *
- * Bug: when loading a 64-bit zero constant into registers r8-r15,
- * load() emits:
- *
- *     o(0xc031 + REG_VALUE(r) * 0x900);   // xor r, r
- *
- * REG_VALUE(r) masks to (r & 7), losing bit 3, and no orex() call
- * emits the REX prefix needed for extended registers.
- *
- * Result:  r=TREG_R11(11) -> REG_VALUE=3 -> emits 31 db (xor ebx,ebx)
- * Correct: should emit 45 31 db (REX.RB xor r11d,r11d)
- *
- * Fix:
- *     orex(0, r, r, 0x31);
- *     o(0xc0 + REG_VALUE(r) * 9);
- *
- * Trigger: an indirect call through a compile-time null function pointer,
- * e.g. ((void(*)(void))0)(), causes gcall_or_jmp() to fall into the
- * "indirect call" path which does load(TREG_R11, <constant 0>).
- *
- * This test compiles such code via the libtcc API, then inspects the
- * generated machine code for the incorrect encoding.
- */
-
-#if !defined __x86_64__
-#include <stdio.h>
-int main(void) { printf("SKIP (x86_64 only)\n"); return 0; }
-#else
-
-#include <stdio.h>
-#include "libtcc.h"
-
-static void handle_error(void *opaque, const char *msg)
-{
-    fprintf(opaque, "%s\n", msg);
-}
-
-/*
- * Compiled via libtcc.  The cast-to-null indirect call forces
- * gcall_or_jmp() into its "else" branch (no VT_SYM, so the
- * condition on line ~650 fails), which does:
- *
- *     r = TREG_R11;
- *     load(r, vtop);          // <-- buggy xor lands here
- *     o(0x41); o(0xff);       // call/jmp *r
- *     o(0xd0 + REG_VALUE(r)); // r11 -> 0xd3
- *
- * We never execute the function (it would crash); we only
- * inspect the generated bytes.
- */
-static const char test_code[] =
-    "void test(void) {\n"
-    "    ((void(*)(void))0)();\n"
-    "}\n";
-
-int main(int argc, char **argv)
-{
-    TCCState *s;
-    unsigned char *code;
-    int i;
-    int ret = 0;
-
-    s = tcc_new();
-    if (!s) {
-        fprintf(stderr, "tcc_new() failed\n");
-        return 2;
-    }
-    tcc_set_error_func(s, stderr, handle_error);
-
-    for (i = 1; i < argc; ++i) {
-        char *a = argv[i];
-        if (a[0] == '-') {
-            if (a[1] == 'B')
-                tcc_set_lib_path(s, a + 2);
-            else if (a[1] == 'I')
-                tcc_add_include_path(s, a + 2);
-            else if (a[1] == 'L')
-                tcc_add_library_path(s, a + 2);
-        }
-    }
-
-    tcc_set_output_type(s, TCC_OUTPUT_MEMORY);
-    if (tcc_compile_string(s, test_code) == -1)
-        return 2;
-    if (tcc_relocate(s) < 0)
-        return 2;
-
-    code = (unsigned char *)tcc_get_symbol(s, "test");
-    if (!code) {
-        fprintf(stderr, "symbol 'test' not found\n");
-        return 2;
-    }
-
-    /*
-     * Scan for the 'call *%r11' instruction: 41 ff d3
-     * Then inspect the bytes immediately before it.
-     *
-     * Correct: 45 31 db  41 ff d3  (xor %r11d,%r11d ; call *%r11)
-     * Buggy:      31 db  41 ff d3  (xor %ebx,%ebx   ; call *%r11)
-     */
-    for (i = 3; i < 128; i++) {
-        if (code[i] == 0x41 && code[i+1] == 0xff && code[i+2] == 0xd3) {
-            if (i >= 3 && code[i-3] == 0x45
-                       && code[i-2] == 0x31
-                       && code[i-1] == 0xdb) {
-                printf("xor_rex: OK\n");
-            } else if (i >= 2 && code[i-2] == 0x31 && code[i-1] == 0xdb) {
-                printf("xor_rex: FAIL - xor %%ebx,%%ebx (31 db) emitted"
-                       " instead of xor %%r11d,%%r11d (45 31 db)\n");
-                ret = 1;
-            } else {
-                printf("xor_rex: FAIL - unexpected bytes before"
-                       " call *%%r11: %02x %02x %02x %02x\n",
-                       code[i-4], code[i-3], code[i-2], code[i-1]);
-                ret = 1;
-            }
-            goto done;
-        }
-    }
-    printf("xor_rex: SKIP - call *%%r11 not found in generated code\n");
-
-done:
-    tcc_delete(s);
-    return ret;
-}
-#endif
diff --git a/tests/nostdlib_test.c b/tests/nostdlib_test.c
deleted file mode 100755
index 7c7f982f..00000000
--- a/tests/nostdlib_test.c
+++ /dev/null
@@ -1,155 +0,0 @@
-#!/usr/local/bin/tcc -run -nostdlib
-
-// Not working on windows and apple because of different API.
-
-#include <unistd.h>
-#include <sys/syscall.h>
-#if defined __x86_64__
-__asm__ ("syscall:\n\t"
-	 "mov %rdi,%rax\n\t"
-	 "mov %rsi,%rdi\n\t"
-	 "mov %rdx,%rsi\n\t"
-	 "mov %rcx,%rdx\n\t"
-	 "mov %r8,%r10\n\t"
-	 "mov %r9,%r8\n\t"
-	 "mov 0x8(%rsp),%r9\n\t"
-	 "syscall\n\t"
-	 "ret");
-__asm__ (".global _start\n\t"
-	 "_start:\n\t"
-	 "mov 0(%rsp), %rdi\n\t"
-	 "lea 8(%rsp), %rsi\n\t"
-	 "jmp print");
-#elif defined __i386__
-__asm__ ("syscall:\n\t"
-	 "push %ebp\n\t"
-	 "push %edi\n\t"
-	 "push %esi\n\t"
-	 "push %ebx\n\t"
-	 "mov 0x2c(%esp),%ebp\n\t"
-	 "mov 0x28(%esp),%edi\n\t"
-	 "mov 0x24(%esp),%esi\n\t"
-	 "mov 0x20(%esp),%edx\n\t"
-	 "mov 0x1c(%esp),%ecx\n\t"
-	 "mov 0x18(%esp),%ebx\n\t"
-	 "mov 0x14(%esp),%eax\n\t"
-	 // "call *%gs:0x10\n\t"
-	 ".byte 0x65,0xff,0x15,0x10,0x00,0x00,0x00\n\t"
-	 "pop %ebx\n\t"
-	 "pop %esi\n\t"
-	 "pop %edi\n\t"
-	 "pop %ebp\n\t"
-	 "ret");
-__asm__ (".global _start\n\t"
-	 "_start:\n\t"
-	 "pop %esi\n\t"
-	 "mov %esp, %ecx\n\t"
-	 "and $0xfffffff0,%esp\n\t"
-	 "push %ecx\n\t"
-	 "push %esi\n\t"
-	 "call print");
-#elif defined __arm__
-__asm__ ("syscall:\n\t"
-	 "mov r12, sp\n\t"
-	 "push {r4, r5, r6, r7}\n\t"
-	 "mov r7, r0\n\t"
-	 "mov r0, r1\n\t"
-	 "mov r1, r2\n\t"
-	 "mov r2, r3\n\t"
-	 "ldm r12, {r3, r4, r5, r6}\n\t"
-	 "svc 0x00000000\n\t"
-	 "pop {r4, r5, r6, r7}\n\t"
-	 "mov pc, lr");
-__asm__ (".global _start\n\t"
-	 "_start:\n\t"
-	 "pop {r0}\n\t"
-	 "mov r1, sp\n\t"
-	 "bl print");
-#elif defined __aarch64__
-__asm__ ("syscall:\n\t"
-	 ".int 0x2a0003e8\n\t" // mov w8, w0
-	 ".int 0xaa0103e0\n\t" // x0, x1
-	 ".int 0xaa0203e1\n\t" // mov x1, x2
-	 ".int 0xaa0303e2\n\t" // mov x2, x3
-	 ".int 0xaa0403e3\n\t" // mov x3, x4
-	 ".int 0xaa0503e4\n\t" // mov x4, x5
-	 ".int 0xaa0603e5\n\t" // mov x5, x6
-	 ".int 0xaa0703e6\n\t" // mov x6, x7
-	 ".int 0xd4000001\n\t" // svc  #0x0
-	 ".int 0xd65f03c0"); // ret
-__asm__ (".global _start\n\t"
-	 "_start:\n\t"
-	 ".int 0xf94003e0\n\t" // ldr x0, [sp]
-	 ".int 0x910023e1\n\t" // add x1, sp, #08
-	 ".reloc .,R_AARCH64_CALL26,print\n\t"
-	 ".int 0x94000000"); // bl print
-#elif defined __riscv
-__asm__ ("syscall:\n\t"
-	 "mv t1,a0\n\t"
-	 "mv a0,a1\n\t"
-	 "mv a1,a2\n\t"
-	 "mv a2,a3\n\t"
-	 "mv a3,a4\n\t"
-	 "mv a4,a5\n\t"
-	 "mv a5,a6\n\t"
-	 "mv a6,a7\n\t"
-	 "mv a7,t1\n\t"
-	 "ecall\n\t"
-	 "ret");
-__asm__ (".global _start\n\t"
-	 "_start:\n\t"
-	 "ld a0,0(sp)\n\t"
-	 "addi a1,sp,8\n\t"
-	 "jal print");
-#endif
-unsigned long strlen(const char *s)
-{
-    unsigned long len = 0;
-
-    while (*s++)
-	len++;
-    return len;
-}
-
-static void pr_num(int num)
-{
-    char val[20], *p = &val[20];
-    
-    *--p = '\0';
-    do {
-	int a = num, b = 0;
-
-	while (a >= 10) {
-	    a -= 10;
-	    b++;
-	}
-	*--p = a + '0';
-	num = b;
-    } while (num);
-    syscall(SYS_write, 1, p, strlen(p));
-}
-
-static void pr_str(int n, char *s)
-{
-    pr_num(n);
-    syscall(SYS_write, 1, ": ", 2);
-    syscall(SYS_write, 1, s, strlen(s));
-    syscall(SYS_write, 1, "\n", 1);
-}
-
-void print(int argc, char **argv) {
-    int i;
-    char **envp = &argv[argc + 1];
-
-    syscall(SYS_write, 1, "argc: ", 6);
-    pr_num(argc);
-    syscall(SYS_write, 1, "\n", 1);
-    syscall(SYS_write, 1, "argv[]\n", 7);
-    for (i = 0; i < argc; i++)
-	pr_str(i, argv[i]);
-    syscall(SYS_write, 1, "envp[]\n", 7);
-    i = 0;
-    while (*envp)
-	pr_str(i++, *envp++);
-    syscall(SYS_exit, 0);
-}
diff --git a/x86_64-gen.c b/x86_64-gen.c
index af936b63..e1f4ae70 100644
--- a/x86_64-gen.c
+++ b/x86_64-gen.c
@@ -1840,6 +1840,7 @@ void gen_opf(int op)
             o(0x80); /* xor $0x80, $n(rbp) */
             gen_modrm(6, vtop->r, NULL, vtop->c.i + (bt == VT_DOUBLE ? 7 : 3));
             o(0x80);
+            gv(float_type); /* -n is not an lvalue */
         }
         return;
     }