From fd37da0d586c331b0008fbfd653a9659344fe76f Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Wed, 26 Jul 2017 09:52:19 +0200 Subject: [PATCH] PPC: Add soft-float support to interpreter. Contributed by Djordje Kovacevic and Stefan Pejic from RT-RK.com. Sponsored by Cisco Systems, Inc. --- src/host/buildvm_asm.c | 2 +- src/lj_arch.h | 29 +- src/lj_ccall.c | 38 +- src/lj_ccall.h | 4 +- src/lj_ccallback.c | 30 +- src/lj_frame.h | 2 +- src/lj_ircall.h | 2 +- src/vm_ppc.dasc | 1249 +++++++++++++++++++++++++++++++++------- 8 files changed, 1101 insertions(+), 255 deletions(-) --- a/src/host/buildvm_asm.c +++ b/src/host/buildvm_asm.c @@ -338,7 +338,7 @@ void emit_asm(BuildCtx *ctx) #if !(LJ_TARGET_PS3 || LJ_TARGET_PSVITA) fprintf(ctx->fp, "\t.section .note.GNU-stack,\"\"," ELFASM_PX "progbits\n"); #endif -#if LJ_TARGET_PPC && !LJ_TARGET_PS3 +#if LJ_TARGET_PPC && !LJ_TARGET_PS3 && !LJ_ABI_SOFTFP /* Hard-float ABI. */ fprintf(ctx->fp, "\t.gnu_attribute 4, 1\n"); #endif --- a/src/lj_arch.h +++ b/src/lj_arch.h @@ -254,6 +254,29 @@ #else #define LJ_ARCH_BITS 32 #define LJ_ARCH_NAME "ppc" + +#if !defined(LJ_ARCH_HASFPU) +#if defined(_SOFT_FLOAT) || defined(_SOFT_DOUBLE) +#define LJ_ARCH_HASFPU 0 +#else +#define LJ_ARCH_HASFPU 1 +#endif +#endif + +#if !defined(LJ_ABI_SOFTFP) +#if defined(_SOFT_FLOAT) || defined(_SOFT_DOUBLE) +#define LJ_ABI_SOFTFP 1 +#else +#define LJ_ABI_SOFTFP 0 +#endif +#endif +#endif + +#if LJ_ABI_SOFTFP +#define LJ_ARCH_NOJIT 1 /* NYI */ +#define LJ_ARCH_NUMMODE LJ_NUMMODE_DUAL +#else +#define LJ_ARCH_NUMMODE LJ_NUMMODE_DUAL_SINGLE #endif #define LJ_TARGET_PPC 1 @@ -262,7 +285,6 @@ #define LJ_TARGET_MASKSHIFT 0 #define LJ_TARGET_MASKROT 1 #define LJ_TARGET_UNIFYROT 1 /* Want only IR_BROL. 
*/ -#define LJ_ARCH_NUMMODE LJ_NUMMODE_DUAL_SINGLE #if LJ_TARGET_CONSOLE #define LJ_ARCH_PPC32ON64 1 @@ -415,16 +437,13 @@ #error "No support for ILP32 model on ARM64" #endif #elif LJ_TARGET_PPC -#if defined(_SOFT_FLOAT) || defined(_SOFT_DOUBLE) -#error "No support for PowerPC CPUs without double-precision FPU" -#endif #if !LJ_ARCH_PPC64 && LJ_ARCH_ENDIAN == LUAJIT_LE #error "No support for little-endian PPC32" #endif #if LJ_ARCH_PPC64 #error "No support for PowerPC 64 bit mode (yet)" #endif -#ifdef __NO_FPRS__ +#if defined(__NO_FPRS__) && !defined(_SOFT_FLOAT) #error "No support for PPC/e500 anymore (use LuaJIT 2.0)" #endif #elif LJ_TARGET_MIPS32 --- a/src/lj_ccall.c +++ b/src/lj_ccall.c @@ -387,6 +387,24 @@ #define CCALL_HANDLE_COMPLEXARG \ /* Pass complex by value in 2 or 4 GPRs. */ +#define CCALL_HANDLE_GPR \ + /* Try to pass argument in GPRs. */ \ + if (n > 1) { \ + lua_assert(n == 2 || n == 4); /* int64_t or complex (float). */ \ + if (ctype_isinteger(d->info) || ctype_isfp(d->info)) \ + ngpr = (ngpr + 1u) & ~1u; /* Align int64_t to regpair. */ \ + else if (ngpr + n > maxgpr) \ + ngpr = maxgpr; /* Prevent reordering. */ \ + } \ + if (ngpr + n <= maxgpr) { \ + dp = &cc->gpr[ngpr]; \ + ngpr += n; \ + goto done; \ + } \ + +#if LJ_ABI_SOFTFP +#define CCALL_HANDLE_REGARG CCALL_HANDLE_GPR +#else #define CCALL_HANDLE_REGARG \ if (isfp) { /* Try to pass argument in FPRs. */ \ if (nfpr + 1 <= CCALL_NARG_FPR) { \ @@ -395,24 +413,16 @@ d = ctype_get(cts, CTID_DOUBLE); /* FPRs always hold doubles. */ \ goto done; \ } \ - } else { /* Try to pass argument in GPRs. */ \ - if (n > 1) { \ - lua_assert(n == 2 || n == 4); /* int64_t or complex (float). */ \ - if (ctype_isinteger(d->info)) \ - ngpr = (ngpr + 1u) & ~1u; /* Align int64_t to regpair. */ \ - else if (ngpr + n > maxgpr) \ - ngpr = maxgpr; /* Prevent reordering. 
*/ \ - } \ - if (ngpr + n <= maxgpr) { \ - dp = &cc->gpr[ngpr]; \ - ngpr += n; \ - goto done; \ - } \ + } else { \ + CCALL_HANDLE_GPR \ } +#endif +#if !LJ_ABI_SOFTFP #define CCALL_HANDLE_RET \ if (ctype_isfp(ctr->info) && ctr->size == sizeof(float)) \ ctr = ctype_get(cts, CTID_DOUBLE); /* FPRs always hold doubles. */ +#endif #elif LJ_TARGET_MIPS32 /* -- MIPS o32 calling conventions ---------------------------------------- */ @@ -1080,7 +1090,7 @@ static int ccall_set_args(lua_State *L, } if (fid) lj_err_caller(L, LJ_ERR_FFI_NUMARG); /* Too few arguments. */ -#if LJ_TARGET_X64 || LJ_TARGET_PPC +#if LJ_TARGET_X64 || (LJ_TARGET_PPC && !LJ_ABI_SOFTFP) cc->nfpr = nfpr; /* Required for vararg functions. */ #endif cc->nsp = nsp; --- a/src/lj_ccall.h +++ b/src/lj_ccall.h @@ -86,9 +86,9 @@ typedef union FPRArg { #elif LJ_TARGET_PPC #define CCALL_NARG_GPR 8 -#define CCALL_NARG_FPR 8 +#define CCALL_NARG_FPR (LJ_ABI_SOFTFP ? 0 : 8) #define CCALL_NRET_GPR 4 /* For complex double. */ -#define CCALL_NRET_FPR 1 +#define CCALL_NRET_FPR (LJ_ABI_SOFTFP ? 0 : 1) #define CCALL_SPS_EXTRA 4 #define CCALL_SPS_FREE 0 --- a/src/lj_ccallback.c +++ b/src/lj_ccallback.c @@ -419,6 +419,23 @@ void lj_ccallback_mcode_free(CTState *ct #elif LJ_TARGET_PPC +#define CALLBACK_HANDLE_GPR \ + if (n > 1) { \ + lua_assert(((LJ_ABI_SOFTFP && ctype_isnum(cta->info)) || /* double. */ \ + ctype_isinteger(cta->info)) && n == 2); /* int64_t. */ \ + ngpr = (ngpr + 1u) & ~1u; /* Align int64_t to regpair. */ \ + } \ + if (ngpr + n <= maxgpr) { \ + sp = &cts->cb.gpr[ngpr]; \ + ngpr += n; \ + goto done; \ + } + +#if LJ_ABI_SOFTFP +#define CALLBACK_HANDLE_REGARG \ + CALLBACK_HANDLE_GPR \ + UNUSED(isfp); +#else #define CALLBACK_HANDLE_REGARG \ if (isfp) { \ if (nfpr + 1 <= CCALL_NARG_FPR) { \ @@ -427,20 +444,15 @@ void lj_ccallback_mcode_free(CTState *ct goto done; \ } \ } else { /* Try to pass argument in GPRs. */ \ - if (n > 1) { \ - lua_assert(ctype_isinteger(cta->info) && n == 2); /* int64_t. 
*/ \ - ngpr = (ngpr + 1u) & ~1u; /* Align int64_t to regpair. */ \ - } \ - if (ngpr + n <= maxgpr) { \ - sp = &cts->cb.gpr[ngpr]; \ - ngpr += n; \ - goto done; \ - } \ + CALLBACK_HANDLE_GPR \ } +#endif +#if !LJ_ABI_SOFTFP #define CALLBACK_HANDLE_RET \ if (ctype_isfp(ctr->info) && ctr->size == sizeof(float)) \ *(double *)dp = *(float *)dp; /* FPRs always hold doubles. */ +#endif #elif LJ_TARGET_MIPS32 --- a/src/lj_frame.h +++ b/src/lj_frame.h @@ -226,7 +226,7 @@ enum { LJ_CONT_TAILCALL, LJ_CONT_FFI_CAL #define CFRAME_OFS_L 36 #define CFRAME_OFS_PC 32 #define CFRAME_OFS_MULTRES 28 -#define CFRAME_SIZE 272 +#define CFRAME_SIZE (LJ_ARCH_HASFPU ? 272 : 128) #define CFRAME_SHIFT_MULTRES 3 #endif #elif LJ_TARGET_MIPS32 --- a/src/lj_ircall.h +++ b/src/lj_ircall.h @@ -272,7 +272,7 @@ LJ_DATA const CCallInfo lj_ir_callinfo[I #define fp64_f2l __aeabi_f2lz #define fp64_f2ul __aeabi_f2ulz #endif -#elif LJ_TARGET_MIPS +#elif LJ_TARGET_MIPS || LJ_TARGET_PPC #define softfp_add __adddf3 #define softfp_sub __subdf3 #define softfp_mul __muldf3 --- a/src/vm_ppc.dasc +++ b/src/vm_ppc.dasc @@ -103,6 +103,18 @@ |// Fixed register assignments for the interpreter. |// Don't use: r1 = sp, r2 and r13 = reserved (TOC, TLS or SDATA) | +|.macro .FPU, a, b +|.if FPU +| a, b +|.endif +|.endmacro +| +|.macro .FPU, a, b, c +|.if FPU +| a, b, c +|.endif +|.endmacro +| |// The following must be C callee-save (but BASE is often refetched). |.define BASE, r14 // Base of current Lua stack frame. |.define KBASE, r15 // Constants of current Lua function. @@ -116,8 +128,10 @@ |.define TISNUM, r22 |.define TISNIL, r23 |.define ZERO, r24 +|.if FPU |.define TOBIT, f30 // 2^52 + 2^51. |.define TONUM, f31 // 2^52 + 2^51 + 2^31. +|.endif | |// The following temporaries are not saved across C calls, except for RA. |.define RA, r20 // Callee-save. @@ -133,6 +147,7 @@ | |// Saved temporaries. |.define SAVE0, r21 +|.define SAVE1, r25 | |// Calling conventions. 
|.define CARG1, r3 @@ -141,8 +156,10 @@ |.define CARG4, r6 // Overlaps TMP3. |.define CARG5, r7 // Overlaps INS. | +|.if FPU |.define FARG1, f1 |.define FARG2, f2 +|.endif | |.define CRET1, r3 |.define CRET2, r4 @@ -213,10 +230,16 @@ |.endif |.else | +|.if FPU |.define SAVE_LR, 276(sp) |.define CFRAME_SPACE, 272 // Delta for sp. |// Back chain for sp: 272(sp) <-- sp entering interpreter |.define SAVE_FPR_, 128 // .. 128+18*8: 64 bit FPR saves. +|.else +|.define SAVE_LR, 132(sp) +|.define CFRAME_SPACE, 128 // Delta for sp. +|// Back chain for sp: 128(sp) <-- sp entering interpreter +|.endif |.define SAVE_GPR_, 56 // .. 56+18*4: 32 bit GPR saves. |.define SAVE_CR, 52(sp) // 32 bit CR save. |.define SAVE_ERRF, 48(sp) // 32 bit C frame info. @@ -226,16 +249,25 @@ |.define SAVE_PC, 32(sp) |.define SAVE_MULTRES, 28(sp) |.define UNUSED1, 24(sp) +|.if FPU |.define TMPD_LO, 20(sp) |.define TMPD_HI, 16(sp) |.define TONUM_LO, 12(sp) |.define TONUM_HI, 8(sp) +|.else +|.define SFSAVE_4, 20(sp) +|.define SFSAVE_3, 16(sp) +|.define SFSAVE_2, 12(sp) +|.define SFSAVE_1, 8(sp) +|.endif |// Next frame lr: 4(sp) |// Back chain for sp: 0(sp) <-- sp while in interpreter | +|.if FPU |.define TMPD_BLO, 23(sp) |.define TMPD, TMPD_HI |.define TONUM_D, TONUM_HI +|.endif | |.endif | @@ -245,7 +277,7 @@ |.else | stw r..reg, SAVE_GPR_+(reg-14)*4(sp) |.endif -| stfd f..reg, SAVE_FPR_+(reg-14)*8(sp) +| .FPU stfd f..reg, SAVE_FPR_+(reg-14)*8(sp) |.endmacro |.macro rest_, reg |.if GPR64 @@ -253,7 +285,7 @@ |.else | lwz r..reg, SAVE_GPR_+(reg-14)*4(sp) |.endif -| lfd f..reg, SAVE_FPR_+(reg-14)*8(sp) +| .FPU lfd f..reg, SAVE_FPR_+(reg-14)*8(sp) |.endmacro | |.macro saveregs @@ -323,6 +355,7 @@ |// Trap for not-yet-implemented parts. |.macro NYI; tw 4, sp, sp; .endmacro | +|.if FPU |// int/FP conversions. 
|.macro tonum_i, freg, reg | xoris reg, reg, 0x8000 @@ -346,6 +379,7 @@ |.macro toint, reg, freg | toint reg, freg, freg |.endmacro +|.endif | |//----------------------------------------------------------------------- | @@ -533,9 +567,19 @@ static void build_subroutines(BuildCtx * | beq >2 |1: | addic. TMP1, TMP1, -8 + |.if FPU | lfd f0, 0(RA) + |.else + | lwz CARG1, 0(RA) + | lwz CARG2, 4(RA) + |.endif | addi RA, RA, 8 + |.if FPU | stfd f0, 0(BASE) + |.else + | stw CARG1, 0(BASE) + | stw CARG2, 4(BASE) + |.endif | addi BASE, BASE, 8 | bney <1 | @@ -613,23 +657,23 @@ static void build_subroutines(BuildCtx * | .toc ld TOCREG, SAVE_TOC | li TISNUM, LJ_TISNUM // Setup type comparison constants. | lp BASE, L->base - | lus TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float). + | .FPU lus TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float). | lwz DISPATCH, L->glref // Setup pointer to dispatch table. | li ZERO, 0 - | stw TMP3, TMPD + | .FPU stw TMP3, TMPD | li TMP1, LJ_TFALSE - | ori TMP3, TMP3, 0x0004 // TONUM = 2^52 + 2^51 + 2^31 (float). + | .FPU ori TMP3, TMP3, 0x0004 // TONUM = 2^52 + 2^51 + 2^31 (float). | li TISNIL, LJ_TNIL | li_vmstate INTERP - | lfs TOBIT, TMPD + | .FPU lfs TOBIT, TMPD | lwz PC, FRAME_PC(BASE) // Fetch PC of previous frame. | la RA, -8(BASE) // Results start at BASE-8. - | stw TMP3, TMPD + | .FPU stw TMP3, TMPD | addi DISPATCH, DISPATCH, GG_G2DISP | stw TMP1, 0(RA) // Prepend false to error message. | li RD, 16 // 2 results: false + error message. | st_vmstate - | lfs TONUM, TMPD + | .FPU lfs TONUM, TMPD | b ->vm_returnc | |//----------------------------------------------------------------------- @@ -690,22 +734,22 @@ static void build_subroutines(BuildCtx * | li TISNUM, LJ_TISNUM // Setup type comparison constants. | lp TMP1, L->top | lwz PC, FRAME_PC(BASE) - | lus TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float). + | .FPU lus TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float). 
| stb CARG3, L->status - | stw TMP3, TMPD - | ori TMP3, TMP3, 0x0004 // TONUM = 2^52 + 2^51 + 2^31 (float). - | lfs TOBIT, TMPD + | .FPU stw TMP3, TMPD + | .FPU ori TMP3, TMP3, 0x0004 // TONUM = 2^52 + 2^51 + 2^31 (float). + | .FPU lfs TOBIT, TMPD | sub RD, TMP1, BASE - | stw TMP3, TMPD - | lus TMP0, 0x4338 // Hiword of 2^52 + 2^51 (double) + | .FPU stw TMP3, TMPD + | .FPU lus TMP0, 0x4338 // Hiword of 2^52 + 2^51 (double) | addi RD, RD, 8 - | stw TMP0, TONUM_HI + | .FPU stw TMP0, TONUM_HI | li_vmstate INTERP | li ZERO, 0 | st_vmstate | andix. TMP0, PC, FRAME_TYPE | mr MULTRES, RD - | lfs TONUM, TMPD + | .FPU lfs TONUM, TMPD | li TISNIL, LJ_TNIL | beq ->BC_RET_Z | b ->vm_return @@ -739,19 +783,19 @@ static void build_subroutines(BuildCtx * | lp TMP2, L->base // TMP2 = old base (used in vmeta_call). | li TISNUM, LJ_TISNUM // Setup type comparison constants. | lp TMP1, L->top - | lus TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float). + | .FPU lus TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float). | add PC, PC, BASE - | stw TMP3, TMPD + | .FPU stw TMP3, TMPD | li ZERO, 0 - | ori TMP3, TMP3, 0x0004 // TONUM = 2^52 + 2^51 + 2^31 (float). - | lfs TOBIT, TMPD + | .FPU ori TMP3, TMP3, 0x0004 // TONUM = 2^52 + 2^51 + 2^31 (float). 
+ | .FPU lfs TOBIT, TMPD | sub PC, PC, TMP2 // PC = frame delta + frame type - | stw TMP3, TMPD - | lus TMP0, 0x4338 // Hiword of 2^52 + 2^51 (double) + | .FPU stw TMP3, TMPD + | .FPU lus TMP0, 0x4338 // Hiword of 2^52 + 2^51 (double) | sub NARGS8:RC, TMP1, BASE - | stw TMP0, TONUM_HI + | .FPU stw TMP0, TONUM_HI | li_vmstate INTERP - | lfs TONUM, TMPD + | .FPU lfs TONUM, TMPD | li TISNIL, LJ_TNIL | st_vmstate | @@ -839,15 +883,30 @@ static void build_subroutines(BuildCtx * | lwz INS, -4(PC) | subi CARG2, RB, 16 | decode_RB8 SAVE0, INS + |.if FPU | lfd f0, 0(RA) + |.else + | lwz TMP2, 0(RA) + | lwz TMP3, 4(RA) + |.endif | add TMP1, BASE, SAVE0 | stp BASE, L->base | cmplw TMP1, CARG2 | sub CARG3, CARG2, TMP1 | decode_RA8 RA, INS + |.if FPU | stfd f0, 0(CARG2) + |.else + | stw TMP2, 0(CARG2) + | stw TMP3, 4(CARG2) + |.endif | bney ->BC_CAT_Z + |.if FPU | stfdx f0, BASE, RA + |.else + | stwux TMP2, RA, BASE + | stw TMP3, 4(RA) + |.endif | b ->cont_nop | |//-- Table indexing metamethods ----------------------------------------- @@ -900,9 +959,19 @@ static void build_subroutines(BuildCtx * | // Returns TValue * (finished) or NULL (metamethod). | cmplwi CRET1, 0 | beq >3 + |.if FPU | lfd f0, 0(CRET1) + |.else + | lwz TMP0, 0(CRET1) + | lwz TMP1, 4(CRET1) + |.endif | ins_next1 + |.if FPU | stfdx f0, BASE, RA + |.else + | stwux TMP0, RA, BASE + | stw TMP1, 4(RA) + |.endif | ins_next2 | |3: // Call __index metamethod. @@ -920,7 +989,12 @@ static void build_subroutines(BuildCtx * | // Returns cTValue * or NULL. | cmplwi CRET1, 0 | beq >1 + |.if FPU | lfd f14, 0(CRET1) + |.else + | lwz SAVE0, 0(CRET1) + | lwz SAVE1, 4(CRET1) + |.endif | b ->BC_TGETR_Z |1: | stwx TISNIL, BASE, RA @@ -975,11 +1049,21 @@ static void build_subroutines(BuildCtx * | bl extern lj_meta_tset // (lua_State *L, TValue *o, TValue *k) | // Returns TValue * (finished) or NULL (metamethod). 
| cmplwi CRET1, 0 + |.if FPU | lfdx f0, BASE, RA + |.else + | lwzux TMP2, RA, BASE + | lwz TMP3, 4(RA) + |.endif | beq >3 | // NOBARRIER: lj_meta_tset ensures the table is not black. | ins_next1 + |.if FPU | stfd f0, 0(CRET1) + |.else + | stw TMP2, 0(CRET1) + | stw TMP3, 4(CRET1) + |.endif | ins_next2 | |3: // Call __newindex metamethod. @@ -990,7 +1074,12 @@ static void build_subroutines(BuildCtx * | add PC, TMP1, BASE | lwz LFUNC:RB, FRAME_FUNC(BASE) // Guaranteed to be a function here. | li NARGS8:RC, 24 // 3 args for func(t, k, v) + |.if FPU | stfd f0, 16(BASE) // Copy value to third argument. + |.else + | stw TMP2, 16(BASE) + | stw TMP3, 20(BASE) + |.endif | b ->vm_call_dispatch_f | |->vmeta_tsetr: @@ -998,7 +1087,12 @@ static void build_subroutines(BuildCtx * | stw PC, SAVE_PC | bl extern lj_tab_setinth // (lua_State *L, GCtab *t, int32_t key) | // Returns TValue *. + |.if FPU | stfd f14, 0(CRET1) + |.else + | stw SAVE0, 0(CRET1) + | stw SAVE1, 4(CRET1) + |.endif | b ->cont_nop | |//-- Comparison metamethods --------------------------------------------- @@ -1037,9 +1131,19 @@ static void build_subroutines(BuildCtx * | |->cont_ra: // RA = resultptr | lwz INS, -4(PC) + |.if FPU | lfd f0, 0(RA) + |.else + | lwz CARG1, 0(RA) + | lwz CARG2, 4(RA) + |.endif | decode_RA8 TMP1, INS + |.if FPU | stfdx f0, BASE, TMP1 + |.else + | stwux CARG1, TMP1, BASE + | stw CARG2, 4(TMP1) + |.endif | b ->cont_nop | |->cont_condt: // RA = resultptr @@ -1245,22 +1349,32 @@ static void build_subroutines(BuildCtx * |.macro .ffunc_n, name |->ff_ .. name: | cmplwi NARGS8:RC, 8 - | lwz CARG3, 0(BASE) + | lwz CARG1, 0(BASE) + |.if FPU | lfd FARG1, 0(BASE) + |.else + | lwz CARG2, 4(BASE) + |.endif | blt ->fff_fallback - | checknum CARG3; bge ->fff_fallback + | checknum CARG1; bge ->fff_fallback |.endmacro | |.macro .ffunc_nn, name |->ff_ .. 
name: | cmplwi NARGS8:RC, 16 - | lwz CARG3, 0(BASE) + | lwz CARG1, 0(BASE) + |.if FPU | lfd FARG1, 0(BASE) - | lwz CARG4, 8(BASE) + | lwz CARG3, 8(BASE) | lfd FARG2, 8(BASE) + |.else + | lwz CARG2, 4(BASE) + | lwz CARG3, 8(BASE) + | lwz CARG4, 12(BASE) + |.endif | blt ->fff_fallback + | checknum CARG1; bge ->fff_fallback | checknum CARG3; bge ->fff_fallback - | checknum CARG4; bge ->fff_fallback |.endmacro | |// Inlined GC threshold check. Caveat: uses TMP0 and TMP1. @@ -1281,14 +1395,21 @@ static void build_subroutines(BuildCtx * | bge cr1, ->fff_fallback | stw CARG3, 0(RA) | addi RD, NARGS8:RC, 8 // Compute (nresults+1)*8. + | addi TMP1, BASE, 8 + | add TMP2, RA, NARGS8:RC | stw CARG1, 4(RA) | beq ->fff_res // Done if exactly 1 argument. - | li TMP1, 8 - | subi RC, RC, 8 |1: - | cmplw TMP1, RC - | lfdx f0, BASE, TMP1 - | stfdx f0, RA, TMP1 + | cmplw TMP1, TMP2 + |.if FPU + | lfd f0, 0(TMP1) + | stfd f0, -8(TMP1) // Store one slot lower, same as the soft-float path. + |.else + | lwz CARG1, 0(TMP1) + | lwz CARG2, 4(TMP1) + | stw CARG1, -8(TMP1) + | stw CARG2, -4(TMP1) + |.endif | addi TMP1, TMP1, 8 | bney <1 | b ->fff_res @@ -1303,8 +1424,14 @@ static void build_subroutines(BuildCtx * | orc TMP1, TMP2, TMP0 | addi TMP1, TMP1, ~LJ_TISNUM+1 | slwi TMP1, TMP1, 3 + |.if FPU | la TMP2, CFUNC:RB->upvalue | lfdx FARG1, TMP2, TMP1 + |.else + | add TMP1, CFUNC:RB, TMP1 + | lwz CARG1, CFUNC:TMP1->upvalue[0].u32.hi + | lwz CARG2, CFUNC:TMP1->upvalue[0].u32.lo + |.endif | b ->fff_resn | |//-- Base library: getters and setters --------------------------------- @@ -1382,7 +1509,12 @@ static void build_subroutines(BuildCtx * | mr CARG1, L | bl extern lj_tab_get // (lua_State *L, GCtab *t, cTValue *key) | // Returns cTValue *. + |.if FPU | lfd FARG1, 0(CRET1) + |.else + | lwz CARG2, 4(CRET1) + | lwz CARG1, 0(CRET1) // Caveat: CARG1 == CRET1. 
+ |.endif | b ->fff_resn | |//-- Base library: conversions ------------------------------------------ @@ -1391,7 +1523,11 @@ static void build_subroutines(BuildCtx * | // Only handles the number case inline (without a base argument). | cmplwi NARGS8:RC, 8 | lwz CARG1, 0(BASE) + |.if FPU | lfd FARG1, 0(BASE) + |.else + | lwz CARG2, 4(BASE) + |.endif | bne ->fff_fallback // Exactly one argument. | checknum CARG1; bgt ->fff_fallback | b ->fff_resn @@ -1442,12 +1578,23 @@ static void build_subroutines(BuildCtx * | cmplwi CRET1, 0 | li CARG3, LJ_TNIL | beq ->fff_restv // End of traversal: return nil. - | lfd f0, 8(BASE) // Copy key and value to results. | la RA, -8(BASE) + |.if FPU + | lfd f0, 8(BASE) // Copy key and value to results. | lfd f1, 16(BASE) | stfd f0, 0(RA) - | li RD, (2+1)*8 | stfd f1, 8(RA) + |.else + | lwz CARG1, 8(BASE) + | lwz CARG2, 12(BASE) + | lwz CARG3, 16(BASE) + | lwz CARG4, 20(BASE) + | stw CARG1, 0(RA) + | stw CARG2, 4(RA) + | stw CARG3, 8(RA) + | stw CARG4, 12(RA) + |.endif + | li RD, (2+1)*8 | b ->fff_res | |.ffunc_1 pairs @@ -1456,17 +1603,32 @@ static void build_subroutines(BuildCtx * | bne ->fff_fallback #if LJ_52 | lwz TAB:TMP2, TAB:CARG1->metatable + |.if FPU | lfd f0, CFUNC:RB->upvalue[0] + |.else + | lwz TMP0, CFUNC:RB->upvalue[0].u32.hi + | lwz TMP1, CFUNC:RB->upvalue[0].u32.lo + |.endif | cmplwi TAB:TMP2, 0 | la RA, -8(BASE) | bne ->fff_fallback #else + |.if FPU | lfd f0, CFUNC:RB->upvalue[0] + |.else + | lwz TMP0, CFUNC:RB->upvalue[0].u32.hi + | lwz TMP1, CFUNC:RB->upvalue[0].u32.lo + |.endif | la RA, -8(BASE) #endif | stw TISNIL, 8(BASE) | li RD, (3+1)*8 + |.if FPU | stfd f0, 0(RA) + |.else + | stw TMP0, 0(RA) + | stw TMP1, 4(RA) + |.endif | b ->fff_res | |.ffunc ipairs_aux @@ -1512,14 +1674,24 @@ static void build_subroutines(BuildCtx * | stfd FARG2, 0(RA) |.endif | ble >2 // Not in array part? 
+ |.if FPU | lwzx TMP2, TMP1, TMP3 | lfdx f0, TMP1, TMP3 + |.else + | lwzux TMP2, TMP1, TMP3 + | lwz TMP3, 4(TMP1) + |.endif |1: | checknil TMP2 | li RD, (0+1)*8 | beq ->fff_res // End of iteration, return 0 results. | li RD, (2+1)*8 + |.if FPU | stfd f0, 8(RA) + |.else + | stw TMP2, 8(RA) + | stw TMP3, 12(RA) + |.endif | b ->fff_res |2: // Check for empty hash part first. Otherwise call C function. | lwz TMP0, TAB:CARG1->hmask @@ -1533,7 +1705,11 @@ static void build_subroutines(BuildCtx * | li RD, (0+1)*8 | beq ->fff_res | lwz TMP2, 0(CRET1) + |.if FPU | lfd f0, 0(CRET1) + |.else + | lwz TMP3, 4(CRET1) + |.endif | b <1 | |.ffunc_1 ipairs @@ -1542,12 +1718,22 @@ static void build_subroutines(BuildCtx * | bne ->fff_fallback #if LJ_52 | lwz TAB:TMP2, TAB:CARG1->metatable + |.if FPU | lfd f0, CFUNC:RB->upvalue[0] + |.else + | lwz TMP0, CFUNC:RB->upvalue[0].u32.hi + | lwz TMP1, CFUNC:RB->upvalue[0].u32.lo + |.endif | cmplwi TAB:TMP2, 0 | la RA, -8(BASE) | bne ->fff_fallback #else + |.if FPU | lfd f0, CFUNC:RB->upvalue[0] + |.else + | lwz TMP0, CFUNC:RB->upvalue[0].u32.hi + | lwz TMP1, CFUNC:RB->upvalue[0].u32.lo + |.endif | la RA, -8(BASE) #endif |.if DUALNUM @@ -1557,7 +1743,12 @@ static void build_subroutines(BuildCtx * |.endif | stw ZERO, 12(BASE) | li RD, (3+1)*8 + |.if FPU | stfd f0, 0(RA) + |.else + | stw TMP0, 0(RA) + | stw TMP1, 4(RA) + |.endif | b ->fff_res | |//-- Base library: catch errors ---------------------------------------- @@ -1576,19 +1767,32 @@ static void build_subroutines(BuildCtx * | |.ffunc xpcall | cmplwi NARGS8:RC, 16 - | lwz CARG4, 8(BASE) + | lwz CARG3, 8(BASE) + |.if FPU | lfd FARG2, 8(BASE) | lfd FARG1, 0(BASE) + |.else + | lwz CARG1, 0(BASE) + | lwz CARG2, 4(BASE) + | lwz CARG4, 12(BASE) + |.endif | blt ->fff_fallback | lbz TMP1, DISPATCH_GL(hookmask)(DISPATCH) | mr TMP2, BASE - | checkfunc CARG4; bne ->fff_fallback // Traceback must be a function. + | checkfunc CARG3; bne ->fff_fallback // Traceback must be a function. 
| la BASE, 16(BASE) | // Remember active hook before pcall. | rlwinm TMP1, TMP1, 32-HOOK_ACTIVE_SHIFT, 31, 31 + |.if FPU | stfd FARG2, 0(TMP2) // Swap function and traceback. - | subi NARGS8:RC, NARGS8:RC, 16 | stfd FARG1, 8(TMP2) + |.else + | stw CARG3, 0(TMP2) + | stw CARG4, 4(TMP2) + | stw CARG1, 8(TMP2) + | stw CARG2, 12(TMP2) + |.endif + | subi NARGS8:RC, NARGS8:RC, 16 | addi PC, TMP1, 16+FRAME_PCALL | b ->vm_call_dispatch | @@ -1631,9 +1835,21 @@ static void build_subroutines(BuildCtx * | stp BASE, L->top |2: // Move args to coroutine. | cmpw TMP1, NARGS8:RC + |.if FPU | lfdx f0, BASE, TMP1 + |.else + | add CARG3, BASE, TMP1 + | lwz TMP2, 0(CARG3) + | lwz TMP3, 4(CARG3) + |.endif | beq >3 + |.if FPU | stfdx f0, CARG2, TMP1 + |.else + | add CARG3, CARG2, TMP1 + | stw TMP2, 0(CARG3) + | stw TMP3, 4(CARG3) + |.endif | addi TMP1, TMP1, 8 | b <2 |3: @@ -1664,8 +1880,17 @@ static void build_subroutines(BuildCtx * | stp TMP2, L:SAVE0->top // Clear coroutine stack. |5: // Move results from coroutine. | cmplw TMP1, TMP3 + |.if FPU | lfdx f0, TMP2, TMP1 | stfdx f0, BASE, TMP1 + |.else + | add CARG3, TMP2, TMP1 + | lwz CARG1, 0(CARG3) + | lwz CARG2, 4(CARG3) + | add CARG3, BASE, TMP1 + | stw CARG1, 0(CARG3) + | stw CARG2, 4(CARG3) + |.endif | addi TMP1, TMP1, 8 | bne <5 |6: @@ -1690,12 +1915,22 @@ static void build_subroutines(BuildCtx * | andix. TMP0, PC, FRAME_TYPE | la TMP3, -8(TMP3) | li TMP1, LJ_TFALSE + |.if FPU | lfd f0, 0(TMP3) + |.else + | lwz CARG1, 0(TMP3) + | lwz CARG2, 4(TMP3) + |.endif | stp TMP3, L:SAVE0->top // Remove error from coroutine stack. | li RD, (2+1)*8 | stw TMP1, -8(BASE) // Prepend false to results. | la RA, -8(BASE) + |.if FPU | stfd f0, 0(BASE) // Copy error message. + |.else + | stw CARG1, 0(BASE) // Copy error message. + | stw CARG2, 4(BASE) + |.endif | b <7 |.else | mr CARG1, L @@ -1874,7 +2109,12 @@ static void build_subroutines(BuildCtx * | lus CARG1, 0x8000 // -(2^31). 
| beqy ->fff_resi |5: + |.if FPU | lfd FARG1, 0(BASE) + |.else + | lwz CARG1, 0(BASE) + | lwz CARG2, 4(BASE) + |.endif | blex func | b ->fff_resn |.endmacro @@ -1898,10 +2138,14 @@ static void build_subroutines(BuildCtx * | |.ffunc math_log | cmplwi NARGS8:RC, 8 - | lwz CARG3, 0(BASE) - | lfd FARG1, 0(BASE) + | lwz CARG1, 0(BASE) | bne ->fff_fallback // Need exactly 1 argument. - | checknum CARG3; bge ->fff_fallback + | checknum CARG1; bge ->fff_fallback + |.if FPU + | lfd FARG1, 0(BASE) + |.else + | lwz CARG2, 4(BASE) + |.endif | blex log | b ->fff_resn | @@ -1923,17 +2167,24 @@ static void build_subroutines(BuildCtx * |.if DUALNUM |.ffunc math_ldexp | cmplwi NARGS8:RC, 16 - | lwz CARG3, 0(BASE) + | lwz TMP0, 0(BASE) + |.if FPU | lfd FARG1, 0(BASE) - | lwz CARG4, 8(BASE) + |.else + | lwz CARG1, 0(BASE) + | lwz CARG2, 4(BASE) + |.endif + | lwz TMP1, 8(BASE) |.if GPR64 | lwz CARG2, 12(BASE) - |.else + |.elif FPU | lwz CARG1, 12(BASE) + |.else + | lwz CARG3, 12(BASE) |.endif | blt ->fff_fallback - | checknum CARG3; bge ->fff_fallback - | checknum CARG4; bne ->fff_fallback + | checknum TMP0; bge ->fff_fallback + | checknum TMP1; bne ->fff_fallback |.else |.ffunc_nn math_ldexp |.if GPR64 @@ -1948,8 +2199,10 @@ static void build_subroutines(BuildCtx * |.ffunc_n math_frexp |.if GPR64 | la CARG2, DISPATCH_GL(tmptv)(DISPATCH) - |.else + |.elif FPU | la CARG1, DISPATCH_GL(tmptv)(DISPATCH) + |.else + | la CARG3, DISPATCH_GL(tmptv)(DISPATCH) |.endif | lwz PC, FRAME_PC(BASE) | blex frexp @@ -1958,7 +2211,12 @@ static void build_subroutines(BuildCtx * |.if not DUALNUM | tonum_i FARG2, TMP1 |.endif + |.if FPU | stfd FARG1, 0(RA) + |.else + | stw CRET1, 0(RA) + | stw CRET2, 4(RA) + |.endif | li RD, (2+1)*8 |.if DUALNUM | stw TISNUM, 8(RA) @@ -1971,13 +2229,20 @@ static void build_subroutines(BuildCtx * |.ffunc_n math_modf |.if GPR64 | la CARG2, -8(BASE) - |.else + |.elif FPU | la CARG1, -8(BASE) + |.else + | la CARG3, -8(BASE) |.endif | lwz PC, FRAME_PC(BASE) | blex modf | la RA, 
-8(BASE) + |.if FPU | stfd FARG1, 0(BASE) + |.else + | stw CRET1, 0(BASE) + | stw CRET2, 4(BASE) + |.endif | li RD, (2+1)*8 | b ->fff_res | @@ -1985,13 +2250,13 @@ static void build_subroutines(BuildCtx * |.if DUALNUM | .ffunc_1 name | checknum CARG3 - | addi TMP1, BASE, 8 - | add TMP2, BASE, NARGS8:RC + | addi SAVE0, BASE, 8 + | add SAVE1, BASE, NARGS8:RC | bne >4 |1: // Handle integers. - | lwz CARG4, 0(TMP1) - | cmplw cr1, TMP1, TMP2 - | lwz CARG2, 4(TMP1) + | lwz CARG4, 0(SAVE0) + | cmplw cr1, SAVE0, SAVE1 + | lwz CARG2, 4(SAVE0) | bge cr1, ->fff_resi | checknum CARG4 | xoris TMP0, CARG1, 0x8000 @@ -2008,36 +2273,76 @@ static void build_subroutines(BuildCtx * |.if GPR64 | rldicl CARG1, CARG1, 0, 32 |.endif - | addi TMP1, TMP1, 8 + | addi SAVE0, SAVE0, 8 | b <1 |3: | bge ->fff_fallback | // Convert intermediate result to number and continue below. + |.if FPU | tonum_i FARG1, CARG1 - | lfd FARG2, 0(TMP1) + | lfd FARG2, 0(SAVE0) + |.else + | mr CARG2, CARG1 + | bl ->vm_sfi2d_1 + | lwz CARG3, 0(SAVE0) + | lwz CARG4, 4(SAVE0) + |.endif | b >6 |4: + |.if FPU | lfd FARG1, 0(BASE) + |.else + | lwz CARG1, 0(BASE) + | lwz CARG2, 4(BASE) + |.endif | bge ->fff_fallback |5: // Handle numbers. 
- | lwz CARG4, 0(TMP1) - | cmplw cr1, TMP1, TMP2 - | lfd FARG2, 0(TMP1) + | lwz CARG3, 0(SAVE0) + | cmplw cr1, SAVE0, SAVE1 + |.if FPU + | lfd FARG2, 0(SAVE0) + |.else + | lwz CARG4, 4(SAVE0) + |.endif | bge cr1, ->fff_resn - | checknum CARG4; bge >7 + | checknum CARG3; bge >7 |6: + | addi SAVE0, SAVE0, 8 + |.if FPU | fsub f0, FARG1, FARG2 - | addi TMP1, TMP1, 8 |.if ismax | fsel FARG1, f0, FARG1, FARG2 |.else | fsel FARG1, f0, FARG2, FARG1 |.endif + |.else + | stw CARG1, SFSAVE_1 + | stw CARG2, SFSAVE_2 + | stw CARG3, SFSAVE_3 + | stw CARG4, SFSAVE_4 + | blex __ledf2 + | cmpwi CRET1, 0 + |.if ismax + | blt >8 + |.else + | bge >8 + |.endif + | lwz CARG1, SFSAVE_1 + | lwz CARG2, SFSAVE_2 + | b <5 + |8: + | lwz CARG1, SFSAVE_3 + | lwz CARG2, SFSAVE_4 + |.endif | b <5 |7: // Convert integer to number and continue above. - | lwz CARG2, 4(TMP1) + | lwz CARG3, 4(SAVE0) | bne ->fff_fallback - | tonum_i FARG2, CARG2 + |.if FPU + | tonum_i FARG2, CARG3 + |.else + | bl ->vm_sfi2d_2 + |.endif | b <6 |.else | .ffunc_n name @@ -2237,28 +2542,37 @@ static void build_subroutines(BuildCtx * | |.macro .ffunc_bit_op, name, ins | .ffunc_bit name - | addi TMP1, BASE, 8 - | add TMP2, BASE, NARGS8:RC + | addi SAVE0, BASE, 8 + | add SAVE1, BASE, NARGS8:RC |1: - | lwz CARG4, 0(TMP1) - | cmplw cr1, TMP1, TMP2 + | lwz CARG4, 0(SAVE0) + | cmplw cr1, SAVE0, SAVE1 |.if DUALNUM - | lwz CARG2, 4(TMP1) + | lwz CARG2, 4(SAVE0) |.else - | lfd FARG1, 0(TMP1) + | lfd FARG1, 0(SAVE0) |.endif | bgey cr1, ->fff_resi | checknum CARG4 |.if DUALNUM + |.if FPU | bnel ->fff_bitop_fb |.else + | beq >3 + | stw CARG1, SFSAVE_1 + | bl ->fff_bitop_fb + | mr CARG2, CARG1 + | lwz CARG1, SFSAVE_1 + |3: + |.endif + |.else | fadd FARG1, FARG1, TOBIT | bge ->fff_fallback | stfd FARG1, TMPD | lwz CARG2, TMPD_LO |.endif | ins CARG1, CARG1, CARG2 - | addi TMP1, TMP1, 8 + | addi SAVE0, SAVE0, 8 | b <1 |.endmacro | @@ -2280,7 +2594,14 @@ static void build_subroutines(BuildCtx * |.macro .ffunc_bit_sh, name, ins, shmod |.if 
DUALNUM | .ffunc_2 bit_..name + |.if FPU | checknum CARG3; bnel ->fff_tobit_fb + |.else + | checknum CARG3; beq >1 + | bl ->fff_tobit_fb + | lwz CARG2, 12(BASE) // Conversion polluted CARG2. + |1: + |.endif | // Note: no inline conversion from number for 2nd argument! | checknum CARG4; bne ->fff_fallback |.else @@ -2317,27 +2638,77 @@ static void build_subroutines(BuildCtx * |->fff_resn: | lwz PC, FRAME_PC(BASE) | la RA, -8(BASE) + |.if FPU | stfd FARG1, -8(BASE) + |.else + | stw CARG1, -8(BASE) + | stw CARG2, -4(BASE) + |.endif | b ->fff_res1 | |// Fallback FP number to bit conversion. |->fff_tobit_fb: |.if DUALNUM + |.if FPU | lfd FARG1, 0(BASE) | bgt ->fff_fallback | fadd FARG1, FARG1, TOBIT | stfd FARG1, TMPD | lwz CARG1, TMPD_LO | blr + |.else + | bgt ->fff_fallback + | mr CARG2, CARG1 + | mr CARG1, CARG3 + |// Modifies: CARG1, CARG2, TMP0, TMP1, TMP2. + |->vm_tobit: + | slwi TMP2, CARG1, 1 + | addis TMP2, TMP2, 0x0020 + | cmpwi TMP2, 0 + | bge >2 + | li TMP1, 0x3e0 + | srawi TMP2, TMP2, 21 + | not TMP1, TMP1 + | sub. TMP2, TMP1, TMP2 + | cmpwi cr7, CARG1, 0 + | blt >1 + | slwi TMP1, CARG1, 11 + | srwi TMP0, CARG2, 21 + | oris TMP1, TMP1, 0x8000 + | or TMP1, TMP1, TMP0 + | srw CARG1, TMP1, TMP2 + | bclr 4, 28 // Return if cr7[lt] == 0, no hint. + | neg CARG1, CARG1 + | blr + |1: + | addi TMP2, TMP2, 21 + | srw TMP1, CARG2, TMP2 + | slwi CARG2, CARG1, 12 + | subfic TMP2, TMP2, 20 + | slw TMP0, CARG2, TMP2 + | or CARG1, TMP1, TMP0 + | bclr 4, 28 // Return if cr7[lt] == 0, no hint. 
+ | neg CARG1, CARG1 + | blr + |2: + | li CARG1, 0 + | blr + |.endif |.endif |->fff_bitop_fb: |.if DUALNUM - | lfd FARG1, 0(TMP1) + |.if FPU + | lfd FARG1, 0(SAVE0) | bgt ->fff_fallback | fadd FARG1, FARG1, TOBIT | stfd FARG1, TMPD | lwz CARG2, TMPD_LO | blr + |.else + | bgt ->fff_fallback + | mr CARG1, CARG4 + | b ->vm_tobit + |.endif |.endif | |//----------------------------------------------------------------------- @@ -2530,10 +2901,21 @@ static void build_subroutines(BuildCtx * | decode_RA8 RC, INS // Call base. | beq >2 |1: // Move results down. + |.if FPU | lfd f0, 0(RA) + |.else + | lwz CARG1, 0(RA) + | lwz CARG2, 4(RA) + |.endif | addic. TMP1, TMP1, -8 | addi RA, RA, 8 + |.if FPU | stfdx f0, BASE, RC + |.else + | add CARG3, BASE, RC + | stw CARG1, 0(CARG3) + | stw CARG2, 4(CARG3) + |.endif | addi RC, RC, 8 | bne <1 |2: @@ -2586,10 +2968,12 @@ static void build_subroutines(BuildCtx * |//----------------------------------------------------------------------- | |.macro savex_, a, b, c, d + |.if FPU | stfd f..a, 16+a*8(sp) | stfd f..b, 16+b*8(sp) | stfd f..c, 16+c*8(sp) | stfd f..d, 16+d*8(sp) + |.endif |.endmacro | |->vm_exit_handler: @@ -2661,16 +3045,16 @@ static void build_subroutines(BuildCtx * | lwz KBASE, PC2PROTO(k)(TMP1) | // Setup type comparison constants. | li TISNUM, LJ_TISNUM - | lus TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float). - | stw TMP3, TMPD + | .FPU lus TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float). + | .FPU stw TMP3, TMPD | li ZERO, 0 - | ori TMP3, TMP3, 0x0004 // TONUM = 2^52 + 2^51 + 2^31 (float). - | lfs TOBIT, TMPD - | stw TMP3, TMPD - | lus TMP0, 0x4338 // Hiword of 2^52 + 2^51 (double) + | .FPU ori TMP3, TMP3, 0x0004 // TONUM = 2^52 + 2^51 + 2^31 (float). 
+ | .FPU lfs TOBIT, TMPD + | .FPU stw TMP3, TMPD + | .FPU lus TMP0, 0x4338 // Hiword of 2^52 + 2^51 (double) | li TISNIL, LJ_TNIL - | stw TMP0, TONUM_HI - | lfs TONUM, TMPD + | .FPU stw TMP0, TONUM_HI + | .FPU lfs TONUM, TMPD | // Modified copy of ins_next which handles function header dispatch, too. | lwz INS, 0(PC) | addi PC, PC, 4 @@ -2715,7 +3099,35 @@ static void build_subroutines(BuildCtx * |//-- Math helper functions ---------------------------------------------- |//----------------------------------------------------------------------- | - |// NYI: Use internal implementations of floor, ceil, trunc. + |// NYI: Use internal implementations of floor, ceil, trunc, sfcmp. + | + |.macro sfi2d, AHI, ALO + |.if not FPU + | mr. AHI, ALO + | bclr 12, 2 // Handle zero first. + | srawi TMP0, ALO, 31 + | xor TMP1, ALO, TMP0 + | sub TMP1, TMP1, TMP0 // Absolute value in TMP1. + | cntlzw AHI, TMP1 + | andix. TMP0, TMP0, 0x800 // Mask sign bit. + | slw TMP1, TMP1, AHI // Align mantissa left with leading 1. + | subfic AHI, AHI, 0x3ff+31-1 // Exponent -1 in AHI. + | slwi ALO, TMP1, 21 + | or AHI, AHI, TMP0 // Sign | Exponent. + | srwi TMP1, TMP1, 11 + | slwi AHI, AHI, 20 // Align left. + | add AHI, AHI, TMP1 // Add mantissa, increment exponent. + | blr + |.endif + |.endmacro + | + |// Input: CARG2. Output: CARG1, CARG2. Temporaries: TMP0, TMP1. + |->vm_sfi2d_1: + | sfi2d CARG1, CARG2 + | + |// Input: CARG4. Output: CARG3, CARG4. Temporaries: TMP0, TMP1. + |->vm_sfi2d_2: + | sfi2d CARG3, CARG4 | |->vm_modi: | divwo. 
TMP0, CARG1, CARG2 @@ -2783,21 +3195,21 @@ static void build_subroutines(BuildCtx * | addi DISPATCH, r12, GG_G2DISP | stw r11, CTSTATE->cb.slot | stw r3, CTSTATE->cb.gpr[0] - | stfd f1, CTSTATE->cb.fpr[0] + | .FPU stfd f1, CTSTATE->cb.fpr[0] | stw r4, CTSTATE->cb.gpr[1] - | stfd f2, CTSTATE->cb.fpr[1] + | .FPU stfd f2, CTSTATE->cb.fpr[1] | stw r5, CTSTATE->cb.gpr[2] - | stfd f3, CTSTATE->cb.fpr[2] + | .FPU stfd f3, CTSTATE->cb.fpr[2] | stw r6, CTSTATE->cb.gpr[3] - | stfd f4, CTSTATE->cb.fpr[3] + | .FPU stfd f4, CTSTATE->cb.fpr[3] | stw r7, CTSTATE->cb.gpr[4] - | stfd f5, CTSTATE->cb.fpr[4] + | .FPU stfd f5, CTSTATE->cb.fpr[4] | stw r8, CTSTATE->cb.gpr[5] - | stfd f6, CTSTATE->cb.fpr[5] + | .FPU stfd f6, CTSTATE->cb.fpr[5] | stw r9, CTSTATE->cb.gpr[6] - | stfd f7, CTSTATE->cb.fpr[6] + | .FPU stfd f7, CTSTATE->cb.fpr[6] | stw r10, CTSTATE->cb.gpr[7] - | stfd f8, CTSTATE->cb.fpr[7] + | .FPU stfd f8, CTSTATE->cb.fpr[7] | addi TMP0, sp, CFRAME_SPACE+8 | stw TMP0, CTSTATE->cb.stack | mr CARG1, CTSTATE @@ -2808,21 +3220,21 @@ static void build_subroutines(BuildCtx * | lp BASE, L:CRET1->base | li TISNUM, LJ_TISNUM // Setup type comparison constants. | lp RC, L:CRET1->top - | lus TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float). + | .FPU lus TMP3, 0x59c0 // TOBIT = 2^52 + 2^51 (float). | li ZERO, 0 | mr L, CRET1 - | stw TMP3, TMPD - | lus TMP0, 0x4338 // Hiword of 2^52 + 2^51 (double) + | .FPU stw TMP3, TMPD + | .FPU lus TMP0, 0x4338 // Hiword of 2^52 + 2^51 (double) | lwz LFUNC:RB, FRAME_FUNC(BASE) - | ori TMP3, TMP3, 0x0004 // TONUM = 2^52 + 2^51 + 2^31 (float). - | stw TMP0, TONUM_HI + | .FPU ori TMP3, TMP3, 0x0004 // TONUM = 2^52 + 2^51 + 2^31 (float). 
+ | .FPU stw TMP0, TONUM_HI | li TISNIL, LJ_TNIL | li_vmstate INTERP - | lfs TOBIT, TMPD - | stw TMP3, TMPD + | .FPU lfs TOBIT, TMPD + | .FPU stw TMP3, TMPD | sub RC, RC, BASE | st_vmstate - | lfs TONUM, TMPD + | .FPU lfs TONUM, TMPD | ins_callt |.endif | @@ -2836,7 +3248,7 @@ static void build_subroutines(BuildCtx * | mr CARG2, RA | bl extern lj_ccallback_leave // (CTState *cts, TValue *o) | lwz CRET1, CTSTATE->cb.gpr[0] - | lfd FARG1, CTSTATE->cb.fpr[0] + | .FPU lfd FARG1, CTSTATE->cb.fpr[0] | lwz CRET2, CTSTATE->cb.gpr[1] | b ->vm_leave_unw |.endif @@ -2870,14 +3282,14 @@ static void build_subroutines(BuildCtx * | bge <1 |2: | bney cr1, >3 - | lfd f1, CCSTATE->fpr[0] - | lfd f2, CCSTATE->fpr[1] - | lfd f3, CCSTATE->fpr[2] - | lfd f4, CCSTATE->fpr[3] - | lfd f5, CCSTATE->fpr[4] - | lfd f6, CCSTATE->fpr[5] - | lfd f7, CCSTATE->fpr[6] - | lfd f8, CCSTATE->fpr[7] + | .FPU lfd f1, CCSTATE->fpr[0] + | .FPU lfd f2, CCSTATE->fpr[1] + | .FPU lfd f3, CCSTATE->fpr[2] + | .FPU lfd f4, CCSTATE->fpr[3] + | .FPU lfd f5, CCSTATE->fpr[4] + | .FPU lfd f6, CCSTATE->fpr[5] + | .FPU lfd f7, CCSTATE->fpr[6] + | .FPU lfd f8, CCSTATE->fpr[7] |3: | lp TMP0, CCSTATE->func | lwz CARG2, CCSTATE->gpr[1] @@ -2894,7 +3306,7 @@ static void build_subroutines(BuildCtx * | lwz TMP2, -4(r14) | lwz TMP0, 4(r14) | stw CARG1, CCSTATE:TMP1->gpr[0] - | stfd FARG1, CCSTATE:TMP1->fpr[0] + | .FPU stfd FARG1, CCSTATE:TMP1->fpr[0] | stw CARG2, CCSTATE:TMP1->gpr[1] | mtlr TMP0 | stw CARG3, CCSTATE:TMP1->gpr[2] @@ -2923,19 +3335,19 @@ static void build_ins(BuildCtx *ctx, BCO case BC_ISLT: case BC_ISGE: case BC_ISLE: case BC_ISGT: | // RA = src1*8, RD = src2*8, JMP with RD = target |.if DUALNUM - | lwzux TMP0, RA, BASE + | lwzux CARG1, RA, BASE | addi PC, PC, 4 | lwz CARG2, 4(RA) - | lwzux TMP1, RD, BASE + | lwzux CARG3, RD, BASE | lwz TMP2, -4(PC) - | checknum cr0, TMP0 - | lwz CARG3, 4(RD) + | checknum cr0, CARG1 + | lwz CARG4, 4(RD) | decode_RD4 TMP2, TMP2 - | checknum cr1, TMP1 - | addis TMP2, TMP2, 
-(BCBIAS_J*4 >> 16) + | checknum cr1, CARG3 + | addis SAVE0, TMP2, -(BCBIAS_J*4 >> 16) | bne cr0, >7 | bne cr1, >8 - | cmpw CARG2, CARG3 + | cmpw CARG2, CARG4 if (op == BC_ISLT) { | bge >2 } else if (op == BC_ISGE) { @@ -2946,28 +3358,41 @@ static void build_ins(BuildCtx *ctx, BCO | ble >2 } |1: - | add PC, PC, TMP2 + | add PC, PC, SAVE0 |2: | ins_next | |7: // RA is not an integer. | bgt cr0, ->vmeta_comp | // RA is a number. - | lfd f0, 0(RA) + | .FPU lfd f0, 0(RA) | bgt cr1, ->vmeta_comp | blt cr1, >4 | // RA is a number, RD is an integer. - | tonum_i f1, CARG3 + |.if FPU + | tonum_i f1, CARG4 + |.else + | bl ->vm_sfi2d_2 + |.endif | b >5 | |8: // RA is an integer, RD is not an integer. | bgt cr1, ->vmeta_comp | // RA is an integer, RD is a number. + |.if FPU | tonum_i f0, CARG2 + |.else + | bl ->vm_sfi2d_1 + |.endif |4: - | lfd f1, 0(RD) + | .FPU lfd f1, 0(RD) |5: + |.if FPU | fcmpu cr0, f0, f1 + |.else + | blex __ledf2 + | cmpwi CRET1, 0 + |.endif if (op == BC_ISLT) { | bge <2 } else if (op == BC_ISGE) { @@ -3015,42 +3440,42 @@ static void build_ins(BuildCtx *ctx, BCO vk = op == BC_ISEQV; | // RA = src1*8, RD = src2*8, JMP with RD = target |.if DUALNUM - | lwzux TMP0, RA, BASE + | lwzux CARG1, RA, BASE | addi PC, PC, 4 | lwz CARG2, 4(RA) - | lwzux TMP1, RD, BASE - | checknum cr0, TMP0 - | lwz TMP2, -4(PC) - | checknum cr1, TMP1 - | decode_RD4 TMP2, TMP2 - | lwz CARG3, 4(RD) + | lwzux CARG3, RD, BASE + | checknum cr0, CARG1 + | lwz SAVE0, -4(PC) + | checknum cr1, CARG3 + | decode_RD4 SAVE0, SAVE0 + | lwz CARG4, 4(RD) | cror 4*cr7+gt, 4*cr0+gt, 4*cr1+gt - | addis TMP2, TMP2, -(BCBIAS_J*4 >> 16) + | addis SAVE0, SAVE0, -(BCBIAS_J*4 >> 16) if (vk) { | ble cr7, ->BC_ISEQN_Z } else { | ble cr7, ->BC_ISNEN_Z } |.else - | lwzux TMP0, RA, BASE - | lwz TMP2, 0(PC) + | lwzux CARG1, RA, BASE + | lwz SAVE0, 0(PC) | lfd f0, 0(RA) | addi PC, PC, 4 - | lwzux TMP1, RD, BASE - | checknum cr0, TMP0 - | decode_RD4 TMP2, TMP2 + | lwzux CARG3, RD, BASE + | checknum cr0, CARG1 + | 
decode_RD4 SAVE0, SAVE0 | lfd f1, 0(RD) - | checknum cr1, TMP1 - | addis TMP2, TMP2, -(BCBIAS_J*4 >> 16) + | checknum cr1, CARG3 + | addis SAVE0, SAVE0, -(BCBIAS_J*4 >> 16) | bge cr0, >5 | bge cr1, >5 | fcmpu cr0, f0, f1 if (vk) { | bne >1 - | add PC, PC, TMP2 + | add PC, PC, SAVE0 } else { | beq >1 - | add PC, PC, TMP2 + | add PC, PC, SAVE0 } |1: | ins_next @@ -3058,36 +3483,36 @@ static void build_ins(BuildCtx *ctx, BCO |5: // Either or both types are not numbers. |.if not DUALNUM | lwz CARG2, 4(RA) - | lwz CARG3, 4(RD) + | lwz CARG4, 4(RD) |.endif |.if FFI - | cmpwi cr7, TMP0, LJ_TCDATA - | cmpwi cr5, TMP1, LJ_TCDATA + | cmpwi cr7, CARG1, LJ_TCDATA + | cmpwi cr5, CARG3, LJ_TCDATA |.endif - | not TMP3, TMP0 - | cmplw TMP0, TMP1 - | cmplwi cr1, TMP3, ~LJ_TISPRI // Primitive? + | not TMP2, CARG1 + | cmplw CARG1, CARG3 + | cmplwi cr1, TMP2, ~LJ_TISPRI // Primitive? |.if FFI | cror 4*cr7+eq, 4*cr7+eq, 4*cr5+eq |.endif - | cmplwi cr6, TMP3, ~LJ_TISTABUD // Table or userdata? + | cmplwi cr6, TMP2, ~LJ_TISTABUD // Table or userdata? |.if FFI | beq cr7, ->vmeta_equal_cd |.endif - | cmplw cr5, CARG2, CARG3 + | cmplw cr5, CARG2, CARG4 | crandc 4*cr0+gt, 4*cr0+eq, 4*cr1+gt // 2: Same type and primitive. | crorc 4*cr0+lt, 4*cr5+eq, 4*cr0+eq // 1: Same tv or different type. | crand 4*cr0+eq, 4*cr0+eq, 4*cr5+eq // 0: Same type and same tv. - | mr SAVE0, PC + | mr SAVE1, PC | cror 4*cr0+eq, 4*cr0+eq, 4*cr0+gt // 0 or 2. | cror 4*cr0+lt, 4*cr0+lt, 4*cr0+gt // 1 or 2. if (vk) { | bne cr0, >6 - | add PC, PC, TMP2 + | add PC, PC, SAVE0 |6: } else { | beq cr0, >6 - | add PC, PC, TMP2 + | add PC, PC, SAVE0 |6: } |.if DUALNUM @@ -3102,6 +3527,7 @@ static void build_ins(BuildCtx *ctx, BCO | | // Different tables or userdatas. Need to check __eq metamethod. | // Field metatable must be at same offset for GCtab and GCudata! + | mr CARG3, CARG4 | lwz TAB:TMP2, TAB:CARG2->metatable | li CARG4, 1-vk // ne = 0 or 1. 
| cmplwi TAB:TMP2, 0 @@ -3109,7 +3535,7 @@ static void build_ins(BuildCtx *ctx, BCO | lbz TMP2, TAB:TMP2->nomm | andix. TMP2, TMP2, 1<<MM_eq | bne <1 // Or 'no __eq' flag set? - | mr PC, SAVE0 // Restore old PC. + | mr PC, SAVE1 // Restore old PC. | b ->vmeta_equal // Handle __eq metamethod. break; @@ -3150,16 +3576,16 @@ static void build_ins(BuildCtx *ctx, BCO vk = op == BC_ISEQN; | // RA = src*8, RD = num_const*8, JMP with RD = target |.if DUALNUM - | lwzux TMP0, RA, BASE + | lwzux CARG1, RA, BASE | addi PC, PC, 4 | lwz CARG2, 4(RA) - | lwzux TMP1, RD, KBASE - | checknum cr0, TMP0 - | lwz TMP2, -4(PC) - | checknum cr1, TMP1 - | decode_RD4 TMP2, TMP2 - | lwz CARG3, 4(RD) - | addis TMP2, TMP2, -(BCBIAS_J*4 >> 16) + | lwzux CARG3, RD, KBASE + | checknum cr0, CARG1 + | lwz SAVE0, -4(PC) + | checknum cr1, CARG3 + | decode_RD4 SAVE0, SAVE0 + | lwz CARG4, 4(RD) + | addis SAVE0, SAVE0, -(BCBIAS_J*4 >> 16) if (vk) { |->BC_ISEQN_Z: } else { @@ -3167,7 +3593,7 @@ static void build_ins(BuildCtx *ctx, BCO } | bne cr0, >7 | bne cr1, >8 - | cmpw CARG2, CARG3 + | cmpw CARG2, CARG4 |4: |.else if (vk) { @@ -3175,20 +3601,20 @@ static void build_ins(BuildCtx *ctx, BCO } else { |->BC_ISNEN_Z: // Dummy label. } - | lwzx TMP0, BASE, RA + | lwzx CARG1, BASE, RA | addi PC, PC, 4 | lfdx f0, BASE, RA - | lwz TMP2, -4(PC) + | lwz SAVE0, -4(PC) | lfdx f1, KBASE, RD - | decode_RD4 TMP2, TMP2 - | checknum TMP0 - | addis TMP2, TMP2, -(BCBIAS_J*4 >> 16) + | decode_RD4 SAVE0, SAVE0 + | checknum CARG1 + | addis SAVE0, SAVE0, -(BCBIAS_J*4 >> 16) | bge >3 | fcmpu cr0, f0, f1 |.endif if (vk) { | bne >1 - | add PC, PC, TMP2 + | add PC, PC, SAVE0 |1: |.if not FFI |3: @@ -3199,13 +3625,13 @@ static void build_ins(BuildCtx *ctx, BCO |.if not FFI |3: |.endif - | add PC, PC, TMP2 + | add PC, PC, SAVE0 |2: } | ins_next |.if FFI |3: - | cmpwi TMP0, LJ_TCDATA + | cmpwi CARG1, LJ_TCDATA | beq ->vmeta_equal_cd | b <1 |.endif @@ -3213,18 +3639,31 @@ static void build_ins(BuildCtx *ctx, BCO |7: // RA is not an integer. | bge cr0, <3 | // RA is a number. 
- | lfd f0, 0(RA) + | .FPU lfd f0, 0(RA) | blt cr1, >1 | // RA is a number, RD is an integer. - | tonum_i f1, CARG3 + |.if FPU + | tonum_i f1, CARG4 + |.else + | bl ->vm_sfi2d_2 + |.endif | b >2 | |8: // RA is an integer, RD is a number. + |.if FPU | tonum_i f0, CARG2 + |.else + | bl ->vm_sfi2d_1 + |.endif |1: - | lfd f1, 0(RD) + | .FPU lfd f1, 0(RD) |2: + |.if FPU | fcmpu cr0, f0, f1 + |.else + | blex __ledf2 + | cmpwi CRET1, 0 + |.endif | b <4 |.endif break; @@ -3279,7 +3718,12 @@ static void build_ins(BuildCtx *ctx, BCO | add PC, PC, TMP2 } else { | li TMP1, LJ_TFALSE + |.if FPU | lfdx f0, BASE, RD + |.else + | lwzux CARG1, RD, BASE + | lwz CARG2, 4(RD) + |.endif | cmplw TMP0, TMP1 if (op == BC_ISTC) { | bge >1 @@ -3288,7 +3732,12 @@ static void build_ins(BuildCtx *ctx, BCO } | addis PC, PC, -(BCBIAS_J*4 >> 16) | decode_RD4 TMP2, INS + |.if FPU | stfdx f0, BASE, RA + |.else + | stwux CARG1, RA, BASE + | stw CARG2, 4(RA) + |.endif | add PC, PC, TMP2 |1: } @@ -3323,8 +3772,15 @@ static void build_ins(BuildCtx *ctx, BCO case BC_MOV: | // RA = dst*8, RD = src*8 | ins_next1 + |.if FPU | lfdx f0, BASE, RD | stfdx f0, BASE, RA + |.else + | lwzux TMP0, RD, BASE + | lwz TMP1, 4(RD) + | stwux TMP0, RA, BASE + | stw TMP1, 4(RA) + |.endif | ins_next2 break; case BC_NOT: @@ -3426,44 +3882,65 @@ static void build_ins(BuildCtx *ctx, BCO ||vk = ((int)op - BC_ADDVN) / (BC_ADDNV-BC_ADDVN); ||switch (vk) { ||case 0: - | lwzx TMP1, BASE, RB + | lwzx CARG1, BASE, RB | .if DUALNUM - | lwzx TMP2, KBASE, RC + | lwzx CARG3, KBASE, RC | .endif + | .if FPU | lfdx f14, BASE, RB | lfdx f15, KBASE, RC + | .else + | add TMP1, BASE, RB + | add TMP2, KBASE, RC + | lwz CARG2, 4(TMP1) + | lwz CARG4, 4(TMP2) + | .endif | .if DUALNUM - | checknum cr0, TMP1 - | checknum cr1, TMP2 + | checknum cr0, CARG1 + | checknum cr1, CARG3 | crand 4*cr0+lt, 4*cr0+lt, 4*cr1+lt | bge ->vmeta_arith_vn | .else - | checknum TMP1; bge ->vmeta_arith_vn + | checknum CARG1; bge ->vmeta_arith_vn | .endif || break; ||case 
1: - | lwzx TMP1, BASE, RB + | lwzx CARG1, BASE, RB | .if DUALNUM - | lwzx TMP2, KBASE, RC + | lwzx CARG3, KBASE, RC | .endif + | .if FPU | lfdx f15, BASE, RB | lfdx f14, KBASE, RC + | .else + | add TMP1, BASE, RB + | add TMP2, KBASE, RC + | lwz CARG2, 4(TMP1) + | lwz CARG4, 4(TMP2) + | .endif | .if DUALNUM - | checknum cr0, TMP1 - | checknum cr1, TMP2 + | checknum cr0, CARG1 + | checknum cr1, CARG3 | crand 4*cr0+lt, 4*cr0+lt, 4*cr1+lt | bge ->vmeta_arith_nv | .else - | checknum TMP1; bge ->vmeta_arith_nv + | checknum CARG1; bge ->vmeta_arith_nv | .endif || break; ||default: - | lwzx TMP1, BASE, RB - | lwzx TMP2, BASE, RC + | lwzx CARG1, BASE, RB + | lwzx CARG3, BASE, RC + | .if FPU | lfdx f14, BASE, RB | lfdx f15, BASE, RC - | checknum cr0, TMP1 - | checknum cr1, TMP2 + | .else + | add TMP1, BASE, RB + | add TMP2, BASE, RC + | lwz CARG2, 4(TMP1) + | lwz CARG4, 4(TMP2) + | .endif + | checknum cr0, CARG1 + | checknum cr1, CARG3 | crand 4*cr0+lt, 4*cr0+lt, 4*cr1+lt | bge ->vmeta_arith_vv || break; @@ -3497,48 +3974,78 @@ static void build_ins(BuildCtx *ctx, BCO | fsub a, b, a // b - floor(b/c)*c |.endmacro | + |.macro sfpmod + |->BC_MODVN_Z: + | stw CARG1, SFSAVE_1 + | stw CARG2, SFSAVE_2 + | mr SAVE0, CARG3 + | mr SAVE1, CARG4 + | blex __divdf3 + | blex floor + | mr CARG3, SAVE0 + | mr CARG4, SAVE1 + | blex __muldf3 + | mr CARG3, CRET1 + | mr CARG4, CRET2 + | lwz CARG1, SFSAVE_1 + | lwz CARG2, SFSAVE_2 + | blex __subdf3 + |.endmacro + | |.macro ins_arithfp, fpins | ins_arithpre |.if "fpins" == "fpmod_" | b ->BC_MODVN_Z // Avoid 3 copies. It's slow anyway. - |.else + |.elif FPU | fpins f0, f14, f15 | ins_next1 | stfdx f0, BASE, RA | ins_next2 + |.else + | blex __divdf3 // Only soft-float div uses this macro. 
+ | ins_next1 + | stwux CRET1, RA, BASE + | stw CRET2, 4(RA) + | ins_next2 |.endif |.endmacro | - |.macro ins_arithdn, intins, fpins + |.macro ins_arithdn, intins, fpins, fpcall | // RA = dst*8, RB = src1*8, RC = src2*8 | num_const*8 ||vk = ((int)op - BC_ADDVN) / (BC_ADDNV-BC_ADDVN); ||switch (vk) { ||case 0: - | lwzux TMP1, RB, BASE - | lwzux TMP2, RC, KBASE - | lwz CARG1, 4(RB) - | checknum cr0, TMP1 - | lwz CARG2, 4(RC) + | lwzux CARG1, RB, BASE + | lwzux CARG3, RC, KBASE + | lwz CARG2, 4(RB) + | checknum cr0, CARG1 + | lwz CARG4, 4(RC) + | checknum cr1, CARG3 || break; ||case 1: - | lwzux TMP1, RB, BASE - | lwzux TMP2, RC, KBASE - | lwz CARG2, 4(RB) - | checknum cr0, TMP1 - | lwz CARG1, 4(RC) + | lwzux CARG3, RB, BASE + | lwzux CARG1, RC, KBASE + | lwz CARG4, 4(RB) + | checknum cr0, CARG3 + | lwz CARG2, 4(RC) + | checknum cr1, CARG1 || break; ||default: - | lwzux TMP1, RB, BASE - | lwzux TMP2, RC, BASE - | lwz CARG1, 4(RB) - | checknum cr0, TMP1 - | lwz CARG2, 4(RC) + | lwzux CARG1, RB, BASE + | lwzux CARG3, RC, BASE + | lwz CARG2, 4(RB) + | checknum cr0, CARG1 + | lwz CARG4, 4(RC) + | checknum cr1, CARG3 || break; ||} - | checknum cr1, TMP2 | bne >5 | bne cr1, >5 - | intins CARG1, CARG1, CARG2 + |.if "intins" == "intmod" + | mr CARG1, CARG2 + | mr CARG2, CARG4 + |.endif + | intins CARG1, CARG2, CARG4 | bso >4 |1: | ins_next1 @@ -3550,29 +4057,40 @@ static void build_ins(BuildCtx *ctx, BCO | checkov TMP0, <1 // Ignore unrelated overflow. | ins_arithfallback b |5: // FP variant. + |.if FPU ||if (vk == 1) { | lfd f15, 0(RB) - | crand 4*cr0+lt, 4*cr0+lt, 4*cr1+lt | lfd f14, 0(RC) ||} else { | lfd f14, 0(RB) - | crand 4*cr0+lt, 4*cr0+lt, 4*cr1+lt | lfd f15, 0(RC) ||} + |.endif + | crand 4*cr0+lt, 4*cr0+lt, 4*cr1+lt | ins_arithfallback bge |.if "fpins" == "fpmod_" | b ->BC_MODVN_Z // Avoid 3 copies. It's slow anyway. 
|.else + |.if FPU | fpins f0, f14, f15 - | ins_next1 | stfdx f0, BASE, RA + |.else + |.if "fpcall" == "sfpmod" + | sfpmod + |.else + | blex fpcall + |.endif + | stwux CRET1, RA, BASE + | stw CRET2, 4(RA) + |.endif + | ins_next1 | b <2 |.endif |.endmacro | - |.macro ins_arith, intins, fpins + |.macro ins_arith, intins, fpins, fpcall |.if DUALNUM - | ins_arithdn intins, fpins + | ins_arithdn intins, fpins, fpcall |.else | ins_arithfp fpins |.endif @@ -3587,9 +4105,9 @@ static void build_ins(BuildCtx *ctx, BCO | addo. TMP0, TMP0, TMP3 | add y, a, b |.endmacro - | ins_arith addo32., fadd + | ins_arith addo32., fadd, __adddf3 |.else - | ins_arith addo., fadd + | ins_arith addo., fadd, __adddf3 |.endif break; case BC_SUBVN: case BC_SUBNV: case BC_SUBVV: @@ -3601,36 +4119,48 @@ static void build_ins(BuildCtx *ctx, BCO | subo. TMP0, TMP0, TMP3 | sub y, a, b |.endmacro - | ins_arith subo32., fsub + | ins_arith subo32., fsub, __subdf3 |.else - | ins_arith subo., fsub + | ins_arith subo., fsub, __subdf3 |.endif break; case BC_MULVN: case BC_MULNV: case BC_MULVV: - | ins_arith mullwo., fmul + | ins_arith mullwo., fmul, __muldf3 break; case BC_DIVVN: case BC_DIVNV: case BC_DIVVV: | ins_arithfp fdiv break; case BC_MODVN: - | ins_arith intmod, fpmod + | ins_arith intmod, fpmod, sfpmod break; case BC_MODNV: case BC_MODVV: - | ins_arith intmod, fpmod_ + | ins_arith intmod, fpmod_, sfpmod break; case BC_POW: | // NYI: (partial) integer arithmetic. 
- | lwzx TMP1, BASE, RB + | lwzx CARG1, BASE, RB + | lwzx CARG3, BASE, RC + |.if FPU | lfdx FARG1, BASE, RB - | lwzx TMP2, BASE, RC | lfdx FARG2, BASE, RC - | checknum cr0, TMP1 - | checknum cr1, TMP2 + |.else + | add TMP1, BASE, RB + | add TMP2, BASE, RC + | lwz CARG2, 4(TMP1) + | lwz CARG4, 4(TMP2) + |.endif + | checknum cr0, CARG1 + | checknum cr1, CARG3 | crand 4*cr0+lt, 4*cr0+lt, 4*cr1+lt | bge ->vmeta_arith_vv | blex pow | ins_next1 + |.if FPU | stfdx FARG1, BASE, RA + |.else + | stwux CARG1, RA, BASE + | stw CARG2, 4(RA) + |.endif | ins_next2 break; @@ -3650,8 +4180,15 @@ static void build_ins(BuildCtx *ctx, BCO | lp BASE, L->base | bne ->vmeta_binop | ins_next1 + |.if FPU | lfdx f0, BASE, SAVE0 // Copy result from RB to RA. | stfdx f0, BASE, RA + |.else + | lwzux TMP0, SAVE0, BASE + | lwz TMP1, 4(SAVE0) + | stwux TMP0, RA, BASE + | stw TMP1, 4(RA) + |.endif | ins_next2 break; @@ -3714,8 +4251,15 @@ static void build_ins(BuildCtx *ctx, BCO case BC_KNUM: | // RA = dst*8, RD = num_const*8 | ins_next1 + |.if FPU | lfdx f0, KBASE, RD | stfdx f0, BASE, RA + |.else + | lwzux TMP0, RD, KBASE + | lwz TMP1, 4(RD) + | stwux TMP0, RA, BASE + | stw TMP1, 4(RA) + |.endif | ins_next2 break; case BC_KPRI: @@ -3748,8 +4292,15 @@ static void build_ins(BuildCtx *ctx, BCO | lwzx UPVAL:RB, LFUNC:RB, RD | ins_next1 | lwz TMP1, UPVAL:RB->v + |.if FPU | lfd f0, 0(TMP1) | stfdx f0, BASE, RA + |.else + | lwz TMP2, 0(TMP1) + | lwz TMP3, 4(TMP1) + | stwux TMP2, RA, BASE + | stw TMP3, 4(RA) + |.endif | ins_next2 break; case BC_USETV: @@ -3757,14 +4308,24 @@ static void build_ins(BuildCtx *ctx, BCO | lwz LFUNC:RB, FRAME_FUNC(BASE) | srwi RA, RA, 1 | addi RA, RA, offsetof(GCfuncL, uvptr) + |.if FPU | lfdux f0, RD, BASE + |.else + | lwzux CARG1, RD, BASE + | lwz CARG3, 4(RD) + |.endif | lwzx UPVAL:RB, LFUNC:RB, RA | lbz TMP3, UPVAL:RB->marked | lwz CARG2, UPVAL:RB->v | andix. 
TMP3, TMP3, LJ_GC_BLACK // isblack(uv) | lbz TMP0, UPVAL:RB->closed | lwz TMP2, 0(RD) + |.if FPU | stfd f0, 0(CARG2) + |.else + | stw CARG1, 0(CARG2) + | stw CARG3, 4(CARG2) + |.endif | cmplwi cr1, TMP0, 0 | lwz TMP1, 4(RD) | cror 4*cr0+eq, 4*cr0+eq, 4*cr1+eq @@ -3820,11 +4381,21 @@ static void build_ins(BuildCtx *ctx, BCO | lwz LFUNC:RB, FRAME_FUNC(BASE) | srwi RA, RA, 1 | addi RA, RA, offsetof(GCfuncL, uvptr) + |.if FPU | lfdx f0, KBASE, RD + |.else + | lwzux TMP2, RD, KBASE + | lwz TMP3, 4(RD) + |.endif | lwzx UPVAL:RB, LFUNC:RB, RA | ins_next1 | lwz TMP1, UPVAL:RB->v + |.if FPU | stfd f0, 0(TMP1) + |.else + | stw TMP2, 0(TMP1) + | stw TMP3, 4(TMP1) + |.endif | ins_next2 break; case BC_USETP: @@ -3972,11 +4543,21 @@ static void build_ins(BuildCtx *ctx, BCO |.endif | ble ->vmeta_tgetv // Integer key and in array part? | lwzx TMP0, TMP1, TMP2 + |.if FPU | lfdx f14, TMP1, TMP2 + |.else + | lwzux SAVE0, TMP1, TMP2 + | lwz SAVE1, 4(TMP1) + |.endif | checknil TMP0; beq >2 |1: | ins_next1 + |.if FPU | stfdx f14, BASE, RA + |.else + | stwux SAVE0, RA, BASE + | stw SAVE1, 4(RA) + |.endif | ins_next2 | |2: // Check for __index if table value is nil. @@ -4052,12 +4633,22 @@ static void build_ins(BuildCtx *ctx, BCO | lwz TMP1, TAB:RB->asize | lwz TMP2, TAB:RB->array | cmplw TMP0, TMP1; bge ->vmeta_tgetb + |.if FPU | lwzx TMP1, TMP2, RC | lfdx f0, TMP2, RC + |.else + | lwzux TMP1, TMP2, RC + | lwz TMP3, 4(TMP2) + |.endif | checknil TMP1; beq >5 |1: | ins_next1 + |.if FPU | stfdx f0, BASE, RA + |.else + | stwux TMP1, RA, BASE + | stw TMP3, 4(RA) + |.endif | ins_next2 | |5: // Check for __index if table value is nil. @@ -4087,10 +4678,20 @@ static void build_ins(BuildCtx *ctx, BCO | cmplw TMP0, CARG2 | slwi TMP2, CARG2, 3 | ble ->vmeta_tgetr // In array part? 
+ |.if FPU | lfdx f14, TMP1, TMP2 + |.else + | lwzux SAVE0, TMP2, TMP1 + | lwz SAVE1, 4(TMP2) + |.endif |->BC_TGETR_Z: | ins_next1 + |.if FPU | stfdx f14, BASE, RA + |.else + | stwux SAVE0, RA, BASE + | stw SAVE1, 4(RA) + |.endif | ins_next2 break; @@ -4131,11 +4732,22 @@ static void build_ins(BuildCtx *ctx, BCO | ble ->vmeta_tsetv // Integer key and in array part? | lwzx TMP2, TMP1, TMP0 | lbz TMP3, TAB:RB->marked + |.if FPU | lfdx f14, BASE, RA + |.else + | add SAVE1, BASE, RA + | lwz SAVE0, 0(SAVE1) + | lwz SAVE1, 4(SAVE1) + |.endif | checknil TMP2; beq >3 |1: | andix. TMP2, TMP3, LJ_GC_BLACK // isblack(table) + |.if FPU | stfdx f14, TMP1, TMP0 + |.else + | stwux SAVE0, TMP1, TMP0 + | stw SAVE1, 4(TMP1) + |.endif | bne >7 |2: | ins_next @@ -4176,7 +4788,13 @@ static void build_ins(BuildCtx *ctx, BCO | lwz NODE:TMP2, TAB:RB->node | stb ZERO, TAB:RB->nomm // Clear metamethod cache. | and TMP1, TMP1, TMP0 // idx = str->hash & tab->hmask + |.if FPU | lfdx f14, BASE, RA + |.else + | add CARG2, BASE, RA + | lwz SAVE0, 0(CARG2) + | lwz SAVE1, 4(CARG2) + |.endif | slwi TMP0, TMP1, 5 | slwi TMP1, TMP1, 3 | sub TMP1, TMP0, TMP1 @@ -4192,7 +4810,12 @@ static void build_ins(BuildCtx *ctx, BCO | checknil CARG2; beq >4 // Key found, but nil value? |2: | andix. TMP0, TMP3, LJ_GC_BLACK // isblack(table) + |.if FPU | stfd f14, NODE:TMP2->val + |.else + | stw SAVE0, NODE:TMP2->val.u32.hi + | stw SAVE1, NODE:TMP2->val.u32.lo + |.endif | bne >7 |3: | ins_next @@ -4231,7 +4854,12 @@ static void build_ins(BuildCtx *ctx, BCO | bl extern lj_tab_newkey // (lua_State *L, GCtab *t, TValue *k) | // Returns TValue *. | lp BASE, L->base + |.if FPU | stfd f14, 0(CRET1) + |.else + | stw SAVE0, 0(CRET1) + | stw SAVE1, 4(CRET1) + |.endif | b <3 // No 2nd write barrier needed. | |7: // Possible table write barrier for the value. Skip valiswhite check. 
@@ -4248,13 +4876,24 @@ static void build_ins(BuildCtx *ctx, BCO | lwz TMP2, TAB:RB->array | lbz TMP3, TAB:RB->marked | cmplw TMP0, TMP1 + |.if FPU | lfdx f14, BASE, RA + |.else + | add CARG2, BASE, RA + | lwz SAVE0, 0(CARG2) + | lwz SAVE1, 4(CARG2) + |.endif | bge ->vmeta_tsetb | lwzx TMP1, TMP2, RC | checknil TMP1; beq >5 |1: | andix. TMP0, TMP3, LJ_GC_BLACK // isblack(table) + |.if FPU | stfdx f14, TMP2, RC + |.else + | stwux SAVE0, RC, TMP2 + | stw SAVE1, 4(RC) + |.endif | bne >7 |2: | ins_next @@ -4294,10 +4933,20 @@ static void build_ins(BuildCtx *ctx, BCO |2: | cmplw TMP0, CARG3 | slwi TMP2, CARG3, 3 + |.if FPU | lfdx f14, BASE, RA + |.else + | lwzux SAVE0, RA, BASE + | lwz SAVE1, 4(RA) + |.endif | ble ->vmeta_tsetr // In array part? | ins_next1 + |.if FPU | stfdx f14, TMP1, TMP2 + |.else + | stwux SAVE0, TMP1, TMP2 + | stw SAVE1, 4(TMP1) + |.endif | ins_next2 | |7: // Possible table write barrier for the value. Skip valiswhite check. @@ -4327,10 +4976,20 @@ static void build_ins(BuildCtx *ctx, BCO | add TMP1, TMP1, TMP0 | andix. TMP0, TMP3, LJ_GC_BLACK // isblack(table) |3: // Copy result slots to table. 
+ |.if FPU | lfd f0, 0(RA) + |.else + | lwz SAVE0, 0(RA) + | lwz SAVE1, 4(RA) + |.endif | addi RA, RA, 8 | cmpw cr1, RA, TMP2 + |.if FPU | stfd f0, 0(TMP1) + |.else + | stw SAVE0, 0(TMP1) + | stw SAVE1, 4(TMP1) + |.endif | addi TMP1, TMP1, 8 | blt cr1, <3 | bne >7 @@ -4397,9 +5056,20 @@ static void build_ins(BuildCtx *ctx, BCO | beq cr1, >3 |2: | addi TMP3, TMP2, 8 + |.if FPU | lfdx f0, RA, TMP2 + |.else + | add CARG3, RA, TMP2 + | lwz CARG1, 0(CARG3) + | lwz CARG2, 4(CARG3) + |.endif | cmplw cr1, TMP3, NARGS8:RC + |.if FPU | stfdx f0, BASE, TMP2 + |.else + | stwux CARG1, TMP2, BASE + | stw CARG2, 4(TMP2) + |.endif | mr TMP2, TMP3 | bne cr1, <2 |3: @@ -4432,14 +5102,28 @@ static void build_ins(BuildCtx *ctx, BCO | add BASE, BASE, RA | lwz TMP1, -24(BASE) | lwz LFUNC:RB, -20(BASE) + |.if FPU | lfd f1, -8(BASE) | lfd f0, -16(BASE) + |.else + | lwz CARG1, -8(BASE) + | lwz CARG2, -4(BASE) + | lwz CARG3, -16(BASE) + | lwz CARG4, -12(BASE) + |.endif | stw TMP1, 0(BASE) // Copy callable. | stw LFUNC:RB, 4(BASE) | checkfunc TMP1 - | stfd f1, 16(BASE) // Copy control var. | li NARGS8:RC, 16 // Iterators get 2 arguments. + |.if FPU + | stfd f1, 16(BASE) // Copy control var. | stfdu f0, 8(BASE) // Copy state. + |.else + | stw CARG1, 16(BASE) // Copy control var. + | stw CARG2, 20(BASE) + | stwu CARG3, 8(BASE) // Copy state. + | stw CARG4, 4(BASE) + |.endif | bne ->vmeta_call | ins_call break; @@ -4460,7 +5144,12 @@ static void build_ins(BuildCtx *ctx, BCO | slwi TMP3, RC, 3 | bge >5 // Index points after array part? | lwzx TMP2, TMP1, TMP3 + |.if FPU | lfdx f0, TMP1, TMP3 + |.else + | lwzux CARG1, TMP3, TMP1 + | lwz CARG2, 4(TMP3) + |.endif | checknil TMP2 | lwz INS, -4(PC) | beq >4 @@ -4472,7 +5161,12 @@ static void build_ins(BuildCtx *ctx, BCO |.endif | addi RC, RC, 1 | addis TMP3, PC, -(BCBIAS_J*4 >> 16) + |.if FPU | stfd f0, 8(RA) + |.else + | stw CARG1, 8(RA) + | stw CARG2, 12(RA) + |.endif | decode_RD4 TMP1, INS | stw RC, -4(RA) // Update control var. 
| add PC, TMP1, TMP3 @@ -4497,17 +5191,38 @@ static void build_ins(BuildCtx *ctx, BCO | slwi RB, RC, 3 | sub TMP3, TMP3, RB | lwzx RB, TMP2, TMP3 + |.if FPU | lfdx f0, TMP2, TMP3 + |.else + | add CARG3, TMP2, TMP3 + | lwz CARG1, 0(CARG3) + | lwz CARG2, 4(CARG3) + |.endif | add NODE:TMP3, TMP2, TMP3 | checknil RB | lwz INS, -4(PC) | beq >7 + |.if FPU | lfd f1, NODE:TMP3->key + |.else + | lwz CARG3, NODE:TMP3->key.u32.hi + | lwz CARG4, NODE:TMP3->key.u32.lo + |.endif | addis TMP2, PC, -(BCBIAS_J*4 >> 16) + |.if FPU | stfd f0, 8(RA) + |.else + | stw CARG1, 8(RA) + | stw CARG2, 12(RA) + |.endif | add RC, RC, TMP0 | decode_RD4 TMP1, INS + |.if FPU | stfd f1, 0(RA) + |.else + | stw CARG3, 0(RA) + | stw CARG4, 4(RA) + |.endif | addi RC, RC, 1 | add PC, TMP1, TMP2 | stw RC, -4(RA) // Update control var. @@ -4573,9 +5288,19 @@ static void build_ins(BuildCtx *ctx, BCO | subi TMP2, TMP2, 16 | ble >2 // No vararg slots? |1: // Copy vararg slots to destination slots. + |.if FPU | lfd f0, 0(RC) + |.else + | lwz CARG1, 0(RC) + | lwz CARG2, 4(RC) + |.endif | addi RC, RC, 8 + |.if FPU | stfd f0, 0(RA) + |.else + | stw CARG1, 0(RA) + | stw CARG2, 4(RA) + |.endif | cmplw RA, TMP2 | cmplw cr1, RC, TMP3 | bge >3 // All destination slots filled? @@ -4598,9 +5323,19 @@ static void build_ins(BuildCtx *ctx, BCO | addi MULTRES, TMP1, 8 | bgt >7 |6: + |.if FPU | lfd f0, 0(RC) + |.else + | lwz CARG1, 0(RC) + | lwz CARG2, 4(RC) + |.endif | addi RC, RC, 8 + |.if FPU | stfd f0, 0(RA) + |.else + | stw CARG1, 0(RA) + | stw CARG2, 4(RA) + |.endif | cmplw RC, TMP3 | addi RA, RA, 8 | blt <6 // More vararg slots? 
@@ -4651,14 +5386,38 @@ static void build_ins(BuildCtx *ctx, BCO | li TMP1, 0 |2: | addi TMP3, TMP1, 8 + |.if FPU | lfdx f0, RA, TMP1 + |.else + | add CARG3, RA, TMP1 + | lwz CARG1, 0(CARG3) + | lwz CARG2, 4(CARG3) + |.endif | cmpw TMP3, RC + |.if FPU | stfdx f0, TMP2, TMP1 + |.else + | add CARG3, TMP2, TMP1 + | stw CARG1, 0(CARG3) + | stw CARG2, 4(CARG3) + |.endif | beq >3 | addi TMP1, TMP3, 8 + |.if FPU | lfdx f1, RA, TMP3 + |.else + | add CARG3, RA, TMP3 + | lwz CARG1, 0(CARG3) + | lwz CARG2, 4(CARG3) + |.endif | cmpw TMP1, RC + |.if FPU | stfdx f1, TMP2, TMP3 + |.else + | add CARG3, TMP2, TMP3 + | stw CARG1, 0(CARG3) + | stw CARG2, 4(CARG3) + |.endif | bne <2 |3: |5: @@ -4700,8 +5459,15 @@ static void build_ins(BuildCtx *ctx, BCO | subi TMP2, BASE, 8 | decode_RB8 RB, INS if (op == BC_RET1) { + |.if FPU | lfd f0, 0(RA) | stfd f0, 0(TMP2) + |.else + | lwz CARG1, 0(RA) + | lwz CARG2, 4(RA) + | stw CARG1, 0(TMP2) + | stw CARG2, 4(TMP2) + |.endif } |5: | cmplw RB, RD @@ -4762,11 +5528,11 @@ static void build_ins(BuildCtx *ctx, BCO |4: | stw CARG1, FORL_IDX*8+4(RA) } else { - | lwz TMP3, FORL_STEP*8(RA) + | lwz SAVE0, FORL_STEP*8(RA) | lwz CARG3, FORL_STEP*8+4(RA) | lwz TMP2, FORL_STOP*8(RA) | lwz CARG2, FORL_STOP*8+4(RA) - | cmplw cr7, TMP3, TISNUM + | cmplw cr7, SAVE0, TISNUM | cmplw cr1, TMP2, TISNUM | crand 4*cr0+eq, 4*cr0+eq, 4*cr7+eq | crand 4*cr0+eq, 4*cr0+eq, 4*cr1+eq @@ -4809,41 +5575,80 @@ static void build_ins(BuildCtx *ctx, BCO if (vk) { |.if DUALNUM |9: // FP loop. 
+ |.if FPU | lfd f1, FORL_IDX*8(RA) |.else + | lwz CARG1, FORL_IDX*8(RA) + | lwz CARG2, FORL_IDX*8+4(RA) + |.endif + |.else | lfdux f1, RA, BASE |.endif + |.if FPU | lfd f3, FORL_STEP*8(RA) | lfd f2, FORL_STOP*8(RA) - | lwz TMP3, FORL_STEP*8(RA) | fadd f1, f1, f3 | stfd f1, FORL_IDX*8(RA) + |.else + | lwz CARG3, FORL_STEP*8(RA) + | lwz CARG4, FORL_STEP*8+4(RA) + | mr SAVE1, RD + | blex __adddf3 + | mr RD, SAVE1 + | stw CRET1, FORL_IDX*8(RA) + | stw CRET2, FORL_IDX*8+4(RA) + | lwz CARG3, FORL_STOP*8(RA) + | lwz CARG4, FORL_STOP*8+4(RA) + |.endif + | lwz SAVE0, FORL_STEP*8(RA) } else { |.if DUALNUM |9: // FP loop. |.else | lwzux TMP1, RA, BASE - | lwz TMP3, FORL_STEP*8(RA) + | lwz SAVE0, FORL_STEP*8(RA) | lwz TMP2, FORL_STOP*8(RA) | cmplw cr0, TMP1, TISNUM - | cmplw cr7, TMP3, TISNUM + | cmplw cr7, SAVE0, TISNUM | cmplw cr1, TMP2, TISNUM |.endif + |.if FPU | lfd f1, FORL_IDX*8(RA) + |.else + | lwz CARG1, FORL_IDX*8(RA) + | lwz CARG2, FORL_IDX*8+4(RA) + |.endif | crand 4*cr0+lt, 4*cr0+lt, 4*cr7+lt | crand 4*cr0+lt, 4*cr0+lt, 4*cr1+lt + |.if FPU | lfd f2, FORL_STOP*8(RA) + |.else + | lwz CARG3, FORL_STOP*8(RA) + | lwz CARG4, FORL_STOP*8+4(RA) + |.endif | bge ->vmeta_for } - | cmpwi cr6, TMP3, 0 + | cmpwi cr6, SAVE0, 0 if (op != BC_JFORL) { | srwi RD, RD, 1 } + |.if FPU | stfd f1, FORL_EXT*8(RA) + |.else + | stw CARG1, FORL_EXT*8(RA) + | stw CARG2, FORL_EXT*8+4(RA) + |.endif if (op != BC_JFORL) { | add RD, PC, RD } + |.if FPU | fcmpu cr0, f1, f2 + |.else + | mr SAVE1, RD + | blex __ledf2 + | cmpwi CRET1, 0 + | mr RD, SAVE1 + |.endif if (op == BC_JFORI) { | addis PC, RD, -(BCBIAS_J*4 >> 16) }