diff --git a/src/racket/sconfig.h b/src/racket/sconfig.h index 53ef1e9ca1..fc2a524c35 100644 --- a/src/racket/sconfig.h +++ b/src/racket/sconfig.h @@ -162,7 +162,9 @@ # if defined(i386) # define SCHEME_PLATFORM_LIBRARY_SUBPATH "i386-linux" # define REGISTER_POOR_MACHINE -# define ASM_DBLPREC_CONTROL_87 +# ifndef MZ_USE_JIT_SSE +# define ASM_DBLPREC_CONTROL_87 +# endif # endif # if defined(powerpc) # define SCHEME_PLATFORM_LIBRARY_SUBPATH "ppc-linux" @@ -186,7 +188,9 @@ # if defined(__x86_64__) # define SCHEME_PLATFORM_LIBRARY_SUBPATH "x86_64-linux" # define REGISTER_POOR_MACHINE -# define ASM_DBLPREC_CONTROL_87 +# ifdef MZ_NO_JIT_SSE +# define ASM_DBLPREC_CONTROL_87 +# endif # endif # ifndef SCHEME_PLATFORM_LIBRARY_SUBPATH # define SCHEME_PLATFORM_LIBRARY_SUBPATH "unknown-linux" @@ -336,17 +340,21 @@ # define SCHEME_PLATFORM_LIBRARY_SUBPATH "i386-freebsd" # define REGISTER_POOR_MACHINE # define MZ_USE_JIT_I386 -# if defined(__FreeBSD_kernel__) -# define ASM_DBLPREC_CONTROL_87 -# else -# define FREEBSD_CONTROL_387 +# ifndef MZ_USE_JIT_SSE /* same macro the i386-linux branch tests; jit.h keys the SSE back end on it */ +# if defined(__FreeBSD_kernel__) +# define ASM_DBLPREC_CONTROL_87 +# else +# define FREEBSD_CONTROL_387 +# endif # endif # elif defined(__amd64__) # define SCHEME_PLATFORM_LIBRARY_SUBPATH "amd64-freebsd" # define REGISTER_POOR_MACHINE # define MZ_USE_JIT_X86_64 -# if defined(__FreeBSD_kernel__) -# define ASM_DBLPREC_CONTROL_87 +# ifdef MZ_NO_JIT_SSE +# if defined(__FreeBSD_kernel__) +# define ASM_DBLPREC_CONTROL_87 +# endif # endif # elif defined(__sparc64__) # define SCHEME_PLATFORM_LIBRARY_SUBPATH "sparc64-freebsd" @@ -751,7 +759,10 @@ # define MZ_USE_JIT_X86_64 #else # define MZ_USE_JIT_I386 -# define ASM_DBLPREC_CONTROL_87 +# ifndef MZ_NO_JIT_SSE +# define MZ_USE_JIT_SSE +# define ASM_DBLPREC_CONTROL_87 /* NOTE(review): the other platforms in this patch drop this macro when SSE is on; confirm it is wanted here */ +# endif #endif # define MZ_JIT_USE_MPROTECT diff --git a/src/racket/src/Makefile.in b/src/racket/src/Makefile.in index ea9c78de7b..c04180cd7e 100644 --- a/src/racket/src/Makefile.in +++ b/src/racket/src/Makefile.in @@ -298,7 
+298,7 @@ JIT_HEADERS = $(srcdir)/../src/jit.h \ $(srcdir)/lightning/i386/core.h $(srcdir)/lightning/i386/core-common.h \ $(srcdir)/lightning/i386/asm.h $(srcdir)/lightning/i386/asm-common.h \ $(srcdir)/lightning/i386/funcs.h $(srcdir)/lightning/i386/funcs-common.h \ - $(srcdir)/lightning/i386/fp.h $(srcdir)/lightning/i386/fp-common.h \ + $(srcdir)/lightning/i386/fp.h $(srcdir)/lightning/i386/fp-sse.h $(srcdir)/lightning/i386/fp-common.h \ $(srcdir)/lightning/ppc/core.h $(srcdir)/lightning/ppc/core-common.h \ $(srcdir)/lightning/ppc/asm.h $(srcdir)/lightning/ppc/asm-common.h \ $(srcdir)/lightning/ppc/funcs.h $(srcdir)/lightning/ppc/funcs-common.h \ diff --git a/src/racket/src/jit.h b/src/racket/src/jit.h index a67280efa6..00f7938b78 100644 --- a/src/racket/src/jit.h +++ b/src/racket/src/jit.h @@ -43,6 +43,9 @@ END_XFORM_ARITH; #ifdef MZ_USE_JIT_X86_64 # define MZ_USE_JIT_I386 # define JIT_X86_64 +# ifndef MZ_NO_JIT_SSE /* the x86_64 JIT selects JIT_X86_SSE unless MZ_NO_JIT_SSE is defined */ +# define JIT_X86_SSE +# endif #endif #ifdef MZ_USE_JIT_I386 @@ -51,6 +54,12 @@ END_XFORM_ARITH; # endif #endif +#ifdef MZ_USE_JIT_SSE /* explicit request for SSE also selects JIT_X86_SSE; the inner guard avoids redefining it */ +# ifndef JIT_X86_SSE +# define JIT_X86_SSE +# endif +#endif + #ifdef MZ_USE_JIT_PPC # ifndef DEFINE_LIGHTNING_FUNCS # define SUPPRESS_LIGHTNING_FUNCS @@ -275,7 +284,7 @@ extern struct scheme_jit_common_record scheme_jit_common; #define sjc scheme_jit_common -typedef struct { +typedef struct mz_jit_state { MZTAG_IF_REQUIRED GC_CAN_IGNORE jit_state js; char *limit; @@ -835,7 +844,7 @@ void scheme_jit_prolog_again(mz_jit_state *jitter, int n, int ret_addr_reg) #define __START_TINY_OR_SHORT_JUMPS__(tcond, cond) if (tcond) { __START_TINY_JUMPS__(1); } else { __START_SHORT_JUMPS__(cond); } #define __END_TINY_OR_SHORT_JUMPS__(tcond, cond) if (tcond) { __END_TINY_JUMPS__(1); } else { __END_SHORT_JUMPS__(cond); } -#ifdef JIT_X86_64 +#if defined(JIT_X86_64) || defined(JIT_X86_SSE) # define __START_TINY_JUMPS_IF_COMPACT__(cond) /* empty */ # define __END_TINY_JUMPS_IF_COMPACT__(cond) /* empty */ #else @@ -941,7 +950,7 @@ static 
void emit_indentation(mz_jit_state *jitter) pushes and pops much balance. The popping branch operations pop both arguments before branching. */ -#if !defined(MZ_USE_JIT_I386) +#if !defined(MZ_USE_JIT_I386) || defined(JIT_X86_SSE) /* Not FP stack, so use normal variants. */ #define DIRECT_FPR_ACCESS #define jit_movi_d_fppush(rd,immd) jit_movi_d(rd,immd) @@ -959,7 +968,7 @@ static void emit_indentation(mz_jit_state *jitter) #define jit_abs_d_fppop(rd,rs) jit_abs_d(rd,rs) #define jit_sqrt_d_fppop(rd,rs) jit_sqrt_d(rd,rs) #define jit_sti_d_fppop(id, rs) jit_sti_d(id, rs) -#define jit_str_d_fppop(id, rd, rs) jit_str_d(id, rd, rs) +#define jit_str_d_fppop(id, rd) jit_str_d(id, rd) /* two operands, forwarded unchanged to jit_str_d */ #define jit_stxi_d_fppop(id, rd, rs) jit_stxi_d(id, rd, rs) #define jit_stxr_d_fppop(id, rd, rs) jit_stxr_d(id, rd, rs) #define jit_bger_d_fppop(d, s1, s2) jit_bger_d(d, s1, s2) diff --git a/src/racket/src/jitalloc.c b/src/racket/src/jitalloc.c index 8181f0536b..2c1b54dbf3 100644 --- a/src/racket/src/jitalloc.c +++ b/src/racket/src/jitalloc.c @@ -278,7 +278,7 @@ int scheme_generate_alloc_retry(mz_jit_state *jitter, int i) #ifdef JIT_USE_FP_OPS if (i == 2) { - (void)mz_tl_sti_d_fppop(tl_scheme_jit_save_fp, JIT_FPR1, JIT_R2); + (void)mz_tl_sti_d_fppop(tl_scheme_jit_save_fp, JIT_FPR0, JIT_R2); /* stores JIT_FPR0 to tl_scheme_jit_save_fp; the reload later in this function names the same register */ } #endif JIT_UPDATE_THREAD_RSPTR(); @@ -299,7 +299,7 @@ int scheme_generate_alloc_retry(mz_jit_state *jitter, int i) } #ifdef JIT_USE_FP_OPS if (i == 2) { - (void)mz_tl_ldi_d_fppush(JIT_FPR1, tl_scheme_jit_save_fp, JIT_R2); + (void)mz_tl_ldi_d_fppush(JIT_FPR0, tl_scheme_jit_save_fp, JIT_R2); } #endif return 1; diff --git a/src/racket/src/lightning/i386/asm.h b/src/racket/src/lightning/i386/asm.h index cede95559d..6dbb46ffba 100644 --- a/src/racket/src/lightning/i386/asm.h +++ b/src/racket/src/lightning/i386/asm.h @@ -1220,6 +1220,562 @@ typedef _uc jit_insn; ( ((N)&7) == 0) ? 
0 : \ JITFAIL(".align argument too large"))) +/* --- Media 128-bit instructions ------------------------------------------ */ + +/* Opcode bytes emitted after the 0x0f escape by the SSE encoder macros later in this file. */ +typedef enum { + X86_SSE_MOV = 0x10, + X86_SSE_MOVLP = 0x12, + X86_SSE_MOVHP = 0x16, + X86_SSE_MOVA = 0x28, + X86_SSE_CVTIS = 0x2a, + X86_SSE_CVTTSI = 0x2c, + X86_SSE_CVTSI = 0x2d, + X86_SSE_UCOMI = 0x2e, + X86_SSE_COMI = 0x2f, + X86_SSE_ROUND = 0x3a, + X86_SSE_SQRT = 0x51, + X86_SSE_RSQRT = 0x52, + X86_SSE_RCP = 0x53, + X86_SSE_AND = 0x54, + X86_SSE_ANDN = 0x55, + X86_SSE_OR = 0x56, + X86_SSE_XOR = 0x57, + X86_SSE_ADD = 0x58, + X86_SSE_MUL = 0x59, + X86_SSE_CVTSD = 0x5a, + X86_SSE_CVTDT = 0x5b, + X86_SSE_SUB = 0x5c, + X86_SSE_MIN = 0x5d, + X86_SSE_DIV = 0x5e, + X86_SSE_MAX = 0x5f, + X86_SSE_X2G = 0x6e, + X86_SSE_EQB = 0x74, + X86_SSE_EQW = 0x75, + X86_SSE_EQD = 0x76, + X86_SSE_G2X = 0x7e, + X86_SSE_MOV2 = 0xd6 +} x86_sse_t; + + +#define _BIT(X) (!!(X)) +#define _rR(R) ((R) & 0x0f) +#define _rX(R) _rN(R) +/* Nonzero when R is positive and its low four bits are in 8..15; the REX helpers use it to set extension bits. */ +#define _rXP(R) ((R) > 0 && _rR(R) > 7) +#define _SCL1 _b00 + +#define _rA(R) _r4(R) + +#define _RSP 0x54 + +#define _i_X(op, md, rb, ri, ms) _r_X(op, md, rb, ri, ms) + +#define _f_X(rd, md, rb, ri, ms) _i_X((int)_rX(rd), md, rb, ri, ms) + +#ifdef JIT_X86_64 +# define x86_REXwrxb(l, w, r, x, b) \ + (((l) || (((int)(w) << 3) | (((int)(r)) << 2) | (((int)(x)) << 1) | ((int)(b)))) \ + ? 
_jit_B(0x40 | (((int)(w) << 3) | (((int)(r)) << 2) | (((int)(x)) << 1) | ((int)(b)))) \ + : (void)0) +#else +# define x86_REXwrxb(l, w, r, x, b) (void)0 +#endif + +/* Wrappers that derive the r/x/b arguments of x86_REXwrxb from register numbers via _rXP. */ +#define x86_REXwrx_(l, w, r, x, mr) x86_REXwrxb(l, w, r, x, _BIT(_rXP(mr))) +#define x86_REXw_x_(l, w, r, x, mr) x86_REXwrx_(l, w, _BIT(_rXP(r)), x, mr) +#define x86_rex_l_rr(rr, mr) x86_REXw_x_(0, 0, rr, 0, mr) +#define x86_rex_l_mr(rb, ri, rd) x86_REXw_x_(0, 0, rd, _BIT(_rXP(ri)), rb) +#define x86_rex_l_rm(rs, rb, ri) x86_rex_l_mr(rb, ri, rs) + + +#define _rex_ff_rr(rr, mr) x86_rex_l_rr(rr, mr) +#define _rex_if_rr(rr, mr) x86_rex_l_rr(rr, mr) +#define _rex_fi_rr(rr, mr) x86_rex_l_rr(rr, mr) +#define _rex_if_mr(rb, ri, rd) x86_rex_l_mr(rb, ri, rd) +#define _rex_fi_rm(rs, rb, ri) x86_rex_l_rm(rs, rb, ri) + +/* Unprefixed encoders: REX helper, 0x0f escape, opcode byte, then a register ModRM (rr) or a memory operand built by _f_X (mr/rm). */ +#define __sse_ff_rr(op, rs, rd) (_rex_ff_rr(rd, rs), _O(0x0f), _O(op), _Mrm(_b11, _rX(rd), _rX(rs))) + +#define __sse_id_rr(op, rs, rd) __sse_if_rr(op, rs, rd) +#define __sse_if_rr(op, rs, rd) (_rex_if_rr(rd, rs), _O(0x0f), _O(op), _Mrm(_b11, _rA(rd), _rX(rs))) + +#define __sse_di_rr(op, rs, rd) __sse_fi_rr(op, rs, rd) +#define __sse_fi_rr(op, rs, rd) (_rex_fi_rr(rd, rs), _O(0x0f), _O(op), _Mrm(_b11, _rX(rd), _rA(rs))) + +#define __sse_id_mr(op, md, rb, mi, ms, rd) __sse_if_mr(op, md, rb, mi, ms, rd) +#define __sse_if_mr(op, md, rb, ri, ms, rd) (_rex_if_mr(rb, ri, rd), _O(0x0f), _O(op), _f_X(rd, md, rb, ri, ms)) + +#define __sse_di_rm(op, rs, md, rb, mi, ms) __sse_fi_rm(op, rs, md, rb, mi, ms) +#define __sse_fi_rm(op, rs, md, rb, ri, ms) (_rex_fi_rm(rs, rb, ri), _O(0x0f), _O(op), _f_X(rs, md, rb, ri, ms)) + +/* The __sse1_* forms OR 0x01 into the opcode byte before emitting it. */ +#define __sse1_di_rm(op, rs, md, mb, mi, ms) __sse1_fi_rm(op, rs, md, mb, mi, ms) +#define __sse1_fi_rm(op, rs, md, rb, ri, ms) (_rex_fi_rm(rs, rb, ri), _O(0x0f), _O(0x01 | op), _f_X(rs, md, rb, ri, ms)) + +#define _sse_ff_rr(px, op, rs, rd) (_jit_B(px), __sse_ff_rr(op, rs, rd)) + +#define _sse_id_rr(px, op, rs, rd) _sse_if_rr(px, op, rs, rd) +#define _sse_if_rr(px, op, rs, rd) (_jit_B(px), 
__sse_if_rr(op, rs, rd)) + +#define _sse_di_rr(px, op, rs, rd) _sse_fi_rr(px, op, rs, rd) +#define _sse_fi_rr(px, op, rs, rd) (_jit_B(px), __sse_fi_rr(op, rs, rd)) + +#define _sse_id_mr(px, op, md, rb, mi, ms, rd) _sse_if_mr(px, op, md, rb, mi, ms, rd) +#define _sse_if_mr(px, op, md, rb, ri, ms, rd) (_jit_B(px), __sse_if_mr(op, md, rb, ri, ms, rd)) + +#define _sse_di_rm(px, op, rs, md, rb, mi, ms) _sse_fi_rm(px, op, rs, md, rb, mi, ms) +#define _sse_fi_rm(px, op, rs, md, rb, ri, ms) (_jit_B(px), __sse_fi_rm(op, rs, md, rb, ri, ms)) + +#define _sse1_di_rm(px, op, rs, md, mb, mi, ms) _sse1_fi_rm(px, op, rs, md, mb, mi, ms) +#define _sse1_fi_rm(px, op, rs, md, rb, ri, ms) (_jit_B(px), __sse1_fi_rm(op, rs, md, rb, ri, ms)) + +/* Instruction families by leading prefix byte: PS = none, PD = 0x66, SS = 0xf3, SD = 0xf2. */ +#define _SSEPSrr(OP,RS,RD) __sse_ff_rr ( OP, RS, RD) +#define _SSEPSmr(OP,MD,MB,MI,MS,RD) __sse_if_mr ( OP, MD, MB, MI, MS, RD) +#define _SSEPSrm(OP,RS,MD,MB,MI,MS) __sse_fi_rm ( OP, RS, MD, MB, MI, MS) +#define _SSEPS1rm(OP,RS,MD,MB,MI,MS) __sse1_fi_rm( OP, RS, MD, MB, MI, MS) + +#define _SSEPDrr(OP,RS,RD) _sse_ff_rr (0x66, OP, RS, RD) +#define _SSEPDmr(OP,MD,MB,MI,MS,RD) _sse_if_mr (0x66, OP, MD, MB, MI, MS, RD) +#define _SSEPDrm(OP,RS,MD,MB,MI,MS) _sse_fi_rm (0x66, OP, RS, MD, MB, MI, MS) +#define _SSEPD1rm(OP,RS,MD,MB,MI,MS) _sse1_fi_rm(0x66, OP, RS, MD, MB, MI, MS) + +#define _SSESSrr(OP,RS,RD) _sse_ff_rr (0xf3, OP, RS, RD) +#define _SSESSmr(OP,MD,MB,MI,MS,RD) _sse_if_mr (0xf3, OP, MD, MB, MI, MS, RD) +#define _SSESSrm(OP,RS,MD,MB,MI,MS) _sse_fi_rm (0xf3, OP, RS, MD, MB, MI, MS) +#define _SSESS1rm(OP,RS,MD,MB,MI,MS) _sse1_fi_rm(0xf3, OP, RS, MD, MB, MI, MS) + +#define _SSESDrr(OP,RS,RD) _sse_ff_rr (0xf2, OP, RS, RD) +#define _SSESDmr(OP,MD,MB,MI,MS,RD) _sse_if_mr (0xf2, OP, MD, MB, MI, MS, RD) +#define _SSESDrm(OP,RS,MD,MB,MI,MS) _sse_fi_rm (0xf2, OP, RS, MD, MB, MI, MS) +#define _SSESD1rm(OP,RS,MD,MB,MI,MS) _sse1_fi_rm(0xf2, OP, RS, MD, MB, MI, MS) + +#define _NOREG 0 + +/* SSE */ +#define LDMXCSRmr(MD, MB, MI, MS) \ + (_REXLmr(MB, MI, 
_NOREG), \ + _O(0x0f), \ + _O(0xae), \ + _i_X(_b10, MD, MB, MI, MS)) +#define STMXCSRrm(MD, MB, MI, MS) \ + (_REXLrm(_NOREG, MI, MB), \ + _O(0x0f), \ + _O(0xae), \ + _i_X(_b11, MD, MB, MI, MS)) + +/* SSE (ADDPS) and SSE2 (ADDPD) */ +#define ADDPSrr(RS, RD) _SSEPSrr(X86_SSE_ADD, RS, RD) +#define ADDPSmr(MD, MB, MI, MS, RD) _SSEPSmr(X86_SSE_ADD, MD, MB, MI, MS, RD) +#define ADDPDrr(RS, RD) _SSEPDrr(X86_SSE_ADD, RS, RD) +#define ADDPDmr(MD, MB, MI, MS, RD) _SSEPDmr(X86_SSE_ADD, MD, MB, MI, MS, RD) + +/* SSE */ +#define ADDSSrr(RS, RD) _SSESSrr(X86_SSE_ADD, RS, RD) +#define ADDSSmr(MD, MB, MI, MS, RD) _SSESSmr(X86_SSE_ADD, MD, MB, MI, MS, RD) + +/* SSE2 */ +#define ADDSDrr(RS, RD) _SSESDrr(X86_SSE_ADD, RS, RD) +#define ADDSDmr(MD, MB, MI, MS, RD) _SSESDmr(X86_SSE_ADD, MD, MB, MI, MS, RD) + +/* SSE */ +#define ANDNPSrr(RS, RD) _SSEPSrr(X86_SSE_ANDN, RS, RD) +#define ANDNPSmr(MD, MB, MI, MS, RD) _SSEPSmr(X86_SSE_ANDN, MD, MB, MI, MS, RD) + +/* SSE2 */ +#define ANDNPDrr(RS, RD) _SSEPDrr(X86_SSE_ANDN, RS, RD) +#define ANDNPDmr(MD, MB, MI, MS, RD) _SSEPDmr(X86_SSE_ANDN, MD, MB, MI, MS, RD) + +/* SSE: the scalar names alias the packed forms; each mr alias must map to the five-argument mr macro */ +#define ANDNSSrr ANDNPSrr +#define ANDNSSmr ANDNPSmr + +/* SSE2: scalar names alias the packed forms */ +#define ANDNSDrr ANDNPDrr +#define ANDNSDmr ANDNPDmr + +/* SSE */ +#define ANDPSrr(RS, RD) _SSEPSrr(X86_SSE_AND, RS, RD) +#define ANDPSmr(MD, MB, MI, MS, RD) _SSEPSmr(X86_SSE_AND, MD, MB, MI, MS, RD) + +/* SSE2 */ +#define ANDPDrr(RS, RD) _SSEPDrr(X86_SSE_AND, RS, RD) +#define ANDPDmr(MD, MB, MI, MS, RD) _SSEPDmr(X86_SSE_AND, MD, MB, MI, MS, RD) + +/* SSE: scalar names alias the packed forms */ +#define ANDSSrr ANDPSrr +#define ANDSSmr ANDPSmr + +/* SSE2: scalar names alias the packed forms */ +#define ANDSDrr ANDPDrr +#define ANDSDmr ANDPDmr + +/* SSE */ +#define DIVPSrr(RS, RD) _SSEPSrr(X86_SSE_DIV, RS, RD) +#define DIVPSmr(MD, MB, MI, MS, RD) _SSEPSmr(X86_SSE_DIV, MD, MB, MI, MS, RD) + +/* SSE2 */ +#define DIVPDrr(RS, RD) _SSEPDrr(X86_SSE_DIV, RS, RD) +#define DIVPDmr(MD, MB, MI, MS, RD) _SSEPDmr(X86_SSE_DIV, MD, MB, MI, MS, RD) + +/* SSE */ +#define DIVSSrr(RS, RD) _SSESSrr(X86_SSE_DIV, RS, RD) 
+#define DIVSSmr(MD, MB, MI, MS, RD) _SSESSmr(X86_SSE_DIV, MD, MB, MI, MS, RD) + +/* SSE2 */ +#define DIVSDrr(RS, RD) _SSESDrr(X86_SSE_DIV, RS, RD) +#define DIVSDmr(MD, MB, MI, MS, RD) _SSESDmr(X86_SSE_DIV, MD, MB, MI, MS, RD) + +/* SSE (in each family below, rr takes a source and destination register; mr takes a memory operand MD/MB/MI/MS and a destination register) */ +#define MAXPSrr(RS, RD) _SSEPSrr(X86_SSE_MAX, RS, RD) +#define MAXPSmr(MD, MB, MI, MS, RD) _SSEPSmr(X86_SSE_MAX, MD, MB, MI, MS, RD) + +/* SSE2 */ +#define MAXPDrr(RS, RD) _SSEPDrr(X86_SSE_MAX, RS, RD) +#define MAXPDmr(MD, MB, MI, MS, RD) _SSEPDmr(X86_SSE_MAX, MD, MB, MI, MS, RD) + +/* SSE */ +#define MAXSSrr(RS, RD) _SSESSrr(X86_SSE_MAX, RS, RD) +#define MAXSSmr(MD, MB, MI, MS, RD) _SSESSmr(X86_SSE_MAX, MD, MB, MI, MS, RD) + +/* SSE2 */ +#define MAXSDrr(RS, RD) _SSESDrr(X86_SSE_MAX, RS, RD) +#define MAXSDmr(MD, MB, MI, MS, RD) _SSESDmr(X86_SSE_MAX, MD, MB, MI, MS, RD) + +/* SSE */ +#define MINPSrr(RS, RD) _SSEPSrr(X86_SSE_MIN, RS, RD) +#define MINPSmr(MD, MB, MI, MS, RD) _SSEPSmr(X86_SSE_MIN, MD, MB, MI, MS, RD) + +/* SSE2 */ +#define MINPDrr(RS, RD) _SSEPDrr(X86_SSE_MIN, RS, RD) +#define MINPDmr(MD, MB, MI, MS, RD) _SSEPDmr(X86_SSE_MIN, MD, MB, MI, MS, RD) + +/* SSE */ +#define MINSSrr(RS, RD) _SSESSrr(X86_SSE_MIN, RS, RD) +#define MINSSmr(MD, MB, MI, MS, RD) _SSESSmr(X86_SSE_MIN, MD, MB, MI, MS, RD) + +/* SSE2 */ +#define MINSDrr(RS, RD) _SSESDrr(X86_SSE_MIN, RS, RD) +#define MINSDmr(MD, MB, MI, MS, RD) _SSESDmr(X86_SSE_MIN, MD, MB, MI, MS, RD) + +/* SSE */ +#define MULPSrr(RS, RD) _SSEPSrr(X86_SSE_MUL, RS, RD) +#define MULPSmr(MD, MB, MI, MS, RD) _SSEPSmr(X86_SSE_MUL, MD, MB, MI, MS, RD) + +/* SSE2 */ +#define MULPDrr(RS, RD) _SSEPDrr(X86_SSE_MUL, RS, RD) +#define MULPDmr(MD, MB, MI, MS, RD) _SSEPDmr(X86_SSE_MUL, MD, MB, MI, MS, RD) + +/* SSE */ +#define MULSSrr(RS, RD) _SSESSrr(X86_SSE_MUL, RS, RD) +#define MULSSmr(MD, MB, MI, MS, RD) _SSESSmr(X86_SSE_MUL, MD, MB, MI, MS, RD) + +/* SSE2 */ +#define MULSDrr(RS, RD) _SSESDrr(X86_SSE_MUL, RS, RD) +#define MULSDmr(MD, MB, MI, MS, RD) _SSESDmr(X86_SSE_MUL, MD, MB, MI, MS, RD) + 
+/* SSE */ +#define ORPSrr(RS, RD) _SSEPSrr(X86_SSE_OR, RS, RD) +#define ORPSmr(MD, MB, MI, MS, RD) _SSEPSmr(X86_SSE_OR, MD, MB, MI, MS, RD) + +/* SSE2 */ +#define ORPDrr(RS, RD) _SSEPDrr(X86_SSE_OR, RS, RD) +#define ORPDmr(MD, MB, MI, MS, RD) _SSEPDmr(X86_SSE_OR, MD, MB, MI, MS, RD) + +/* SSE: the scalar names alias the packed forms; each mr alias must map to the five-argument mr macro */ +#define ORSSrr ORPSrr +#define ORSSmr ORPSmr + +/* SSE2: scalar names alias the packed forms */ +#define ORSDrr ORPDrr +#define ORSDmr ORPDmr + +/* SSE */ +#define RCPPSrr(RS, RD) _SSEPSrr(X86_SSE_RCP, RS, RD) +#define RCPPSmr(MD, MB, MI, MS, RD) _SSEPSmr(X86_SSE_RCP, MD, MB, MI, MS, RD) +#define RCPSSrr(RS, RD) _SSESSrr(X86_SSE_RCP, RS, RD) +#define RCPSSmr(MD, MB, MI, MS, RD) _SSESSmr(X86_SSE_RCP, MD, MB, MI, MS, RD) + +/* SSE */ +#define RSQRTPSrr(RS, RD) _SSEPSrr(X86_SSE_RSQRT, RS, RD) +#define RSQRTPSmr(MD, MB, MI, MS, RD) _SSEPSmr(X86_SSE_RSQRT, MD, MB, MI, MS, RD) +#define RSQRTSSrr(RS, RD) _SSESSrr(X86_SSE_RSQRT, RS, RD) +#define RSQRTSSmr(MD, MB, MI, MS, RD) _SSESSmr(X86_SSE_RSQRT, MD, MB, MI, MS, RD) + +/* SSE */ +#define SQRTPSrr(RS, RD) _SSEPSrr(X86_SSE_SQRT, RS, RD) +#define SQRTPSmr(MD, MB, MI, MS, RD) _SSEPSmr(X86_SSE_SQRT, MD, MB, MI, MS, RD) + +/* SSE2 */ +#define SQRTPDrr(RS, RD) _SSEPDrr(X86_SSE_SQRT, RS, RD) +#define SQRTPDmr(MD, MB, MI, MS, RD) _SSEPDmr(X86_SSE_SQRT, MD, MB, MI, MS, RD) + +/* SSE */ +#define SQRTSSrr(RS, RD) _SSESSrr(X86_SSE_SQRT, RS, RD) +#define SQRTSSmr(MD, MB, MI, MS, RD) _SSESSmr(X86_SSE_SQRT, MD, MB, MI, MS, RD) + +/* SSE2 */ +#define SQRTSDrr(RS, RD) _SSESDrr(X86_SSE_SQRT, RS, RD) +#define SQRTSDmr(MD, MB, MI, MS, RD) _SSESDmr(X86_SSE_SQRT, MD, MB, MI, MS, RD) + +/* SSE */ +#define SUBPSrr(RS, RD) _SSEPSrr(X86_SSE_SUB, RS, RD) +#define SUBPSmr(MD, MB, MI, MS, RD) _SSEPSmr(X86_SSE_SUB, MD, MB, MI, MS, RD) + +/* SSE2 */ +#define SUBPDrr(RS, RD) _SSEPDrr(X86_SSE_SUB, RS, RD) +#define SUBPDmr(MD, MB, MI, MS, RD) _SSEPDmr(X86_SSE_SUB, MD, MB, MI, MS, RD) + +/* SSE */ +#define SUBSSrr(RS, RD) _SSESSrr(X86_SSE_SUB, RS, RD) +#define SUBSSmr(MD, MB, MI, MS, RD) 
_SSESSmr(X86_SSE_SUB, MD, MB, MI, MS, RD) + +/* SSE2 */ +#define SUBSDrr(RS, RD) _SSESDrr(X86_SSE_SUB, RS, RD) +#define SUBSDmr(MD, MB, MI, MS, RD) _SSESDmr(X86_SSE_SUB, MD, MB, MI, MS, RD) + +/* SSE */ +#define XORPSrr(RS, RD) _SSEPSrr(X86_SSE_XOR, RS, RD) +#define XORPSmr(MD, MB, MI, MS, RD) _SSEPSmr(X86_SSE_XOR, MD, MB, MI, MS, RD) + +/* SSE2 */ +#define XORPDrr(RS, RD) _SSEPDrr(X86_SSE_XOR, RS, RD) +#define XORPDmr(MD, MB, MI, MS, RD) _SSEPDmr(X86_SSE_XOR, MD, MB, MI, MS, RD) + +/* SSE: the scalar names alias the packed forms; each mr alias must map to the five-argument mr macro */ +#define XORSSrr XORPSrr +#define XORSSmr XORPSmr + +/* SSE2: scalar names alias the packed forms */ +#define XORSDrr XORPDrr +#define XORSDmr XORPDmr + +/* No prefixes here. */ +/* SSE */ +#define COMISSrr(RS, RD) _SSEPSrr(X86_SSE_COMI, RS, RD) +#define COMISSmr(MD, MB, MI, MS, RD) _SSEPSmr(X86_SSE_COMI, MD, MB, MI, MS, RD) + +/* SSE2 */ +#define COMISDrr(RS, RD) _SSEPDrr(X86_SSE_COMI, RS, RD) +#define COMISDmr(MD, MB, MI, MS, RD) _SSEPDmr(X86_SSE_COMI, MD, MB, MI, MS, RD) + +/* No prefixes here. */ +/* SSE */ +#define UCOMISSrr(RS, RD) _SSEPSrr(X86_SSE_UCOMI, RS, RD) +#define UCOMISSmr(MD, MB, MI, MS, RD) _SSEPSmr(X86_SSE_UCOMI, MD, MB, MI, MS, RD) + +/* SSE2 */ +#define UCOMISDrr(RS, RD) _SSEPDrr(X86_SSE_UCOMI, RS, RD) +#define UCOMISDmr(MD, MB, MI, MS, RD) _SSEPDmr(X86_SSE_UCOMI, MD, MB, MI, MS, RD) + +/* SSE */ +#define MOVSSrr(RS, RD) _SSESSrr (X86_SSE_MOV, RS, RD) +#define MOVSSmr(MD, MB, MI, MS, RD) _SSESSmr (X86_SSE_MOV, MD, MB, MI, MS, RD) +#define MOVSSrm(RS, MD, MB, MI, MS) _SSESS1rm(X86_SSE_MOV, RS, MD, MB, MI, MS) + +/* SSE2 */ +#define MOVSDrr(RS, RD) _SSESDrr (X86_SSE_MOV, RS, RD) +#define MOVSDmr(MD, MB, MI, MS, RD) _SSESDmr (X86_SSE_MOV, MD, MB, MI, MS, RD) +#define MOVSDrm(RS, MD, MB, MI, MS) _SSESD1rm(X86_SSE_MOV, RS, MD, MB, MI, MS) + +/* SSE */ +#define MOVAPSrr(RS, RD) _SSEPSrr (X86_SSE_MOVA, RS, RD) +#define MOVAPSmr(MD, MB, MI, MS, RD) _SSEPSmr (X86_SSE_MOVA, MD, MB, MI, MS, RD) +#define MOVAPSrm(RS, MD, MB, MI, MS) _SSEPS1rm(X86_SSE_MOVA, RS, MD, MB, MI, MS) + +/* SSE2 */ +#define 
MOVAPDrr(RS, RD) _SSEPDrr (X86_SSE_MOVA, RS, RD) +#define MOVAPDmr(MD, MB, MI, MS, RD) _SSEPDmr (X86_SSE_MOVA, MD, MB, MI, MS, RD) +#define MOVAPDrm(RS, MD, MB, MI, MS) _SSEPD1rm(X86_SSE_MOVA, RS, MD, MB, MI, MS) + +/* SSE */ +#define CVTPS2PIrr(RS, RD) __sse_ff_rr( X86_SSE_CVTSI, RS, RD) +#define CVTPS2PImr(MD, MB, MI, MS, RD) __sse_if_mr( X86_SSE_CVTSI, MD, MB, MI, MS, RD) + +/* SSE2 */ +#define CVTPD2PIrr(RS, RD) _sse_ff_rr(0x66, X86_SSE_CVTSI, RS, RD) +#define CVTPD2PImr(MD, MB, MI, MS, RD) _sse_id_mr(0x66, X86_SSE_CVTSI, MD, MB, MI, MS, RD) + +/* SSE */ +#define CVTPI2PSrr(RS, RD) __sse_ff_rr( X86_SSE_CVTIS, RS, RD) +#define CVTPI2PSmr(MD, MB, MI, MS, RD) __sse_if_mr( X86_SSE_CVTIS, MD, MB, MI, MS, RD) + +/* SSE2 */ +#define CVTPI2PDrr(RS, RD) _sse_ff_rr(0x66, X86_SSE_CVTIS, RS, RD) +#define CVTPI2PDmr(MD, MB, MI, MS, RD) _sse_id_mr(0x66, X86_SSE_CVTIS, MD, MB, MI, MS, RD) + +/* SSE2 */ +#define CVTPS2PDrr(RS, RD) __sse_ff_rr( X86_SSE_CVTSD, RS, RD) +#define CVTPS2PDmr(MD, MB, MI, MS, RD) __sse_if_mr( X86_SSE_CVTSD, MD, MB, MI, MS, RD) +#define CVTPD2PSrr(RS, RD) _sse_ff_rr(0x66, X86_SSE_CVTSD, RS, RD) +#define CVTPD2PSmr(MD, MB, MI, MS, RD) _sse_id_mr(0x66, X86_SSE_CVTSD, MD, MB, MI, MS, RD) + +/* SSE2 */ +#define CVTSS2SDrr(RS, RD) _sse_ff_rr(0xf3, X86_SSE_CVTSD, RS, RD) +#define CVTSS2SDmr(MD, MB, MI, MS, RD) _sse_id_mr(0xf3, X86_SSE_CVTSD, MD, MB, MI, MS, RD) +#define CVTSD2SSrr(RS, RD) _sse_ff_rr(0xf2, X86_SSE_CVTSD, RS, RD) +#define CVTSD2SSmr(MD, MB, MI, MS, RD) _sse_id_mr(0xf2, X86_SSE_CVTSD, MD, MB, MI, MS, RD) + +/* SSE (the CVTT* macros use opcode X86_SSE_CVTTSI; the CVT*2SI macros further down use X86_SSE_CVTSI) */ +#define CVTTSS2SILrr(RS, RD) _sse_id_rr(0xf3, X86_SSE_CVTTSI, RS, RD) +#define CVTTSS2SILmr(MD, MB, MI, MS, RD) _sse_id_mr(0xf3, X86_SSE_CVTTSI, MD, MB, MI, MS, RD) + +/* SSE2 */ +#define CVTTSD2SILrr(RS, RD) _sse_id_rr(0xf2, X86_SSE_CVTTSI, RS, RD) +#define CVTTSD2SILmr(MD, MB, MI, MS, RD) _sse_id_mr(0xf2, X86_SSE_CVTTSI, MD, MB, MI, MS, RD) + +/* SSE */ +#define CVTSS2SILrr(RS, RD) _sse_if_rr(0xf3, X86_SSE_CVTSI, RS, RD) 
+#define CVTSS2SILmr(MD, MB, MI, MS, RD) _sse_if_mr(0xf3, X86_SSE_CVTSI, MD, MB, MI, MS, RD) + +/* SSE2 */ +#define CVTSD2SILrr(RS, RD) _sse_id_rr(0xf2, X86_SSE_CVTSI, RS, RD) +#define CVTSD2SILmr(MD, MB, MI, MS, RD) _sse_id_mr(0xf2, X86_SSE_CVTSI, MD, MB, MI, MS, RD) + +/* SSE */ +#define CVTSI2SSLrr(RS, RD) _sse_fi_rr(0xf3, X86_SSE_CVTIS, RS, RD) +#define CVTSI2SSLmr(MD, MB, MI, MS, RD) _sse_if_mr(0xf3, X86_SSE_CVTIS, MD, MB, MI, MS, RD) + +/* SSE2 */ +#define CVTSI2SDLrr(RS, RD) _sse_di_rr(0xf2, X86_SSE_CVTIS, RS, RD) +#define CVTSI2SDLmr(MD, MB, MI, MS, RD) _sse_id_mr(0xf2, X86_SSE_CVTIS, MD, MB, MI, MS, RD) + +/* SSE2 */ +#define MOVDLXrr(RS, RD) _sse_di_rr(0x66, X86_SSE_X2G, RS, RD) +#define MOVDLXmr(MD, MB, MI, MS, RD) _sse_id_mr(0x66, X86_SSE_X2G, MD, MB, MI, MS, RD) + +/* SSE2 */ +#define MOVDXLrr(RS, RD) _sse_ff_rr(0x66, X86_SSE_G2X, RS, RD) +#define MOVDXLrm(RS, MD, MB, MI, MS) _sse_di_rm(0x66, X86_SSE_G2X, RS, MD, MB, MI, MS) + +/* SSE */ +#define MOVDLMrr(RS, RD) __sse_ff_rr( X86_SSE_X2G, RS, RD) +#define MOVDLMmr(MD, MB, MI, MS, RD) __sse_id_mr( X86_SSE_X2G, MD, MB, MI, MS, RD) + +/* SSE */ +#define MOVDMLrr(RS, RD) __sse_ff_rr( X86_SSE_G2X, RS, RD) +#define MOVDMLrm(RS, MD, MB, MI, MS) __sse_fi_rm( X86_SSE_G2X, RS, MD, MB, MI, MS) + +/* SSE2 (MOVDQ2Q and MOVQ2DQ were introduced with SSE2, not SSE3) */ +#define MOVDQ2Qrr(RS, RD) _sse_ff_rr(0xf2, X86_SSE_MOV2, RS, RD) +#define MOVQ2DQrr(RS, RD) _sse_ff_rr(0xf3, X86_SSE_MOV2, RS, RD) + +/* SSE */ +#define MOVHLPSrr(RS, RD) __sse_ff_rr( X86_SSE_MOVLP, RS, RD) +#define MOVLHPSrr(RS, RD) __sse_ff_rr( X86_SSE_MOVHP, RS, RD) + +/* SSE2 */ +#define MOVDQArr(RS, RD) _sse_ff_rr(0x66, 0x6f, RS, RD) +#define MOVDQAmr(MD, MB, MI, MS, RD) _sse_id_mr(0x66, 0x6f, MD, MB, MI, MS, RD) +#define MOVDQArm(RS, MD, MB, MI, MS) _sse_di_rm(0x66, 0x7f, RS, MD, MB, MI, MS) + +/* SSE2 */ +#define MOVDQUrr(RS, RD) _sse_ff_rr(0xf3, 0x6f, RS, RD) +#define MOVDQUmr(MD, MB, MI, MS, RD) _sse_id_mr(0xf3, 0x6f, MD, MB, MI, MS, RD) +#define MOVDQUrm(RS, MD, MB, MI, MS) _sse_di_rm(0xf3, 0x7f, 
RS, MD, MB, MI, MS) + +/* SSE2 */ +#define MOVHPDmr(MD, MB, MI, MS, RD) _sse_id_mr (0x66, X86_SSE_MOVHP, MD, MB, MI, MS, RD) +#define MOVHPDrm(RS, MD, MB, MI, MS) _sse1_di_rm(0x66, X86_SSE_MOVHP, RS, MD, MB, MI, MS) + +/* SSE */ +#define MOVHPSmr(MD, MB, MI, MS, RD) __sse_if_mr ( X86_SSE_MOVHP, MD, MB, MI, MS, RD) +#define MOVHPSrm(RS, MD, MB, MI, MS) __sse1_fi_rm( X86_SSE_MOVHP, RS, MD, MB, MI, MS) + +/* SSE2 */ +#define MOVLPDmr(MD, MB, MI, MS, RD) _sse_id_mr (0x66, X86_SSE_MOVLP, MD, MB, MI, MS, RD) +#define MOVLPDrm(RS, MD, MB, MI, MS) _sse1_di_rm(0x66, X86_SSE_MOVLP, RS, MD, MB, MI, MS) + +/* SSE */ +#define MOVLPSmr(MD, MB, MI, MS, RD) __sse_if_mr ( X86_SSE_MOVLP, MD, MB, MI, MS, RD) +#define MOVLPSrm(RS, MD, MB, MI, MS) __sse1_fi_rm( X86_SSE_MOVLP, RS, MD, MB, MI, MS) + +/* FIXME 0x66 prefix actually required to modify 128 bits register */ +/* SSE or SSE2 with 0x66 prefix */ +/* The *rm forms below take their XMM register operand as RS and pass it as the register argument of the mr encoder. */ +#define PCMPEQBrr(RS, RD) \ + _sse_ff_rr(0x66, X86_SSE_EQB, RS, RD) +#define PCMPEQBrm(RS, MD, MB, MI, MS) \ + _sse_if_mr(0x66, X86_SSE_EQB, MD, MB, MI, MS, RS) +#define PCMPEQWrr(RS, RD) \ + _sse_ff_rr(0x66, X86_SSE_EQW, RS, RD) +#define PCMPEQWrm(RS, MD, MB, MI, MS) \ + _sse_if_mr(0x66, X86_SSE_EQW, MD, MB, MI, MS, RS) +#define PCMPEQLrr(RS, RD) \ + _sse_ff_rr(0x66, X86_SSE_EQD, RS, RD) +#define PCMPEQLrm(RS, MD, MB, MI, MS) \ + _sse_if_mr(0x66, X86_SSE_EQD, MD, MB, MI, MS, RS) + +/* SSE2 with 0x66 prefix, SSE otherwise */ +#define PSRLWrr(RS, RD) \ + _sse_ff_rr(0x66, 0xd1, RS, RD) +#define PSRLWrm(RS, MD, MB, MI, MS) \ + _sse_if_mr(0x66, 0xd1, MD, MB, MI, MS, RS) +#define PSRLWir(IM, RD) \ + (_O(0x66), \ + _REXLrr(_NOREG, RD), \ + _O(0x0f), \ + _O(0x71), \ + _Mrm(_b11, _b10, _rX(RD)), \ + _O(IM)) + +/* SSE2 with 0x66 prefix, SSE otherwise */ +#define PSRLLrr(RS, RD) \ + _sse_ff_rr(0x66, 0xd2, RS, RD) +#define PSRLLrm(RS, MD, MB, MI, MS) \ + _sse_id_mr(0x66, 0xd2, MD, MB, MI, MS, RS) +#define PSRLLir(IM, RD) \ + (_O(0x66), \ + _rex_if_rr(_NOREG, RD), \ + _O(0x0f), \ + _O(0x72), \ + 
_Mrm(_b11, _b10, _rX(RD)), \ + _O(IM)) + +/* SSE2 (the rm form takes its XMM register operand as RS) */ +#define PSRLQrr(RS, RD) \ + _sse_ff_rr(0x66, 0xd3, RS, RD) +#define PSRLQrm(RS, MD, MB, MI, MS) \ + _sse_id_mr(0x66, 0xd3, MD, MB, MI, MS, RS) +#define PSRLQir(IM, RD) \ + (_O(0x66), \ + _rex_if_rr(_NOREG, RD), \ + _O(0x0f), \ + _O(0x73), \ + _Mrm(_b11, _b10, _rX(RD)), \ + _O(IM)) + +/* SSE4.1 */ +#define ROUNDSSrri(RS, RD, IM) \ + (_O(0x66), _rex_ff_rr(RD, RS), _OO(0xf00|X86_SSE_ROUND), _O(0x0a), \ + _Mrm(_b11, _rX(RD), _rX(RS)), _O(IM)) +#define ROUNDSDrri(RS, RD, IM) \ + (_O(0x66), _rex_ff_rr(RD, RS), _OO(0xf00|X86_SSE_ROUND), _O(0x0b), \ + _Mrm(_b11, _rX(RD), _rX(RS)), _O(IM)) +#define PCMPEQQrr(RS, RD) \ + (_O(0x66), _rex_ff_rr(RD, RS), _OO(0x0f38), _O(0x29), \ + _Mrm(_b11, _rX(RD), _rX(RS))) + + +#ifdef JIT_X86_64 + +/* 64-bit operand variants: these pass w = 1 to the REX helper. */ +#define _rex_q_rr(rr, mr) x86_REXw_x_(0, 1, rr, 0, mr) +#define _rex_dl_rr(rr, mr) _rex_q_rr(rr, mr) +#define _rex_ld_rr(rr, mr) _rex_q_rr(rr, mr) + +#define __sse_lf_rr(op, rs, rd) __sse_ld_rr(op, rs, rd) +#define __sse_ld_rr(op, rs, rd) (_rex_ld_rr(rd, rs), _O(0x0f), _O(op), _Mrm(_b11, _rA(rd), _rX(rs))) + +#define __sse_fl_rr(op, rs, rd) __sse_dl_rr(op, rs, rd) +#define __sse_dl_rr(op, rs, rd) (_rex_dl_rr(rd, rs), _O(0x0f), _O(op), _Mrm(_b11, _rX(rd), _rA(rs))) + +#define _sse_lf_rr(px, op, rs, rd) _sse_ld_rr(px, op, rs, rd) +#define _sse_ld_rr(px, op, rs, rd) (_jit_B(px), __sse_ld_rr(op, rs, rd)) + +#define _sse_fl_rr(px, op, rs, rd) _sse_dl_rr(px, op, rs, rd) +#define _sse_dl_rr(px, op, rs, rd) (_jit_B(px), __sse_dl_rr(op, rs, rd)) + +#define CVTTSD2SIQrr(RS, RD) _sse_lf_rr(0xf2, X86_SSE_CVTTSI, RS, RD) +#define CVTSI2SDQrr(RS, RD) _sse_dl_rr(0xf2, X86_SSE_CVTIS, RS, RD) +#define MOVDQXrr(RS, RD) _sse_dl_rr(0x66, X86_SSE_X2G, RS, RD) + +#endif /*** References: */ /* */ diff --git a/src/racket/src/lightning/i386/core.h b/src/racket/src/lightning/i386/core.h index f622db79a3..99cb6132d5 100644 --- a/src/racket/src/lightning/i386/core.h +++ b/src/racket/src/lightning/i386/core.h @@ -65,6 
+65,14 @@ struct jit_local_state { #ifdef JIT_X86_64 int argpushes; #endif +#ifdef JIT_X86_SSE + /* Scratch used by jit_movi_d to view a double immediate as integer words; l must be pointer-sized so the x86_64 path sees all 64 bits even where long is 32 bits. */ + union { + int i[2]; + intptr_t l; + double d; + } d_data; + /* Address of a short jump emitted by jit_beqr_d / jit_bantieqr_d, kept until it is patched. */ + jit_insn *tmp_label; +#endif }; /* 3-parameter operation */ @@ -345,6 +353,7 @@ struct jit_local_state { #define jit_rshr_ul(d, r1, r2) jit_replace((r1), (r2), _ECX, jit_qop_ ((d), (r1), SHRQrr(_CL, (d)) )) /* Stack */ +#define jit_pushi_i(is) PUSHLi(is) #define jit_pushr_i(rs) PUSHLr(rs) #define jit_popr_i(rs) POPLr(rs) #define jit_pushr_l(rs) jit_pushr_i(rs) @@ -598,14 +607,16 @@ static const int const jit_arg_reg_order[] = { _EDI, _ESI, _EDX, _ECX }; #define jit_bxeqi_s(label, rs, is) (CMPWim(is, 0, rs, 0, 0), JEm(label,0,0,0), _jit.x.pc) #define jit_bxnei_s(label, rs, is) (CMPWim(is, 0, rs, 0, 0), JNEm(label,0,0,0), _jit.x.pc) +#if 0 +XFORM_NONGCING static intptr_t _CHECK_TINY(intptr_t diff) { if ((diff < -128) || (diff > 127)) *(intptr_t *)0x0 = 1; return diff; } +#else +# define _CHECK_TINY(x) (x) +#endif +/* Patch the 8-bit displacement stored in the byte just before jump_pc so the jump lands on v. */ +#define jit_patch_tiny_at(jump_pc,v) (*_PSC((jump_pc) - sizeof(char)) = _jit_SC(_CHECK_TINY((jit_insn *)(v) - (jump_pc)))) + #ifdef SUPPORT_TINY_JUMPS -# if 0 -static intptr_t _CHECK_TINY(intptr_t diff) { if ((diff < -128) || (diff > 127)) *(intptr_t *)0x0 = 1; return diff; } -# else -# define _CHECK_TINY(x) x -# endif # define jit_patch_normal_at(jump_pc,v) (_jitl.tiny_jumps \ - ? (*_PSC((jump_pc) - sizeof(char)) = _jit_SC(_CHECK_TINY((jit_insn *)(v) - (jump_pc)))) \ + ? 
jit_patch_tiny_at(jump_pc, v) \ : (*_PSI((jump_pc) - sizeof(int)) = _jit_SI((jit_insn *)(v) - (jump_pc)))) #else # define jit_patch_normal_at(jump_pc,v) (*_PSI((jump_pc) - sizeof(int)) = _jit_SI((jit_insn *)(v) - (jump_pc))) diff --git a/src/racket/src/lightning/i386/fp-common.h b/src/racket/src/lightning/i386/fp-common.h index 907fdc4d95..487a24c5b6 100644 --- a/src/racket/src/lightning/i386/fp-common.h +++ b/src/racket/src/lightning/i386/fp-common.h @@ -61,8 +61,6 @@ #define jit_abs_f(rd,rs) jit_abs_d(rd,rs) #define jit_negr_f(rd,rs) jit_negr_d(rd,rs) #define jit_sqrt_f(rd,rs) jit_sqrt_d(rd,rs) -#define jit_extr_f_d(rs, rd) -#define jit_extr_d_f(rs, rd) #define jit_extr_i_f(rd, rs) jit_extr_i_d(rd, rs) #define jit_roundr_f_i(rd, rs) jit_roundr_d_i(rd, rs) #define jit_floorr_f_i(rd, rs) jit_floorr_d_i(rd, rs) diff --git a/src/racket/src/lightning/i386/fp-sse.h b/src/racket/src/lightning/i386/fp-sse.h new file mode 100644 index 0000000000..193a833c8c --- /dev/null +++ b/src/racket/src/lightning/i386/fp-sse.h @@ -0,0 +1,202 @@ +/******************************** -*- C -*- **************************** + * + * Support macros for SSE floating-point math + * + ***********************************************************************/ + + +/*********************************************************************** + * + * Copyright 2006,2010 Free Software Foundation, Inc. + * + * This file is part of GNU lightning. + * + * GNU lightning is free software; you can redistribute it and/or modify it + * under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU lightning is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + * License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with GNU lightning; see the file COPYING.LESSER; if not, write to the + * Free Software Foundation, 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + * + * Authors: + * Paolo Bonzini + * Paulo Cesar Pereira de Andrade + ***********************************************************************/ + + +#ifndef __lightning_fp_sse_h +#define __lightning_fp_sse_h + +#define JIT_FPR_NUM 6 + +#define _XMM0 0x60 +#ifdef JIT_X86_64 +# define JIT_FPR(i) (_XMM0 + 8 + (i)) +#else +# define JIT_FPR(i) (_XMM0 + (i)) +#endif +/* Register just past the JIT_FPR_NUM numbered ones; the macros below use it as scratch. */ +#define JIT_FPTMP0 JIT_FPR(6) + +/* f0 = f1 + f2; when f0 is one of the sources, the other source is added in place. */ +#define jit_addr_d(f0, f1, f2) \ + ((f0 == f1) \ + ? ADDSDrr(f2, f0) \ + : ((f0 == f2) \ + ? ADDSDrr(f1, f0) \ + : (MOVSDrr(f1, f0), ADDSDrr(f2, f0)))) + +/* f0 = f1 - f2; when f0 is f2, f2 is first copied to JIT_FPTMP0 so that moving f1 into f0 does not destroy it. */ +#define jit_subr_d(f0, f1, f2) \ + ((f0 == f1) \ + ? SUBSDrr(f2, f0) \ + : ((f0 == f2) \ + ? (MOVSDrr(f0, JIT_FPTMP0), MOVSDrr(f1, f0), SUBSDrr(JIT_FPTMP0, f0)) \ + : (MOVSDrr(f1, f0), SUBSDrr(f2, f0)))) + +#define jit_subrr_d(f0, f1, f2) jit_subr_d(f0, f2, f1) + +#define jit_mulr_d(f0, f1, f2) \ + ((f0 == f1) \ + ? MULSDrr(f2, f0) \ + : ((f0 == f2) \ + ? MULSDrr(f1, f0) \ + : (MOVSDrr(f1, f0), MULSDrr(f2, f0)))) + +/* f0 = f1 / f2; uses JIT_FPTMP0 the same way jit_subr_d does when f0 is f2. */ +#define jit_divr_d(f0, f1, f2) \ + ((f0 == f1) \ + ? DIVSDrr(f2, f0) \ + : ((f0 == f2) \ + ? (MOVSDrr(f0, JIT_FPTMP0), MOVSDrr(f1, f0), DIVSDrr(JIT_FPTMP0, f0)) \ + : (MOVSDrr(f1, f0), DIVSDrr(f2, f0)))) + +#define jit_divrr_d(f0, f1, f2) jit_divr_d(f0, f2, f1) + +#define jit_ldr_f(f0, r0) MOVSSmr(0, r0, _NOREG, _SCL1, f0) +#define jit_ldr_d(f0, r0) MOVSDmr(0, r0, _NOREG, _SCL1, f0) + +#define _jit_ldi_d(f0, i0) MOVSDmr((long)i0, _NOREG, _NOREG, _SCL1, f0) +#ifdef JIT_X86_64 +# define jit_ldi_d(f0, i0) \ + (_u32P((intptr_t)(i0)) \ + ? 
_jit_ldi_d(f0, i0) \ + : (jit_movi_l(JIT_REXTMP, i0), jit_ldr_d(f0, JIT_REXTMP))) +#else +# define jit_ldi_d(f0, i0) _jit_ldi_d(f0, i0) +#endif + +#define jit_ldxr_d(f0, r0, r1) MOVSDmr(0, r0, r1, _SCL1, f0) + +#define jit_ldxi_d(f0, r0, i0) MOVSDmr(i0, r0, _NOREG, _SCL1, f0); + +#define jit_str_d(r0, f0) MOVSDrm(f0, 0, r0, _NOREG, _SCL1) + +#define _jit_sti_d(i0, f0) MOVSDrm(f0, (long)i0, _NOREG, _NOREG, _SCL1) +#ifdef JIT_X86_64 +# define jit_sti_d(i0, f0) \ + (_u32P((intptr_t)(i0)) \ + ? _jit_sti_d(i0, f0) \ + : (jit_movi_l(JIT_REXTMP, i0), jit_str_d(JIT_REXTMP, f0))) +#else +# define jit_sti_d(i0, f0) _jit_sti_d(i0, f0) +#endif + +#define jit_stxr_d(r0, r1, f0) MOVSDrm(f0, 0, r0, r1, _SCL1) + +#define jit_stxi_d(i0, r1, f0) MOVSDrm(f0, i0, r1, _NOREG, _SCL1) + +#define jit_movi_d(f0, i0) \ + (_jitl.d_data.d = i0, \ + ((_jitl.d_data.d == 0.0 && !(_jitl.d_data.i[1] & 0x80000000)) \ + ? XORPDrr(f0, f0) \ + : finish_movi_d(f0, i0))) +#ifdef JIT_X86_64 +# define finish_movi_d(f0, i0) (jit_movi_l(JIT_REXTMP, _jitl.d_data.l), MOVDQXrr(JIT_REXTMP, f0)) +#else +# define finish_movi_d(f0, i0) \ + (jit_pushi_i(_jitl.d_data.i[1]), jit_pushi_i(_jitl.d_data.i[0]), \ + jit_ldr_d(f0, JIT_SP), \ + jit_addi_l(JIT_SP, JIT_SP, sizeof(double))) +#endif + +# define jit_movr_d(f0, f1) ((f0 != f1) ? MOVSDrr(f1, f0) : (void)0) +# define jit_extr_i_d(f0, r0) CVTSI2SDLrr(r0, f0) + +#ifdef JIT_X86_64 +# define jit_extr_l_d(f0, r0) CVTSI2SDQrr(r0, f0) +#else +# define jit_extr_l_d(f0, r0) jit_extr_i_d(f0, r0) +#endif + +# define jit_extr_d_f(f0, f1) CVTSD2SSrr(f1, f0) + +#define jit_abs_d(f0, f1) \ + ((f0 == f1) \ + ? (PCMPEQLrr(JIT_FPTMP0, JIT_FPTMP0), PSRLQir(1, JIT_FPTMP0), ANDPDrr(JIT_FPTMP0, f0)) \ + : (PCMPEQLrr(f0, f0), PSRLQir(1, f0), ANDPDrr(f1, f0))) + +#define jit_sqrt_d(f0, f1) SQRTSDrr(f1, f0) + +#ifdef JIT_X86_64 +# define jit_negr_d(f0, f1) \ + (jit_movi_l(JIT_REXTMP, 0x8000000000000000), \ + ((f0 == f1) \ + ? 
(MOVDQXrr(JIT_REXTMP, JIT_FPTMP0), \ + XORPDrr(JIT_FPTMP0, f0)) \ + : (MOVDQXrr(JIT_REXTMP, f0), \ + XORPDrr(f1, f0)))) +#else +# define jit_negr_d(f0, f1) \ + (jit_pushi_i(0x80000000), \ + jit_pushi_i(0), \ + ((f0 == f1) \ + ? (jit_ldr_d(JIT_FPTMP0, JIT_SP), \ + XORPDrr(JIT_FPTMP0, f0)) \ + : (jit_ldr_d(f0, JIT_SP), \ + XORPDrr(f1, f0))), \ + jit_addi_l(JIT_SP, JIT_SP, sizeof(int) << 1)) +#endif + +/* Racket uses jit_roundr_l only for inexact->exact of fixnums, + so a truncate is good enough. */ +#define jit_roundr_d_i(r0, f0) jit_truncr_d_i(r0, f0) +#define jit_roundr_d_l(r0, f0) jit_truncr_d_l(r0, f0) + +#define jit_truncr_d_i(r0, f0) CVTTSD2SILrr(f0, r0) +#ifdef JIT_X86_64 +# define jit_truncr_d_l(r0, f0) CVTTSD2SIQrr(f0, r0) +#else +# define jit_truncr_d_l(r0, f0) jit_truncr_d_i(r0, f0) +#endif + +#define jit_bltr_d(label, f0, f1) (UCOMISDrr(f0, f1), JAEm(label,0,0,0), (_jit.x.pc)) +#define jit_bler_d(label, f0, f1) (UCOMISDrr(f0, f1), JBEm(label,0,0,0), (_jit.x.pc)) +#define jit_bgtr_d(label, f0, f1) (UCOMISDrr(f1, f0), JAm(label,0,0,0), (_jit.x.pc)) +#define jit_bger_d(label, f0, f1) (UCOMISDrr(f1, f0), JAEm(label,0,0,0), (_jit.x.pc)) +#define jit_beqr_d(label, f0, f1) \ + (UCOMISDrr(f0, f1), \ + _O_D8(0x70|(0xa), 0), /*JP */ \ + _jitl.tmp_label = _jit.x.pc, \ + JEm(label,0,0,0), \ + jit_patch_tiny_at(_jitl.tmp_label, _jit.x.pc), \ + _jit.x.pc) + +#define jit_bantiltr_d(label, f0, f1) (UCOMISDrr(f0, f1), JBEm(label,0,0,0), (_jit.x.pc)) +#define jit_bantiler_d(label, f0, f1) (UCOMISDrr(f0, f1), JBm(label,0,0,0), (_jit.x.pc)) +#define jit_bantigtr_d(label, f0, f1) (UCOMISDrr(f1, f0), JBEm(label,0,0,0), (_jit.x.pc)) +#define jit_bantiger_d(label, f0, f1) (UCOMISDrr(f1, f0), JBm(label,0,0,0), (_jit.x.pc)) +#define jit_bantieqr_d(label, f0, f1) \ + (UCOMISDrr(f0, f1), \ + _O_D8(0x70|(0xb), 0), /*JNP */ \ + _jitl.tmp_label = _jit.x.pc, \ + CMPLir(0, JIT_SP), \ + jit_patch_tiny_at(_jitl.tmp_label, _jit.x.pc), \ + JNEm(label,0,0,0), \ + _jit.x.pc) + +#endif /* 
__lightning_fp_sse_h */ diff --git a/src/racket/src/lightning/i386/fp.h b/src/racket/src/lightning/i386/fp.h index b099bb1623..a545b5a7aa 100644 --- a/src/racket/src/lightning/i386/fp.h +++ b/src/racket/src/lightning/i386/fp.h @@ -33,6 +33,12 @@ #ifndef __lightning_asm_fp_h #define __lightning_asm_fp_h +#ifdef JIT_X86_SSE + +# include "fp-sse.h" + +#else + /* We really must map the x87 stack onto a flat register file. In practice, we can provide something sensible and make it work on the x86 using the stack like a file of eight registers. @@ -478,4 +484,6 @@ union jit_double_imm { _OO(0xd9f1)) /* fyl2x */ #endif +#endif + #endif /* __lightning_asm_h */