From b223ad2d90597d20964330c94154336d6305f012 Mon Sep 17 00:00:00 2001 From: Matthew Flatt Date: Fri, 9 Nov 2012 21:25:57 -0700 Subject: [PATCH] x86_64 JIT: use 32-bit jumps until forced to allocate far away The JIT was pessimistically using 64-bit jumps for long branches or any jump between code that is allocated at different times. Normally, though, code allocation stays within the same 32-bit range of the heap, so stick to 32-bit jumps until forced by allocation addresses to use 64-bit jump targets. --- .../tests/racket/benchmarks/common/maze.sch | 2 +- .../tests/racket/benchmarks/common/sort1.sch | 2 +- src/racket/src/jit.c | 13 +-- src/racket/src/jit.h | 28 +++++- src/racket/src/jitcall.c | 2 +- src/racket/src/jitstate.c | 85 ++++++++++++++++++- src/racket/src/lightning/i386/asm.h | 25 ++++-- src/racket/src/lightning/i386/core.h | 15 +++- src/racket/src/lightning/ppc/core.h | 1 + 9 files changed, 148 insertions(+), 25 deletions(-) diff --git a/collects/tests/racket/benchmarks/common/maze.sch b/collects/tests/racket/benchmarks/common/maze.sch index 7e6a332877..3c7473337a 100644 --- a/collects/tests/racket/benchmarks/common/maze.sch +++ b/collects/tests/racket/benchmarks/common/maze.sch @@ -673,7 +673,7 @@ (let ((input (with-input-from-file "input.txt" read))) (time (let loop ((n 10000) (v 0)) (if (zero? n) - v + (length v) (begin (set! 
output '()) (pmaze 20 (if input 7 0)) diff --git a/collects/tests/racket/benchmarks/common/sort1.sch b/collects/tests/racket/benchmarks/common/sort1.sch index 8d118a0985..49fa82b82b 100644 --- a/collects/tests/racket/benchmarks/common/sort1.sch +++ b/collects/tests/racket/benchmarks/common/sort1.sch @@ -141,7 +141,7 @@ (define (sort-benchmark sorter n) (let ((l (rgen n 1000000))) - (time (sorter l <)))) + (time (length (sorter l <))))) (sort-benchmark sort1 1000000) diff --git a/src/racket/src/jit.c b/src/racket/src/jit.c index d8bf71e3e6..b520f7b5f7 100644 --- a/src/racket/src/jit.c +++ b/src/racket/src/jit.c @@ -321,7 +321,7 @@ int scheme_jit_check_closure_flonum_bit(Scheme_Closure_Data *data, int pos, int } #endif -#ifdef NEED_LONG_JUMPS +#ifdef NEED_LONG_BRANCHES static int is_short(Scheme_Object *obj, int fuel) { Scheme_Type t; @@ -1589,14 +1589,17 @@ static int generate_branch(Scheme_Object *obj, mz_jit_state *jitter, int is_tail int pushed_marks; int nsrs, nsrs1, g1, g2, amt, need_sync, flostack, flostack_pos; int else_is_empty = 0, i, can_chain_branch, chain_true, chain_false, old_self_pos; -#ifdef NEED_LONG_JUMPS +#ifdef NEED_LONG_BRANCHES int then_short_ok, else_short_ok; #else int then_short_ok = 1; +# ifdef NEED_LONG_JUMPS + int else_short_ok = 1; +# endif #endif START_JIT_DATA(); -#ifdef NEED_LONG_JUMPS +#ifdef NEED_LONG_BRANCHES /* It's possible that the code for a then or else branch will be so large that we might need a long jump. 
Conservatively analyze the @@ -1626,12 +1629,12 @@ static int generate_branch(Scheme_Object *obj, mz_jit_state *jitter, int is_tail if (can_chain_branch && chain_true) for_this_branch.true_needs_jump = 1; -#ifdef NEED_LONG_JUMPS +#ifdef NEED_LONG_BRANCHES if (can_chain_branch && (chain_true || chain_false) && !for_branch->branch_short) then_short_ok = 0; - for_this_branch.branch_short = then_short_ok; #endif + for_this_branch.branch_short = then_short_ok; LOG_IT(("if...\n")); diff --git a/src/racket/src/jit.h b/src/racket/src/jit.h index 0bd01ae12b..cfa2525261 100644 --- a/src/racket/src/jit.h +++ b/src/racket/src/jit.h @@ -7,7 +7,8 @@ visible to the GC. 3) Immediate operands must be 32-bit values on x86_64, except with - jit_movi, jit_sti, jit_ldi, jit_bXi, jit_calli, and jit_finishi. + jit_movi, jit_sti, jit_ldi, jit_bXi, jit_calli (in default-long + mode), and jit_finishi. 4) Function calls are limited to 3 arguments (i.e., jit_prepare() must never be called with a number greater than 3). 
This limit @@ -101,8 +102,17 @@ END_XFORM_ARITH; #define LOG_MZCHAR_SIZE 2 #if defined(MZ_USE_JIT_PPC) || defined(MZ_USE_JIT_X86_64) +/* Both PPC and x86_64 need long jumps, sometimes */ # define NEED_LONG_JUMPS #endif +#if defined(MZ_USE_JIT_PPC) +/* For PPC, long jumps may be needed even within a JIT-generated block */ +# define NEED_LONG_BRANCHES +#endif +#if defined(MZ_USE_JIT_X86_64) +/* For x86_64, long jumps are needed only if we start allocating far away */ +# define SET_DEFAULT_LONG_JUMPS +#endif /* Tiny jumps seem worthwhile for x86, but they don't seem to help for x86_64: */ #if defined(MZ_USE_JIT_I386) && !defined(MZ_USE_JIT_X86_64) # define USE_TINY_JUMPS @@ -865,7 +875,7 @@ static jit_insn *fp_tmpr; #ifdef NEED_LONG_JUMPS # define __START_SHORT_JUMPS__(cond) if (cond) { _jitl.long_jumps = 0; } -# define __END_SHORT_JUMPS__(cond) if (cond) { _jitl.long_jumps= 1; } +# define __END_SHORT_JUMPS__(cond) if (cond) { _jitl.long_jumps = LONG_JUMPS_DEFAULT(_jitl); } #else # define __START_SHORT_JUMPS__(cond) /* empty */ # define __END_SHORT_JUMPS__(cond) /* empty */ @@ -922,7 +932,19 @@ static jit_insn *fp_tmpr; Tiny-jump mode is like short-jump mode, but the offset must be within +/- 2^7. Favor tiny jumps over short jumps when possible. - All mz_finish() and jit_calli() are implicitly long jumps. + On x86_64, short is the default, since "short" is pretty long. + Short mode is never needed for jumps within a single allocated + block (on the assumption that a single block of code can never get + that long). Default-long mode must be enabled if allocated code + blocks can be far apart. + + A jit_calli() is "medium": for x86_64, it is short unless + default-long mode is enabled; otherwise, it is always + long. + + All mz_finish() are long jumps. This is true even in default-short + jump mode on x86_64, since the target is likely to be C code that + is not necessarily close to JIT-allocated code.
*/ /* A lightweight continuation is one that contains only frames from diff --git a/src/racket/src/jitcall.c b/src/racket/src/jitcall.c index a9200b65fa..6c594f206e 100644 --- a/src/racket/src/jitcall.c +++ b/src/racket/src/jitcall.c @@ -861,7 +861,7 @@ int scheme_generate_non_tail_call(mz_jit_state *jitter, int num_rands, int direc } #ifdef MZ_USE_JIT_I386 mz_patch_ucbranch(refr); - (void)jit_calli(refxr); + (void)jit_short_calli(refxr); #else jit_patch_movi(refr, (_jit.x.pc)); #endif diff --git a/src/racket/src/jitstate.c b/src/racket/src/jitstate.c index 1d433f9b58..c5da94fee7 100644 --- a/src/racket/src/jitstate.c +++ b/src/racket/src/jitstate.c @@ -42,6 +42,11 @@ THREAD_LOCAL_DECL(static void *jit_buffer_cache); THREAD_LOCAL_DECL(static intptr_t jit_buffer_cache_size); THREAD_LOCAL_DECL(static int jit_buffer_cache_registered); +#ifdef SET_DEFAULT_LONG_JUMPS +static int default_long_jumps; +static volatile uintptr_t code_low, code_high; +#endif + static void *get_end_pointer(mz_jit_state *jitter) { return jit_get_ip().ptr; @@ -101,6 +106,49 @@ double *scheme_mz_retain_double(mz_jit_state *jitter, double d) } #endif +#ifdef SET_DEFAULT_LONG_JUMPS +static int check_long_mode(uintptr_t low, uintptr_t high) +{ + if (default_long_jumps) + return 1; + + if (!code_low) + code_low = low; + else if (low < code_low) { +#ifdef MZ_USE_PLACES + while (!mzrt_cas(&code_low, code_low, low)) { + if (low >= code_low) + break; + } +#else + code_low = low; +#endif + } + + if (high > code_high) { +#ifdef MZ_USE_PLACES + while (!mzrt_cas(&code_high, code_high, high)) { + if (high <= code_high) + break; + } +#else + code_high = high; +#endif + } + + if ((code_high - code_low) >= ((uintptr_t)1 << 31)) { + if (!default_long_jumps) { + scheme_log_warning("warning: JIT switching to long-jump mode"); + default_long_jumps = 1; + } + return 1; + } + + return 0; +} +#endif + + void *scheme_generate_one(mz_jit_state *old_jitter, Generate_Proc generate, void *data, @@ -117,6 +165,9 @@ void 
*scheme_generate_one(mz_jit_state *old_jitter, intptr_t size_pre_retained = 0, size_pre_retained_double = 0, num_retained = 0, num_retained_double = 0, padding; int mappings_size = JIT_INIT_MAPPINGS_SIZE; int ok, max_extra_pushed = 0; +#ifdef SET_DEFAULT_LONG_JUMPS + int use_long_jumps = default_long_jumps; +#endif #ifdef MZ_PRECISE_GC Scheme_Object *fnl_obj; @@ -139,8 +190,11 @@ void *scheme_generate_one(mz_jit_state *old_jitter, while (1) { memset(jitter, 0, sizeof(_jitter)); +#ifdef SET_DEFAULT_LONG_JUMPS + _jitl.long_jumps_default = use_long_jumps; +#endif #ifdef NEED_LONG_JUMPS - _jitl.long_jumps = 1; + _jitl.long_jumps = LONG_JUMPS_DEFAULT(_jitl); #endif #ifdef USE_TINY_JUMPS _jitl.tiny_jumps = 0; @@ -197,7 +251,21 @@ void *scheme_generate_one(mz_jit_state *old_jitter, size_pre_retained = size; size_pre_retained_double = size; } - + +#ifdef SET_DEFAULT_LONG_JUMPS + if (!use_long_jumps) { + /* In the case that we start allocating so much that the address + moves beyond the 32-bit half where code normally resides, + then switch over to long-jump mode. */ + if (check_long_mode((uintptr_t)buffer, (uintptr_t)(buffer+size))) { + /* start over */ + known_size = 0; + use_long_jumps = 1; + continue; + } + } +#endif + (void)jit_set_ip(buffer).ptr; jitter->limit = (char *)buffer + size_pre_retained_double - padding; if (known_size) { @@ -238,6 +306,19 @@ void *scheme_generate_one(mz_jit_state *old_jitter, ok = generate(jitter, data); +#ifdef SET_DEFAULT_LONG_JUMPS + /* Check again after generate, because we may have + generated new code blocks along the way. 
*/ + if (!use_long_jumps) { + if (check_long_mode((uintptr_t)buffer, (uintptr_t)(buffer+size))) { + /* start over */ + known_size = 0; + use_long_jumps = 1; + continue; + } + } +#endif + if (save_ptr) { scheme_mz_retain_it(jitter, save_ptr); } diff --git a/src/racket/src/lightning/i386/asm.h b/src/racket/src/lightning/i386/asm.h index f201b8366e..d391fc9fe5 100644 --- a/src/racket/src/lightning/i386/asm.h +++ b/src/racket/src/lightning/i386/asm.h @@ -207,7 +207,7 @@ typedef _uc jit_insn; #ifdef JIT_X86_64 # define _REX_(P,R,X,B) ( _jit_B(P|((R&0x8)>>1)|((X&0x8)>>2)|((B&0x8)>>3)) ) # define _REX(R,X,B) _REX_(0x48,R,X,B) -# define _REXd(R,X,B) _REX_(0x40,R,X,B) +# define _REXd(R,X,B) ((B&0x8) ? _REX_(0x40,R,X,B) : 0) # define _qO( OP, R,X,B ) ( _REX(R,X,B), _jit_B( OP ) ) # define _qOd( OP, R,X,B ) ( _REXd(R,X,B), _jit_B( OP ) ) #else @@ -217,8 +217,10 @@ typedef _uc jit_insn; #define _Or( OP,R ) ( _jit_B( (OP)|_r(R)) ) #ifdef JIT_X86_64 # define _qOr( OP,R ) ( _REX(0,0,R), _jit_B( (OP)|_r(R&0x7)) ) +# define _qOdr( OP,R ) ( _REXd(0,0,R), _jit_B( (OP)|_r(R&0x7)) ) #else # define _qOr( OP,R ) _Or(OP,R) +# define _qOdr( OP,R ) _Or(OP,R) #endif #define _OO( OP ) ( _jit_B((OP)>>8), _jit_B( (OP) ) ) #ifdef JIT_X86_64 @@ -248,6 +250,7 @@ typedef _uc jit_insn; #define _qOr_Q( OP,R ,Q ) ( _qOr ( OP,R) ,_jit_L(Q) ) #define _O_Mrm( OP ,MO,R,M ) ( _O ( OP ),_Mrm(MO,R,M ) ) #define _qO_Mrm( OP ,MO,R,M ) ( _qO ( OP,R,0,M),_qMrm(MO,R,M ) ) +#define _qOd_Mrm( OP ,MO,R,M ) ( _qOd ( OP,R,0,M),_qMrm(MO,R,M ) ) #define _OO_Mrm( OP ,MO,R,M ) ( _OO ( OP ),_Mrm(MO,R,M ) ) #define _qOO_Mrm( OP ,MO,R,M ) ( _qOO ( OP ),_Mrm(MO,R,M ) ) #define _O_Mrm_B( OP ,MO,R,M ,B ) ( _O ( OP ),_Mrm(MO,R,M ) ,_jit_B(B) ) @@ -263,7 +266,7 @@ typedef _uc jit_insn; #define _O_r_X( OP ,R ,MD,MB,MI,MS ) ( _O ( OP ),_r_X( R ,MD,MB,MI,MS) ) #define _qO_r_X( OP ,R ,MD,MB,MI,MS ) ( _qO ( OP,R,0,MS),_qr_X(R,MD,MB,MI,MS) ) #define _qO_r_XB( OP ,R ,MD,MB,MI,MS ) ( _qO ( OP,R,0,MB),_qr_X(R,MD,MB,MI,MS) ) -#define 
_qO_r_Xd( OP ,R ,MD,MB,MI,MS ) ( _qOd ( OP,R,0,MB),_qr_X(R,MD,MB,MI,MS) ) +#define _qOd_r_X( OP ,R ,MD,MB,MI,MS ) ( _qOd ( OP,R,0,MB),_qr_X(R,MD,MB,MI,MS) ) #define _OO_r_X( OP ,R ,MD,MB,MI,MS ) ( _OO ( OP ),_r_X( R ,MD,MB,MI,MS) ) #define _qOO_r_X( OP ,R ,MD,MB,MI,MS ) ( _qOO ( OP ),_r_X( R ,MD,MB,MI,MS) ) #define _O_r_X_B( OP ,R ,MD,MB,MI,MS,B ) ( _O ( OP ),_r_X( R ,MD,MB,MI,MS) ,_jit_B(B) ) @@ -406,15 +409,21 @@ typedef _uc jit_insn; #define BTSLrr(RS,RD) _OO_Mrm (0x0fab ,_b11,_r4(RS),_r4(RD) ) #define BTSLrm(RS,MD,MB,MI,MS) _OO_r_X (0x0fab ,_r4(RS) ,MD,MB,MI,MS ) +#ifdef _ASM_SAFETY +# define CALLmL(D,B,I,S) ((_r0P(B) && _r0P(I)) ? _O_D32 (0xe8 ,(intptr_t)(D) ) : \ + JITFAIL("illegal mode in direct jump")) +#else +# define CALLmL(D,B,I,S) _O_D32 (0xe8 ,(intptr_t)(D) ) +#endif + #ifdef JIT_X86_64 # define CALLm(D,B,I,S) (MOVQir((D), JIT_REXTMP), CALQsr(JIT_REXTMP)) #else -# define CALLm(D,B,I,S) ((_r0P(B) && _r0P(I)) ? _O_D32 (0xe8 ,(intptr_t)(D) ) : \ - JITFAIL("illegal mode in direct jump")) +# define CALLm(D,B,I,S) CALLmL(D,B,I,S) #endif #define CALLsr(R) _O_Mrm (0xff ,_b11,_b010,_r4(R) ) -#define CALQsr(R) _qO_Mrm (0xff ,_b11,_b010,_r8(R)) +#define CALQsr(R) _qOd_Mrm(0xff ,_b11,_b010,_r8(R)) #define CALLsm(D,B,I,S) _O_r_X (0xff ,_b010 ,(intptr_t)(D),B,I,S ) @@ -693,7 +702,7 @@ typedef _uc jit_insn; #define MOVQmr(MD, MB, MI, MS, RD) _qO_r_X (0x8b ,_r8(RD) ,MD,MB,MI,MS ) #define MOVQmQr(MD, MB, MI, MS, RD) _qO_r_XB (0x8b ,_r8(RD) ,MD,MB,MI,MS ) -#define MOVQrm(RS, MD, MB, MI, MS) _qO_r_Xd (0x89 ,_r8(RS) ,MD,MB,MI,MS ) +#define MOVQrm(RS, MD, MB, MI, MS) _qOd_r_X (0x89 ,_r8(RS) ,MD,MB,MI,MS ) #define MOVQrQm(RS, MD, MB, MI, MS) _qO_r_XB (0x89 ,_r8(RS) ,MD,MB,MI,MS ) #define MOVQir(IM, R) _qOr_Q (0xb8,_r8(R) ,IM ) @@ -776,7 +785,7 @@ typedef _uc jit_insn; #define POPLr(RD) _Or (0x58,_r4(RD) ) #define POPLm(MD,MB,MI,MS) _O_r_X (0x8f ,_b000 ,MD,MB,MI,MS ) -#define POPQr(RD) _qOr (0x58,_r8(RD) ) +#define POPQr(RD) _qOdr (0x58,_r8(RD) ) #define POPA_() _wO (0x61 ) 
@@ -794,7 +803,7 @@ typedef _uc jit_insn; #define PUSHLm(MD,MB,MI,MS) _O_r_X (0xff ,_b110 ,MD,MB,MI,MS ) #define PUSHLi(IM) _Os_sL (0x68 ,IM ) -#define PUSHQr(R) _qOr (0x50,_r8(R) ) +#define PUSHQr(R) _qOdr (0x50,_r8(R) ) #define PUSHA_() _wO (0x60 ) #define PUSHAD_() _O (0x60 ) diff --git a/src/racket/src/lightning/i386/core.h b/src/racket/src/lightning/i386/core.h index 4693041eac..791584b183 100644 --- a/src/racket/src/lightning/i386/core.h +++ b/src/racket/src/lightning/i386/core.h @@ -52,7 +52,8 @@ struct jit_local_state { #ifdef JIT_X86_64 - int long_jumps; + int long_jumps, long_jumps_default; +# define LONG_JUMPS_DEFAULT(jitl) (jitl.long_jumps_default) int nextarg_geti; #else int framesize; @@ -432,8 +433,8 @@ struct jit_local_state { # define jit_normal_pushonlyarg_i(rs) (_jitl.argpushes--, MOVQrr(rs, jit_arg_reg_order[0])) # define jit_save_argstate(curstate) curstate = _jitl.argpushes; # define jit_restore_argstate(curstate) _jitl.argpushes = curstate; -# define jit_finish(sub) (jit_shift_args(), (void)jit_calli((sub)), jit_restore_locals()) -# define jit_normal_finish(sub) jit_calli((sub)) +# define jit_finish(sub) (jit_shift_args(), (void)jit_long_calli((sub)), jit_restore_locals()) +# define jit_normal_finish(sub) jit_long_calli((sub)) # define jit_return_pop_insn_len() 0 # define jit_finishr(reg) ((jit_reg_is_arg((reg)) ? MOVQrr(reg, JIT_REXTMP) : (void)0), \ jit_shift_args(), \ @@ -604,7 +605,13 @@ static const int const jit_arg_reg_order[] = { _EDI, _ESI, _EDX, _ECX }; #define jit_bmci_l(label, rs, is) jit_bmci_i(label, rs, is) #define jit_jmpi(label) (JMPm( ((uintptr_t) (label)), 0, 0, 0), _jit.x.pc) -#define jit_calli(label) (CALLm( ((uintptr_t) (label)), 0, 0, 0), _jit.x.pc) +#define jit_long_calli(label) (CALLm( ((uintptr_t) (label)), 0, 0, 0), _jit.x.pc) +#define jit_short_calli(label) (CALLmL( ((uintptr_t) (label)), 0, 0, 0), _jit.x.pc) +#ifdef JIT_X86_64 +# define jit_calli(label) (_jitl.long_jumps_default ? 
jit_long_calli(label) : jit_short_calli(label)) +#else +# define jit_calli(label) jit_long_calli(label) +#endif #define jit_callr(reg) (CALLsr(reg)) #define jit_jmpr(reg) JMPsr(reg) diff --git a/src/racket/src/lightning/ppc/core.h b/src/racket/src/lightning/ppc/core.h index fc6b638fc3..3b26d9c8b1 100644 --- a/src/racket/src/lightning/ppc/core.h +++ b/src/racket/src/lightning/ppc/core.h @@ -43,6 +43,7 @@ struct jit_local_state { int nextarg_getd; /* The FP args are picked up from FPR1 -> FPR10 */ int nbArgs; /* Number of arguments for the prolog */ int long_jumps; /* 1 => patch or leave room for long jumps */ +# define LONG_JUMPS_DEFAULT(jitl) 1 }; #define JIT_SP 1