x86_64 JIT: use 32-bit jumps until forced to allocate far away
The JIT was pessimistically using 64-bit jumps for long branches or any jump between code that is allocated at different times. Normally, though, code allocation stays within the same 32-bit range of the heap, so stick to 32-bit jumps until forced by allocation addresses to use 64-bit jump targets.
This commit is contained in:
parent
e217aaa507
commit
b223ad2d90
|
@ -673,7 +673,7 @@
|
|||
(let ((input (with-input-from-file "input.txt" read)))
|
||||
(time (let loop ((n 10000) (v 0))
|
||||
(if (zero? n)
|
||||
v
|
||||
(length v)
|
||||
(begin
|
||||
(set! output '())
|
||||
(pmaze 20 (if input 7 0))
|
||||
|
|
|
@ -141,7 +141,7 @@
|
|||
|
||||
(define (sort-benchmark sorter n)
|
||||
(let ((l (rgen n 1000000)))
|
||||
(time (sorter l <))))
|
||||
(time (length (sorter l <)))))
|
||||
|
||||
(sort-benchmark sort1 1000000)
|
||||
|
||||
|
|
|
@ -321,7 +321,7 @@ int scheme_jit_check_closure_flonum_bit(Scheme_Closure_Data *data, int pos, int
|
|||
}
|
||||
#endif
|
||||
|
||||
#ifdef NEED_LONG_JUMPS
|
||||
#ifdef NEED_LONG_BRANCHES
|
||||
static int is_short(Scheme_Object *obj, int fuel)
|
||||
{
|
||||
Scheme_Type t;
|
||||
|
@ -1589,14 +1589,17 @@ static int generate_branch(Scheme_Object *obj, mz_jit_state *jitter, int is_tail
|
|||
int pushed_marks;
|
||||
int nsrs, nsrs1, g1, g2, amt, need_sync, flostack, flostack_pos;
|
||||
int else_is_empty = 0, i, can_chain_branch, chain_true, chain_false, old_self_pos;
|
||||
#ifdef NEED_LONG_JUMPS
|
||||
#ifdef NEED_LONG_BRANCHES
|
||||
int then_short_ok, else_short_ok;
|
||||
#else
|
||||
int then_short_ok = 1;
|
||||
# ifdef NEED_LONG_JUMPS
|
||||
int else_short_ok = 1;
|
||||
# endif
|
||||
#endif
|
||||
START_JIT_DATA();
|
||||
|
||||
#ifdef NEED_LONG_JUMPS
|
||||
#ifdef NEED_LONG_BRANCHES
|
||||
/* It's possible that the code for a then
|
||||
or else branch will be so large that we might
|
||||
need a long jump. Conservatively analyze the
|
||||
|
@ -1626,12 +1629,12 @@ static int generate_branch(Scheme_Object *obj, mz_jit_state *jitter, int is_tail
|
|||
|
||||
if (can_chain_branch && chain_true)
|
||||
for_this_branch.true_needs_jump = 1;
|
||||
#ifdef NEED_LONG_JUMPS
|
||||
#ifdef NEED_LONG_BRANCHES
|
||||
if (can_chain_branch && (chain_true || chain_false)
|
||||
&& !for_branch->branch_short)
|
||||
then_short_ok = 0;
|
||||
for_this_branch.branch_short = then_short_ok;
|
||||
#endif
|
||||
for_this_branch.branch_short = then_short_ok;
|
||||
|
||||
LOG_IT(("if...\n"));
|
||||
|
||||
|
|
|
@ -7,7 +7,8 @@
|
|||
visible to the GC.
|
||||
|
||||
3) Immediate operands must be 32-bit values on x86_64, except with
|
||||
jit_movi, jit_sti, jit_ldi, jit_bXi, jit_calli, and jit_finishi.
|
||||
jit_movi, jit_sti, jit_ldi, jit_bXi, jit_calli (in default-long
|
||||
mode), and jit_finishi.
|
||||
|
||||
4) Function calls are limited to 3 arguments (i.e., jit_prepare()
|
||||
must never be called with a number greater than 3). This limit
|
||||
|
@ -101,8 +102,17 @@ END_XFORM_ARITH;
|
|||
#define LOG_MZCHAR_SIZE 2
|
||||
|
||||
#if defined(MZ_USE_JIT_PPC) || defined(MZ_USE_JIT_X86_64)
|
||||
/* Both PPC and x86_64 need long jumps, sometimes */
|
||||
# define NEED_LONG_JUMPS
|
||||
#endif
|
||||
#if defined(MZ_USE_JIT_PPC)
|
||||
/* For PPC, long jumps may be needed even within a JIT-generated block */
|
||||
# define NEED_LONG_BRANCHES
|
||||
#endif
|
||||
#if defined(MZ_USE_JIT_X86_64)
|
||||
/* For x86_64, long jumps are needed only if we start allocating far away */
|
||||
# define SET_DEFAULT_LONG_JUMPS
|
||||
#endif
|
||||
/* Tiny jumps seem worthwhile for x86, but they don't seem to help for x86_64: */
|
||||
#if defined(MZ_USE_JIT_I386) && !defined(MZ_USE_JIT_X86_64)
|
||||
# define USE_TINY_JUMPS
|
||||
|
@ -865,7 +875,7 @@ static jit_insn *fp_tmpr;
|
|||
|
||||
#ifdef NEED_LONG_JUMPS
|
||||
# define __START_SHORT_JUMPS__(cond) if (cond) { _jitl.long_jumps = 0; }
|
||||
# define __END_SHORT_JUMPS__(cond) if (cond) { _jitl.long_jumps= 1; }
|
||||
# define __END_SHORT_JUMPS__(cond) if (cond) { _jitl.long_jumps = LONG_JUMPS_DEFAULT(_jitl); }
|
||||
#else
|
||||
# define __START_SHORT_JUMPS__(cond) /* empty */
|
||||
# define __END_SHORT_JUMPS__(cond) /* empty */
|
||||
|
@ -922,7 +932,19 @@ static jit_insn *fp_tmpr;
|
|||
Tiny-jump mode is like short-jump mode, but the offset must be
|
||||
within +/- 2^7. Favor tiny jumps over short jumps when possible.
|
||||
|
||||
All mz_finish() and jit_calli() are implicitly long jumps.
|
||||
On x86_64, short is the default, since "short" is pretty long.
|
||||
Long mode is never needed for jumps within a single allocated
|
||||
block (on the assumption that a single block of code can never get
|
||||
that long). Default-long mode must be enabled if allocated code
|
||||
blocks can be far apart.
|
||||
|
||||
A jit_calli() is "medium": for x86_64, it is short unless
|
||||
default-long mode is enabled; otherwise, it is always
|
||||
long.
|
||||
|
||||
All mz_finish() are long jumps. This is true even in default-short
|
||||
jump mode on x86_64, since the target is likely to be C code that
|
||||
is not necessarily close to JIT-allocated code.
|
||||
*/
|
||||
|
||||
/* A lightweight continuation is one that contains only frames from
|
||||
|
|
|
@ -861,7 +861,7 @@ int scheme_generate_non_tail_call(mz_jit_state *jitter, int num_rands, int direc
|
|||
}
|
||||
#ifdef MZ_USE_JIT_I386
|
||||
mz_patch_ucbranch(refr);
|
||||
(void)jit_calli(refxr);
|
||||
(void)jit_short_calli(refxr);
|
||||
#else
|
||||
jit_patch_movi(refr, (_jit.x.pc));
|
||||
#endif
|
||||
|
|
|
@ -42,6 +42,11 @@ THREAD_LOCAL_DECL(static void *jit_buffer_cache);
|
|||
THREAD_LOCAL_DECL(static intptr_t jit_buffer_cache_size);
|
||||
THREAD_LOCAL_DECL(static int jit_buffer_cache_registered);
|
||||
|
||||
#ifdef SET_DEFAULT_LONG_JUMPS
|
||||
static int default_long_jumps;
|
||||
static volatile uintptr_t code_low, code_high;
|
||||
#endif
|
||||
|
||||
static void *get_end_pointer(mz_jit_state *jitter)
|
||||
{
|
||||
return jit_get_ip().ptr;
|
||||
|
@ -101,6 +106,49 @@ double *scheme_mz_retain_double(mz_jit_state *jitter, double d)
|
|||
}
|
||||
#endif
|
||||
|
||||
#ifdef SET_DEFAULT_LONG_JUMPS
|
||||
/* Records the address range [low, high] of a code buffer in the
   file-level bounds code_low/code_high and reports whether long-jump
   mode is required.

   Returns 1 if default_long_jumps is already set, or if the recorded
   span (code_high - code_low) has reached 2^31 bytes; in the latter
   case default_long_jumps is set and a warning is logged the first
   time. Returns 0 otherwise. */
static int check_long_mode(uintptr_t low, uintptr_t high)
|
||||
{
|
||||
if (default_long_jumps)
|
||||
return 1;
|
||||
|
||||
/* A code_low of 0 is treated as "no lower bound recorded yet".
   NOTE(review): this first store is a plain assignment even when
   MZ_USE_PLACES is defined -- confirm that is intended. */
if (!code_low)
|
||||
code_low = low;
|
||||
else if (low < code_low) {
|
||||
#ifdef MZ_USE_PLACES
|
||||
/* Retry until the update goes through or the stored bound is
   already at or below `low` (presumably mzrt_cas is a
   compare-and-swap -- verify against its definition). */
while (!mzrt_cas(&code_low, code_low, low)) {
|
||||
if (low >= code_low)
|
||||
break;
|
||||
}
|
||||
#else
|
||||
code_low = low;
|
||||
#endif
|
||||
}
|
||||
|
||||
if (high > code_high) {
|
||||
#ifdef MZ_USE_PLACES
|
||||
/* Same retry scheme as for code_low, mirrored for the upper bound. */
while (!mzrt_cas(&code_high, code_high, high)) {
|
||||
if (high <= code_high)
|
||||
break;
|
||||
}
|
||||
#else
|
||||
code_high = high;
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Once the recorded range spans 2^31 bytes or more, switch the
   default to long jumps. */
if ((code_high - code_low) >= ((uintptr_t)1 << 31)) {
|
||||
if (!default_long_jumps) {
|
||||
scheme_log_warning("warning: JIT switching to long-jump mode");
|
||||
default_long_jumps = 1;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
void *scheme_generate_one(mz_jit_state *old_jitter,
|
||||
Generate_Proc generate,
|
||||
void *data,
|
||||
|
@ -117,6 +165,9 @@ void *scheme_generate_one(mz_jit_state *old_jitter,
|
|||
intptr_t size_pre_retained = 0, size_pre_retained_double = 0, num_retained = 0, num_retained_double = 0, padding;
|
||||
int mappings_size = JIT_INIT_MAPPINGS_SIZE;
|
||||
int ok, max_extra_pushed = 0;
|
||||
#ifdef SET_DEFAULT_LONG_JUMPS
|
||||
int use_long_jumps = default_long_jumps;
|
||||
#endif
|
||||
#ifdef MZ_PRECISE_GC
|
||||
Scheme_Object *fnl_obj;
|
||||
|
||||
|
@ -139,8 +190,11 @@ void *scheme_generate_one(mz_jit_state *old_jitter,
|
|||
|
||||
while (1) {
|
||||
memset(jitter, 0, sizeof(_jitter));
|
||||
#ifdef SET_DEFAULT_LONG_JUMPS
|
||||
_jitl.long_jumps_default = use_long_jumps;
|
||||
#endif
|
||||
#ifdef NEED_LONG_JUMPS
|
||||
_jitl.long_jumps = 1;
|
||||
_jitl.long_jumps = LONG_JUMPS_DEFAULT(_jitl);
|
||||
#endif
|
||||
#ifdef USE_TINY_JUMPS
|
||||
_jitl.tiny_jumps = 0;
|
||||
|
@ -197,7 +251,21 @@ void *scheme_generate_one(mz_jit_state *old_jitter,
|
|||
size_pre_retained = size;
|
||||
size_pre_retained_double = size;
|
||||
}
|
||||
|
||||
|
||||
#ifdef SET_DEFAULT_LONG_JUMPS
|
||||
if (!use_long_jumps) {
|
||||
/* In the case that we start allocating so much that the address
|
||||
moves beyond the 32-bit half where code normally resides,
|
||||
then switch over to long-jump mode. */
|
||||
if (check_long_mode((uintptr_t)buffer, (uintptr_t)(buffer+size))) {
|
||||
/* start over */
|
||||
known_size = 0;
|
||||
use_long_jumps = 1;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
(void)jit_set_ip(buffer).ptr;
|
||||
jitter->limit = (char *)buffer + size_pre_retained_double - padding;
|
||||
if (known_size) {
|
||||
|
@ -238,6 +306,19 @@ void *scheme_generate_one(mz_jit_state *old_jitter,
|
|||
|
||||
ok = generate(jitter, data);
|
||||
|
||||
#ifdef SET_DEFAULT_LONG_JUMPS
|
||||
/* Check again after generate, because we may have
|
||||
generated new code blocks along the way. */
|
||||
if (!use_long_jumps) {
|
||||
if (check_long_mode((uintptr_t)buffer, (uintptr_t)(buffer+size))) {
|
||||
/* start over */
|
||||
known_size = 0;
|
||||
use_long_jumps = 1;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if (save_ptr) {
|
||||
scheme_mz_retain_it(jitter, save_ptr);
|
||||
}
|
||||
|
|
|
@ -207,7 +207,7 @@ typedef _uc jit_insn;
|
|||
#ifdef JIT_X86_64
|
||||
# define _REX_(P,R,X,B) ( _jit_B(P|((R&0x8)>>1)|((X&0x8)>>2)|((B&0x8)>>3)) )
|
||||
# define _REX(R,X,B) _REX_(0x48,R,X,B)
|
||||
# define _REXd(R,X,B) _REX_(0x40,R,X,B)
|
||||
# define _REXd(R,X,B) ((B&0x8) ? _REX_(0x40,R,X,B) : 0)
|
||||
# define _qO( OP, R,X,B ) ( _REX(R,X,B), _jit_B( OP ) )
|
||||
# define _qOd( OP, R,X,B ) ( _REXd(R,X,B), _jit_B( OP ) )
|
||||
#else
|
||||
|
@ -217,8 +217,10 @@ typedef _uc jit_insn;
|
|||
#define _Or( OP,R ) ( _jit_B( (OP)|_r(R)) )
|
||||
#ifdef JIT_X86_64
|
||||
# define _qOr( OP,R ) ( _REX(0,0,R), _jit_B( (OP)|_r(R&0x7)) )
|
||||
# define _qOdr( OP,R ) ( _REXd(0,0,R), _jit_B( (OP)|_r(R&0x7)) )
|
||||
#else
|
||||
# define _qOr( OP,R ) _Or(OP,R)
|
||||
# define _qOdr( OP,R ) _Or(OP,R)
|
||||
#endif
|
||||
#define _OO( OP ) ( _jit_B((OP)>>8), _jit_B( (OP) ) )
|
||||
#ifdef JIT_X86_64
|
||||
|
@ -248,6 +250,7 @@ typedef _uc jit_insn;
|
|||
#define _qOr_Q( OP,R ,Q ) ( _qOr ( OP,R) ,_jit_L(Q) )
|
||||
#define _O_Mrm( OP ,MO,R,M ) ( _O ( OP ),_Mrm(MO,R,M ) )
|
||||
#define _qO_Mrm( OP ,MO,R,M ) ( _qO ( OP,R,0,M),_qMrm(MO,R,M ) )
|
||||
#define _qOd_Mrm( OP ,MO,R,M ) ( _qOd ( OP,R,0,M),_qMrm(MO,R,M ) )
|
||||
#define _OO_Mrm( OP ,MO,R,M ) ( _OO ( OP ),_Mrm(MO,R,M ) )
|
||||
#define _qOO_Mrm( OP ,MO,R,M ) ( _qOO ( OP ),_Mrm(MO,R,M ) )
|
||||
#define _O_Mrm_B( OP ,MO,R,M ,B ) ( _O ( OP ),_Mrm(MO,R,M ) ,_jit_B(B) )
|
||||
|
@ -263,7 +266,7 @@ typedef _uc jit_insn;
|
|||
#define _O_r_X( OP ,R ,MD,MB,MI,MS ) ( _O ( OP ),_r_X( R ,MD,MB,MI,MS) )
|
||||
#define _qO_r_X( OP ,R ,MD,MB,MI,MS ) ( _qO ( OP,R,0,MS),_qr_X(R,MD,MB,MI,MS) )
|
||||
#define _qO_r_XB( OP ,R ,MD,MB,MI,MS ) ( _qO ( OP,R,0,MB),_qr_X(R,MD,MB,MI,MS) )
|
||||
#define _qO_r_Xd( OP ,R ,MD,MB,MI,MS ) ( _qOd ( OP,R,0,MB),_qr_X(R,MD,MB,MI,MS) )
|
||||
#define _qOd_r_X( OP ,R ,MD,MB,MI,MS ) ( _qOd ( OP,R,0,MB),_qr_X(R,MD,MB,MI,MS) )
|
||||
#define _OO_r_X( OP ,R ,MD,MB,MI,MS ) ( _OO ( OP ),_r_X( R ,MD,MB,MI,MS) )
|
||||
#define _qOO_r_X( OP ,R ,MD,MB,MI,MS ) ( _qOO ( OP ),_r_X( R ,MD,MB,MI,MS) )
|
||||
#define _O_r_X_B( OP ,R ,MD,MB,MI,MS,B ) ( _O ( OP ),_r_X( R ,MD,MB,MI,MS) ,_jit_B(B) )
|
||||
|
@ -406,15 +409,21 @@ typedef _uc jit_insn;
|
|||
#define BTSLrr(RS,RD) _OO_Mrm (0x0fab ,_b11,_r4(RS),_r4(RD) )
|
||||
#define BTSLrm(RS,MD,MB,MI,MS) _OO_r_X (0x0fab ,_r4(RS) ,MD,MB,MI,MS )
|
||||
|
||||
#ifdef _ASM_SAFETY
|
||||
# define CALLmL(D,B,I,S) ((_r0P(B) && _r0P(I)) ? _O_D32 (0xe8 ,(intptr_t)(D) ) : \
|
||||
JITFAIL("illegal mode in direct jump"))
|
||||
#else
|
||||
# define CALLmL(D,B,I,S) _O_D32 (0xe8 ,(intptr_t)(D) )
|
||||
#endif
|
||||
|
||||
#ifdef JIT_X86_64
|
||||
# define CALLm(D,B,I,S) (MOVQir((D), JIT_REXTMP), CALQsr(JIT_REXTMP))
|
||||
#else
|
||||
# define CALLm(D,B,I,S) ((_r0P(B) && _r0P(I)) ? _O_D32 (0xe8 ,(intptr_t)(D) ) : \
|
||||
JITFAIL("illegal mode in direct jump"))
|
||||
# define CALLm(D,B,I,S) CALLmL(D,B,I,S)
|
||||
#endif
|
||||
|
||||
#define CALLsr(R) _O_Mrm (0xff ,_b11,_b010,_r4(R) )
|
||||
#define CALQsr(R) _qO_Mrm (0xff ,_b11,_b010,_r8(R))
|
||||
#define CALQsr(R) _qOd_Mrm(0xff ,_b11,_b010,_r8(R))
|
||||
|
||||
#define CALLsm(D,B,I,S) _O_r_X (0xff ,_b010 ,(intptr_t)(D),B,I,S )
|
||||
|
||||
|
@ -693,7 +702,7 @@ typedef _uc jit_insn;
|
|||
|
||||
#define MOVQmr(MD, MB, MI, MS, RD) _qO_r_X (0x8b ,_r8(RD) ,MD,MB,MI,MS )
|
||||
#define MOVQmQr(MD, MB, MI, MS, RD) _qO_r_XB (0x8b ,_r8(RD) ,MD,MB,MI,MS )
|
||||
#define MOVQrm(RS, MD, MB, MI, MS) _qO_r_Xd (0x89 ,_r8(RS) ,MD,MB,MI,MS )
|
||||
#define MOVQrm(RS, MD, MB, MI, MS) _qOd_r_X (0x89 ,_r8(RS) ,MD,MB,MI,MS )
|
||||
#define MOVQrQm(RS, MD, MB, MI, MS) _qO_r_XB (0x89 ,_r8(RS) ,MD,MB,MI,MS )
|
||||
#define MOVQir(IM, R) _qOr_Q (0xb8,_r8(R) ,IM )
|
||||
|
||||
|
@ -776,7 +785,7 @@ typedef _uc jit_insn;
|
|||
#define POPLr(RD) _Or (0x58,_r4(RD) )
|
||||
#define POPLm(MD,MB,MI,MS) _O_r_X (0x8f ,_b000 ,MD,MB,MI,MS )
|
||||
|
||||
#define POPQr(RD) _qOr (0x58,_r8(RD) )
|
||||
#define POPQr(RD) _qOdr (0x58,_r8(RD) )
|
||||
|
||||
|
||||
#define POPA_() _wO (0x61 )
|
||||
|
@ -794,7 +803,7 @@ typedef _uc jit_insn;
|
|||
#define PUSHLm(MD,MB,MI,MS) _O_r_X (0xff ,_b110 ,MD,MB,MI,MS )
|
||||
#define PUSHLi(IM) _Os_sL (0x68 ,IM )
|
||||
|
||||
#define PUSHQr(R) _qOr (0x50,_r8(R) )
|
||||
#define PUSHQr(R) _qOdr (0x50,_r8(R) )
|
||||
|
||||
#define PUSHA_() _wO (0x60 )
|
||||
#define PUSHAD_() _O (0x60 )
|
||||
|
|
|
@ -52,7 +52,8 @@
|
|||
|
||||
struct jit_local_state {
|
||||
#ifdef JIT_X86_64
|
||||
int long_jumps;
|
||||
int long_jumps, long_jumps_default;
|
||||
# define LONG_JUMPS_DEFAULT(jitl) (jitl.long_jumps_default)
|
||||
int nextarg_geti;
|
||||
#else
|
||||
int framesize;
|
||||
|
@ -432,8 +433,8 @@ struct jit_local_state {
|
|||
# define jit_normal_pushonlyarg_i(rs) (_jitl.argpushes--, MOVQrr(rs, jit_arg_reg_order[0]))
|
||||
# define jit_save_argstate(curstate) curstate = _jitl.argpushes;
|
||||
# define jit_restore_argstate(curstate) _jitl.argpushes = curstate;
|
||||
# define jit_finish(sub) (jit_shift_args(), (void)jit_calli((sub)), jit_restore_locals())
|
||||
# define jit_normal_finish(sub) jit_calli((sub))
|
||||
# define jit_finish(sub) (jit_shift_args(), (void)jit_long_calli((sub)), jit_restore_locals())
|
||||
# define jit_normal_finish(sub) jit_long_calli((sub))
|
||||
# define jit_return_pop_insn_len() 0
|
||||
# define jit_finishr(reg) ((jit_reg_is_arg((reg)) ? MOVQrr(reg, JIT_REXTMP) : (void)0), \
|
||||
jit_shift_args(), \
|
||||
|
@ -604,7 +605,13 @@ static const int const jit_arg_reg_order[] = { _EDI, _ESI, _EDX, _ECX };
|
|||
#define jit_bmci_l(label, rs, is) jit_bmci_i(label, rs, is)
|
||||
|
||||
#define jit_jmpi(label) (JMPm( ((uintptr_t) (label)), 0, 0, 0), _jit.x.pc)
|
||||
#define jit_calli(label) (CALLm( ((uintptr_t) (label)), 0, 0, 0), _jit.x.pc)
|
||||
#define jit_long_calli(label) (CALLm( ((uintptr_t) (label)), 0, 0, 0), _jit.x.pc)
|
||||
#define jit_short_calli(label) (CALLmL( ((uintptr_t) (label)), 0, 0, 0), _jit.x.pc)
|
||||
#ifdef JIT_X86_64
|
||||
# define jit_calli(label) (_jitl.long_jumps_default ? jit_long_calli(label) : jit_short_calli(label))
|
||||
#else
|
||||
# define jit_calli(label) jit_long_calli(label)
|
||||
#endif
|
||||
#define jit_callr(reg) (CALLsr(reg))
|
||||
#define jit_jmpr(reg) JMPsr(reg)
|
||||
|
||||
|
|
|
@ -43,6 +43,7 @@ struct jit_local_state {
|
|||
int nextarg_getd; /* The FP args are picked up from FPR1 -> FPR10 */
|
||||
int nbArgs; /* Number of arguments for the prolog */
|
||||
int long_jumps; /* 1 => patch or leave room for long jumps */
|
||||
# define LONG_JUMPS_DEFAULT(jitl) 1
|
||||
};
|
||||
|
||||
#define JIT_SP 1
|
||||
|
|
Loading…
Reference in New Issue
Block a user