streamline thread-local setup in JIT-generated code

Pass a pointer to the thread-local table on entry to JIT-generated
code, instead of having the JIT-generated code call a C function
to get the table. This doesn't seem to improve performance on my
machine, but it generates less code and is probably faster in
some cases.
This commit is contained in:
Matthew Flatt 2011-12-14 04:27:57 -07:00
parent 761a40d483
commit 558b03034a
10 changed files with 81 additions and 47 deletions

View File

@ -2783,7 +2783,7 @@ scheme_do_eval(Scheme_Object *obj, int num_rands, Scheme_Object **rands,
tmpv = obj;
obj = NULL; /* save for space, since tmpv is ignored by the GC */
v = data->start_code(tmpv, num_rands, rands);
v = data->start_code(tmpv, num_rands, rands EXTRA_NATIVE_ARGUMENT);
if (v == SCHEME_TAIL_CALL_WAITING) {
/* [TC-SFS]; see schnapp.inc */

View File

@ -3124,7 +3124,7 @@ static Scheme_Object *_apply_native(Scheme_Object *obj, int num_rands, Scheme_Ob
MZ_CONT_MARK_POS += 2;
old_cont_mark_stack = MZ_CONT_MARK_STACK;
obj = data->start_code(obj, num_rands, rands);
obj = data->start_code(obj, num_rands, rands EXTRA_NATIVE_ARGUMENT);
if (obj == SCHEME_TAIL_CALL_WAITING)
return force_values(obj, 1);

View File

@ -1974,7 +1974,7 @@ void *worker_thread_future_loop(void *arg)
Scheme_Future_Thread_State *fts = params->fts;
Scheme_Future_State *fs = params->fs;
Scheme_Object *v;
Scheme_Closed_Prim *jitcode;
Scheme_Native_Proc *jitcode;
future_t *ft;
mz_jmp_buf newbuf;
int fid;

View File

@ -120,10 +120,6 @@ static Scheme_Object *clear_rs_arguments(Scheme_Object *v, int size, int delta)
return v;
}
#ifdef JIT_THREAD_LOCAL
void *scheme_jit_get_threadlocal_table() XFORM_SKIP_PROC { return &BOTTOM_VARIABLE; }
#endif
#define JIT_TS_PROCS
#define JIT_BOX_TS_PROCS
#include "jit_ts.c"
@ -134,14 +130,20 @@ void *scheme_jit_get_threadlocal_table() XFORM_SKIP_PROC { return &BOTTOM_VARIAB
THREAD_LOCAL_DECL(Scheme_Current_LWC *scheme_current_lwc);
Scheme_Object *scheme_call_as_lightweight_continuation(Scheme_Closed_Prim *code,
Scheme_Object *scheme_call_as_lightweight_continuation(Scheme_Native_Proc *code,
void *data,
int argc,
Scheme_Object **argv)
{
#ifdef JIT_THREAD_LOCAL
# define THDLOC &BOTTOM_VARIABLE
#else
# define THDLOC NULL
#endif
scheme_current_lwc->runstack_start = MZ_RUNSTACK;
scheme_current_lwc->cont_mark_stack_start = MZ_CONT_MARK_STACK;
return sjc.native_starter_code(data, argc, argv, code, (void **)&scheme_current_lwc->stack_start);
return sjc.native_starter_code(data, argc, argv, THDLOC, code, (void **)&scheme_current_lwc->stack_start);
#undef THDLOC
}
void scheme_fill_stack_lwc_end(void) XFORM_SKIP_PROC
@ -2974,6 +2976,8 @@ static void generate_function_prolog(mz_jit_state *jitter, void *code, int max_l
jit_prolog(NATIVE_ARG_COUNT);
mz_push_threadlocal_early();
in = jit_arg_p();
jit_getarg_p(JIT_R0, in); /* closure */
in = jit_arg_i();
@ -2982,7 +2986,7 @@ static void generate_function_prolog(mz_jit_state *jitter, void *code, int max_l
jit_getarg_p(JIT_R2, in); /* argv */
mz_push_locals();
mz_push_threadlocal();
mz_push_threadlocal(in);
mz_tl_ldi_p(JIT_RUNSTACK, tl_MZ_RUNSTACK);
@ -3835,7 +3839,7 @@ int scheme_native_arity_check(Scheme_Object *closure, int argc)
return 1;
}
return sjc.check_arity_code(closure, argc + 1, 0);
return sjc.check_arity_code(closure, argc + 1, 0 EXTRA_NATIVE_ARGUMENT);
}
Scheme_Object *scheme_get_native_arity(Scheme_Object *closure)
@ -3878,7 +3882,7 @@ Scheme_Object *scheme_get_native_arity(Scheme_Object *closure)
return a;
}
return sjc.get_arity_code(closure, 0, 0);
return sjc.get_arity_code(closure, 0, 0 EXTRA_NATIVE_ARGUMENT);
}
/**********************************************************************/

View File

@ -14,9 +14,14 @@
is related to the way the x86_64 port shuffles arguments into
temporary registers.
5) On x86_64, arguments are delivered in JIT_V2, JIT_V3, and JIT_R2,
in that order. So don't set JIT_R2 before getting the third
argument, etc.
5) On non-Win64 x86_64, arguments are delivered in JIT_V2, JIT_V3,
JIT_R2, and JIT_R1 in that order. So don't set JIT_R2 before
getting the third argument, etc.
On Win64 x86_64, arguments are delivered in JIT_R1, JIT_R2,
and other registers. So don't set JIT_R2 before getting the
second argument, etc.
*/
#ifdef __APPLE__
@ -83,7 +88,11 @@ END_XFORM_ARITH;
#define WORDS_TO_BYTES(x) ((x) << JIT_LOG_WORD_SIZE)
#define MAX_TRY_SHIFT 30
#define NATIVE_ARG_COUNT 3
#ifdef USE_THREAD_LOCAL
# define NATIVE_ARG_COUNT 4
#else
# define NATIVE_ARG_COUNT 3
#endif
#define JIT_LOG_DOUBLE_SIZE 3
#define JIT_DOUBLE_SIZE (1 << JIT_LOG_DOUBLE_SIZE)
@ -166,12 +175,13 @@ extern int scheme_jit_malloced;
THREAD_LOCAL_DECL(extern double scheme_jit_save_fp);
#endif
typedef int (*Native_Check_Arity_Proc)(Scheme_Object *o, int argc, int dummy);
typedef Scheme_Object *(*Native_Get_Arity_Proc)(Scheme_Object *o, int dumm1, int dummy2);
typedef int (*Native_Check_Arity_Proc)(Scheme_Object *o, int argc, int dummy EXTRA_NATIVE_ARGUMENT_TYPE);
typedef Scheme_Object *(*Native_Get_Arity_Proc)(Scheme_Object *o, int dumm1, int dummy2 EXTRA_NATIVE_ARGUMENT_TYPE);
typedef Scheme_Object *(*LWC_Native_Starter)(void *data,
int argc,
Scheme_Object **argv,
Scheme_Closed_Prim *chain_to,
void *thdloc,
Scheme_Native_Proc *chain_to,
void **save_pos);
typedef struct Apply_LWC_Args {
@ -374,7 +384,6 @@ typedef struct {
#endif
#ifdef JIT_THREAD_LOCAL
# define BOTTOM_VARIABLE GC_variable_stack
# define tl_delta(id) ((uintptr_t)&(id) - (uintptr_t)&BOTTOM_VARIABLE)
# define tl_MZ_RUNSTACK tl_delta(MZ_RUNSTACK)
# define tl_MZ_RUNSTACK_START tl_delta(MZ_RUNSTACK_START)
@ -787,19 +796,16 @@ void scheme_jit_prolog_again(mz_jit_state *jitter, int n, int ret_addr_reg)
#endif
#ifdef JIT_THREAD_LOCAL
# define mz_get_threadlocal() (mz_prepare(0), (void)mz_finish(scheme_jit_get_threadlocal_table), jit_retval(JIT_R0))
# ifdef JIT_X86_64
# define mz_pop_threadlocal() mz_get_local_p(JIT_R14, JIT_LOCAL4)
# define mz_push_threadlocal() (mz_set_local_p(JIT_R14, JIT_LOCAL4), \
PUSHQr(JIT_R0), PUSHQr(JIT_R1), PUSHQr(JIT_R2), PUSHQr(JIT_R2), \
mz_get_threadlocal(), jit_retval(JIT_R0), jit_movr_p(JIT_R14, JIT_R0), \
POPQr(JIT_R2), POPQr(JIT_R2), POPQr(JIT_R1), POPQr(JIT_R0))
# define mz_push_threadlocal(in) /* empty */
# define mz_push_threadlocal_early() (mz_set_local_p(JIT_R14, JIT_LOCAL4), jit_movr_p(JIT_R14, JIT_R1))
# define mz_repush_threadlocal() mz_set_local_p(JIT_R14, JIT_LOCAL4)
# else
# define mz_pop_threadlocal() /* empty */
# ifdef THREAD_LOCAL_USES_JIT_V2
# define _mz_install_threadlocal(reg) jit_movr_p(JIT_V2, reg)
# define mz_repush_threadlocal() /* empty */
# define mz_repush_threadlocal(in) /* empty */
# else
# define _mz_install_threadlocal(reg) mz_set_local_p(reg, JIT_LOCAL4)
# define mz_repush_threadlocal() (PUSHQr(JIT_R0), jit_ldr_p(JIT_R0, _EBP), \
@ -807,13 +813,13 @@ void scheme_jit_prolog_again(mz_jit_state *jitter, int n, int ret_addr_reg)
jit_stxi_p(JIT_LOCAL4, _EBP, JIT_R0), \
POPQr(JIT_R0))
# endif
# define mz_push_threadlocal() (PUSHQr(JIT_R0), PUSHQr(JIT_R1), PUSHQr(JIT_R2), PUSHQr(JIT_R2), \
mz_get_threadlocal(), jit_retval(JIT_R0), _mz_install_threadlocal(JIT_R0), \
POPQr(JIT_R2), POPQr(JIT_R2), POPQr(JIT_R1), POPQr(JIT_R0))
# define mz_push_threadlocal(in) (in = jit_arg_p(), jit_getarg_p(JIT_V2, in), _mz_install_threadlocal(JIT_V2))
# define mz_push_threadlocal_early() /* empty */
# endif
#else
# define mz_pop_threadlocal() /* empty */
# define mz_push_threadlocal() /* empty */
# define mz_push_threadlocal(in) /* empty */
# define mz_push_threadlocal_early() /* empty */
# define mz_repush_threadlocal() /* empty */
#endif

View File

@ -678,7 +678,7 @@ int scheme_generate_non_tail_call(mz_jit_state *jitter, int num_rands, int direc
jit_base_prolog();
#else
refr = jit_patchable_movi_p(JIT_R1, jit_forward());
_jit_prolog_again(jitter, 3, JIT_R1); /* saves V registers (or copied V registers) */
_jit_prolog_again(jitter, NATIVE_ARG_COUNT, JIT_R1); /* saves V registers (or copied V registers) */
#endif
if (num_rands >= 0) {
if (nontail_self) { jit_movr_p(JIT_R1, JIT_R0); }

View File

@ -184,12 +184,15 @@ static int common0(mz_jit_state *jitter, void *_data)
/* Called as a function: */
sjc.check_arity_code = (Native_Check_Arity_Proc)jit_get_ip().ptr;
jit_prolog(NATIVE_ARG_COUNT); /* only need 2 arguments, but return path overlaps with proc conventions */
mz_push_threadlocal_early();
in = jit_arg_p();
jit_getarg_p(JIT_R0, in); /* closure */
in = jit_arg_p();
jit_getarg_i(JIT_R2, in); /* argc */
in = jit_arg_p();
jit_getarg_i(JIT_R1, in); /* ignored */
mz_push_locals();
mz_push_threadlocal();
mz_push_threadlocal(in);
jit_movi_i(JIT_R1, -1);
jit_ldxi_p(JIT_V1, JIT_R0, &((Scheme_Native_Closure *)0x0)->code);
jit_ldxi_p(JIT_V1, JIT_V1, &((Scheme_Native_Closure_Data *)0x0)->arity_code);
@ -200,10 +203,15 @@ static int common0(mz_jit_state *jitter, void *_data)
/* Called as a function: */
sjc.get_arity_code = (Native_Get_Arity_Proc)jit_get_ip().ptr;
jit_prolog(NATIVE_ARG_COUNT); /* only need 1 argument, but return path overlaps with proc conventions */
mz_push_threadlocal_early();
in = jit_arg_p();
jit_getarg_p(JIT_R0, in); /* closure */
in = jit_arg_p();
jit_getarg_p(JIT_R1, in); /* ignored */
in = jit_arg_p();
jit_getarg_i(JIT_R1, in); /* ignored */
mz_push_locals();
mz_push_threadlocal();
mz_push_threadlocal(in);
jit_movi_i(JIT_R1, -1);
(void)jit_movi_p(JIT_R2, 0x0);
jit_ldxi_p(JIT_V1, JIT_R0, &((Scheme_Native_Closure *)0x0)->code);
@ -658,6 +666,7 @@ static int common2(mz_jit_state *jitter, void *_data)
for the state of registers on entry */
scheme_on_demand_jit_code = jit_get_ip().ptr;
jit_prolog(NATIVE_ARG_COUNT);
mz_push_threadlocal_early();
in = jit_arg_p();
jit_getarg_p(JIT_R0, in); /* closure */
in = jit_arg_i();
@ -666,7 +675,7 @@ static int common2(mz_jit_state *jitter, void *_data)
jit_getarg_p(JIT_R2, in); /* argv */
CHECK_LIMIT();
mz_push_locals();
mz_push_threadlocal();
mz_push_threadlocal(in);
mz_tl_ldi_p(JIT_RUNSTACK, tl_MZ_RUNSTACK);
sjc.on_demand_jit_arity_code = jit_get_ip().ptr; /* <<<- arity variant starts here */
jit_subi_p(JIT_RUNSTACK, JIT_RUNSTACK, WORDS_TO_BYTES(2));
@ -3198,9 +3207,9 @@ static int more_common1(mz_jit_state *jitter, void *_data)
/* store stack pointer in address given by 6th argument, then jump to
the address given by the 5th argument */
jit_getprearg_pipp_p(JIT_PREARG);
jit_getprearg_pippp_p(JIT_PREARG);
jit_str_p(JIT_PREARG, JIT_SP);
jit_getprearg_pip_p(JIT_PREARG);
jit_getprearg_pipp_p(JIT_PREARG);
jit_jmpr(JIT_PREARG);
CHECK_LIMIT();

View File

@ -454,15 +454,15 @@ static jit_state _jit;
#define jit_getarg_ul(reg, ofs) jit_extr_uc_ul((reg), jit_arg_reg(ofs))
#define jit_getarg_us(reg, ofs) jit_extr_us_ul((reg), jit_arg_reg(ofs))
#else
#define jit_getarg_c(reg, ofs) jit_ldxi_c((reg), JIT_FP, (ofs));
#define jit_getarg_uc(reg, ofs) jit_ldxi_uc((reg), JIT_FP, (ofs));
#define jit_getarg_s(reg, ofs) jit_ldxi_s((reg), JIT_FP, (ofs));
#define jit_getarg_us(reg, ofs) jit_ldxi_us((reg), JIT_FP, (ofs));
#define jit_getarg_i(reg, ofs) jit_ldxi_i((reg), JIT_FP, (ofs));
#define jit_getarg_ui(reg, ofs) jit_ldxi_ui((reg), JIT_FP, (ofs));
#define jit_getarg_l(reg, ofs) jit_ldxi_l((reg), JIT_FP, (ofs));
#define jit_getarg_ul(reg, ofs) jit_ldxi_ul((reg), JIT_FP, (ofs));
#define jit_getarg_p(reg, ofs) jit_ldxi_p((reg), JIT_FP, (ofs));
#define jit_getarg_c(reg, ofs) jit_ldxi_c((reg), JIT_FP, (ofs))
#define jit_getarg_uc(reg, ofs) jit_ldxi_uc((reg), JIT_FP, (ofs))
#define jit_getarg_s(reg, ofs) jit_ldxi_s((reg), JIT_FP, (ofs))
#define jit_getarg_us(reg, ofs) jit_ldxi_us((reg), JIT_FP, (ofs))
#define jit_getarg_i(reg, ofs) jit_ldxi_i((reg), JIT_FP, (ofs))
#define jit_getarg_ui(reg, ofs) jit_ldxi_ui((reg), JIT_FP, (ofs))
#define jit_getarg_l(reg, ofs) jit_ldxi_l((reg), JIT_FP, (ofs))
#define jit_getarg_ul(reg, ofs) jit_ldxi_ul((reg), JIT_FP, (ofs))
#define jit_getarg_p(reg, ofs) jit_ldxi_p((reg), JIT_FP, (ofs))
#endif
#endif

View File

@ -367,15 +367,18 @@ struct jit_local_state {
# define jit_getprearg__p(r) (MOVQrr(_ECX, r))
# define jit_getprearg_pip_p(r) (MOVQrr(JIT_R(9), r))
# define jit_getprearg_pipp_p(r) (jit_ldxi_p(r, JIT_SP, 40))
# define jit_getprearg_pippp_p(r) (jit_ldxi_p(r, JIT_SP, 48))
# else
# define jit_getprearg__p(r) (MOVQrr(_EDI, r))
# define jit_getprearg_pip_p(r) (MOVQrr(_ECX, r))
# define jit_getprearg_pipp_p(r) (MOVQrr(JIT_R(8), r))
# define jit_getprearg_pippp_p(r) (MOVQrr(JIT_R(9), r))
# endif
#else
# define jit_getprearg__p(r) (jit_ldxi_p(r, JIT_SP, 4))
# define jit_getprearg_pip_p(r) (jit_ldxi_p(r, JIT_SP, 16))
# define jit_getprearg_pipp_p(r) (jit_ldxi_p(r, JIT_SP, 20))
# define jit_getprearg_pippp_p(r) (jit_ldxi_p(r, JIT_SP, 24))
#endif
#ifdef JIT_X86_64

View File

@ -1337,6 +1337,18 @@ void scheme_clean_cust_box_list(void);
void scheme_notify_code_gc(void);
#endif
#ifdef USE_THREAD_LOCAL
# define BOTTOM_VARIABLE GC_variable_stack
# define EXTRA_NATIVE_ARGUMENT , &BOTTOM_VARIABLE
# define EXTRA_NATIVE_ARGUMENT_TYPE , void* thdloc
#else
# define EXTRA_NATIVE_ARGUMENT /* empty */
# define EXTRA_NATIVE_ARGUMENT_TYPE /* empty */
#endif
typedef struct Scheme_Object *(Scheme_Native_Proc)(void *d, int argc, struct Scheme_Object *argv[]
EXTRA_NATIVE_ARGUMENT_TYPE);
/*========================================================================*/
/* control flow */
/*========================================================================*/
@ -2321,7 +2333,7 @@ typedef struct {
typedef struct Scheme_Native_Closure_Data {
Scheme_Inclhash_Object iso; /* type tag only set when needed, but
flags always needed */
Scheme_Closed_Prim *start_code; /* When not yet JITted, this is = to
Scheme_Native_Proc *start_code; /* When not yet JITted, this is = to
scheme_on_demand_jit_code */
union {
void *tail_code; /* For non-case-lambda */
@ -2379,7 +2391,7 @@ void scheme_clear_lwc(void);
THREAD_LOCAL_DECL(MZ_EXTERN Scheme_Current_LWC *scheme_current_lwc);
Scheme_Object *scheme_call_as_lightweight_continuation(Scheme_Closed_Prim *code,
Scheme_Object *scheme_call_as_lightweight_continuation(Scheme_Native_Proc *code,
void *data,
int argc,
Scheme_Object **argv);