JIT improvement: more direct path through loop wrappers

This commit is contained in:
Matthew Flatt 2011-04-23 21:37:56 -06:00
parent 2f8006aa6b
commit 6078013e32
4 changed files with 90 additions and 27 deletions

View File

@ -2949,7 +2949,7 @@ static int generate_function_getarg(mz_jit_state *jitter, int has_rest, int num_
typedef struct { typedef struct {
Scheme_Closure_Data *data; Scheme_Closure_Data *data;
void *arity_code, *code, *tail_code, *code_end, **patch_depth; void *arity_code, *code, *tail_code, *code_end, **patch_depth;
int max_extra, max_depth; int max_extra, max_depth, max_tail_depth;
Scheme_Native_Closure *nc; Scheme_Native_Closure *nc;
int argc; int argc;
Scheme_Object **argv; Scheme_Object **argv;
@ -3281,6 +3281,7 @@ static int do_generate_closure(mz_jit_state *jitter, void *_data)
gdata->tail_code = tail_code; gdata->tail_code = tail_code;
gdata->max_extra = jitter->max_extra_pushed; gdata->max_extra = jitter->max_extra_pushed;
gdata->max_depth = jitter->max_depth; gdata->max_depth = jitter->max_depth;
gdata->max_tail_depth = jitter->max_tail_depth;
gdata->code_end = code_end; gdata->code_end = code_end;
gdata->patch_depth = jitter->patch_depth; gdata->patch_depth = jitter->patch_depth;
} }
@ -3307,9 +3308,11 @@ static void on_demand_generate_lambda(Scheme_Native_Closure *nc, int argc, Schem
scheme_delay_load_closure(data); scheme_delay_load_closure(data);
/* So, check again whether we still need to generate: */ /* So, check again whether we still need to generate: */
if (nc->code->code != scheme_on_demand_jit_code) if (ndata->code != scheme_on_demand_jit_code)
return; return;
ndata->arity_code = NULL; /* => in progress */
scheme_generate_one(NULL, do_generate_closure, &gdata, 1, data->name, ndata); scheme_generate_one(NULL, do_generate_closure, &gdata, 1, data->name, ndata);
if (gdata.max_depth > data->max_let_depth) { if (gdata.max_depth > data->max_let_depth) {
@ -3337,6 +3340,8 @@ static void on_demand_generate_lambda(Scheme_Native_Closure *nc, int argc, Schem
/* Add a couple of extra slots to computed let-depth, in case /* Add a couple of extra slots to computed let-depth, in case
we haven't quite computed right for inlined uses, etc. */ we haven't quite computed right for inlined uses, etc. */
max_depth = WORDS_TO_BYTES(data->max_let_depth + gdata.max_extra + 2); max_depth = WORDS_TO_BYTES(data->max_let_depth + gdata.max_extra + 2);
if (gdata.max_tail_depth > max_depth)
max_depth = gdata.max_tail_depth;
/* max_let_depth is used for flags by generate_lambda: */ /* max_let_depth is used for flags by generate_lambda: */
if (ndata->max_let_depth & 0x1) { if (ndata->max_let_depth & 0x1) {

View File

@ -274,7 +274,7 @@ typedef struct {
char *limit; char *limit;
int extra_pushed, max_extra_pushed; int extra_pushed, max_extra_pushed;
int depth; /* the position of the closure's first value on the stack */ int depth; /* the position of the closure's first value on the stack */
int max_depth; int max_depth, max_tail_depth;
int *mappings; /* For each element, int *mappings; /* For each element,
case 0x1 bit: case 0x1 bit:
. 0 -> case 0x2 bit: . 0 -> case 0x2 bit:
@ -1028,7 +1028,7 @@ static int past_limit(mz_jit_state *jitter)
{ {
if (((uintptr_t)jit_get_ip().ptr > (uintptr_t)jitter->limit + JIT_BUFFER_PAD_SIZE) if (((uintptr_t)jit_get_ip().ptr > (uintptr_t)jitter->limit + JIT_BUFFER_PAD_SIZE)
|| (jitter->retain_start)) { || (jitter->retain_start)) {
printf("way past\n"); printf("way past\n"); abort();
} }
return 0; return 0;
} }
@ -1136,7 +1136,8 @@ void *scheme_generate_shared_call(int num_rands, mz_jit_state *old_jitter, int m
void scheme_ensure_retry_available(mz_jit_state *jitter, int multi_ok); void scheme_ensure_retry_available(mz_jit_state *jitter, int multi_ok);
int scheme_generate_app(Scheme_App_Rec *app, Scheme_Object **alt_rands, int num_rands, int scheme_generate_app(Scheme_App_Rec *app, Scheme_Object **alt_rands, int num_rands,
mz_jit_state *jitter, int is_tail, int multi_ok, int no_call); mz_jit_state *jitter, int is_tail, int multi_ok, int no_call);
int scheme_generate_tail_call(mz_jit_state *jitter, int num_rands, int direct_native, int need_set_rs, int is_inline); int scheme_generate_tail_call(mz_jit_state *jitter, int num_rands, int direct_native, int need_set_rs,
int is_inline, void *direct_to_code);
int scheme_generate_non_tail_call(mz_jit_state *jitter, int num_rands, int direct_native, int need_set_rs, int scheme_generate_non_tail_call(mz_jit_state *jitter, int num_rands, int direct_native, int need_set_rs,
int multi_ok, int nontail_self, int pop_and_jump, int is_inlined); int multi_ok, int nontail_self, int pop_and_jump, int is_inlined);
int scheme_generate_finish_tail_call(mz_jit_state *jitter, int direct_native); int scheme_generate_finish_tail_call(mz_jit_state *jitter, int direct_native);

View File

@ -248,7 +248,8 @@ static int generate_direct_prim_tail_call(mz_jit_state *jitter, int num_rands)
return 1; return 1;
} }
int scheme_generate_tail_call(mz_jit_state *jitter, int num_rands, int direct_native, int need_set_rs, int is_inline) int scheme_generate_tail_call(mz_jit_state *jitter, int num_rands, int direct_native, int need_set_rs,
int is_inline, void *direct_to_code)
/* Proc is in V1, args are at RUNSTACK. /* Proc is in V1, args are at RUNSTACK.
If num_rands < 0, then argc is in LOCAL2 and arguments are already below RUNSTACK_BASE. If num_rands < 0, then argc is in LOCAL2 and arguments are already below RUNSTACK_BASE.
If direct_native == 2, then some arguments are already in place (shallower in the runstack If direct_native == 2, then some arguments are already in place (shallower in the runstack
@ -272,23 +273,29 @@ int scheme_generate_tail_call(mz_jit_state *jitter, int num_rands, int direct_na
refagain = _jit.x.pc; refagain = _jit.x.pc;
/* Right kind of function. Extract data and check stack depth: */ /* Right kind of function. Extract data and check stack depth: */
jit_ldxi_p(JIT_R0, JIT_V1, &((Scheme_Native_Closure *)0x0)->code); if (!direct_to_code) {
jit_ldxi_i(JIT_R2, JIT_R0, &((Scheme_Native_Closure_Data *)0x0)->max_let_depth); jit_ldxi_p(JIT_R0, JIT_V1, &((Scheme_Native_Closure *)0x0)->code);
mz_tl_ldi_p(JIT_R1, tl_MZ_RUNSTACK_START); jit_ldxi_i(JIT_R2, JIT_R0, &((Scheme_Native_Closure_Data *)0x0)->max_let_depth);
jit_subr_ul(JIT_R1, JIT_RUNSTACK, JIT_R1); mz_tl_ldi_p(JIT_R1, tl_MZ_RUNSTACK_START);
ref4 = jit_bltr_ul(jit_forward(), JIT_R1, JIT_R2); jit_subr_ul(JIT_R1, JIT_RUNSTACK, JIT_R1);
CHECK_LIMIT(); ref4 = jit_bltr_ul(jit_forward(), JIT_R1, JIT_R2);
CHECK_LIMIT();
} else
ref4 = NULL;
/* Fast jump ok (proc will check argc). /* Fast jump ok (proc will check argc).
At this point, V1 = closure and R0 = code. */ At this point, V1 = closure and R0 = code. */
/* Check for thread swap: */ /* Check for thread swap: */
(void)mz_tl_ldi_i(JIT_R2, tl_scheme_fuel_counter); if (!direct_to_code) {
ref5 = jit_blei_i(jit_forward(), JIT_R2, 0); (void)mz_tl_ldi_i(JIT_R2, tl_scheme_fuel_counter);
ref5 = jit_blei_i(jit_forward(), JIT_R2, 0);
#ifndef FUEL_AUTODECEREMENTS #ifndef FUEL_AUTODECEREMENTS
jit_subi_p(JIT_R2, JIT_R2, 0x1); jit_subi_p(JIT_R2, JIT_R2, 0x1);
(void)mz_tl_sti_i(tl_scheme_fuel_counter, JIT_R2, JIT_R1); (void)mz_tl_sti_i(tl_scheme_fuel_counter, JIT_R2, JIT_R1);
#endif #endif
} else
ref5 = NULL;
CHECK_LIMIT(); CHECK_LIMIT();
/* Copy args to runstack base: */ /* Copy args to runstack base: */
@ -324,14 +331,18 @@ int scheme_generate_tail_call(mz_jit_state *jitter, int num_rands, int direct_na
/* RUNSTACK, RUNSTACK_BASE, V1, and R0 are ready */ /* RUNSTACK, RUNSTACK_BASE, V1, and R0 are ready */
/* Extract function and data: */ /* Extract function and data: */
jit_movr_p(JIT_R2, JIT_V1); if (!direct_to_code) {
if (direct_native) { jit_movr_p(JIT_R2, JIT_V1);
jit_ldxi_p(JIT_V1, JIT_R0, &((Scheme_Native_Closure_Data *)0x0)->u.tail_code); if (direct_native) {
jit_ldxi_p(JIT_V1, JIT_R0, &((Scheme_Native_Closure_Data *)0x0)->u.tail_code);
} else {
jit_ldxi_p(JIT_V1, JIT_R0, &((Scheme_Native_Closure_Data *)0x0)->arity_code);
}
jit_movr_p(JIT_R0, JIT_R2);
} else { } else {
jit_ldxi_p(JIT_V1, JIT_R0, &((Scheme_Native_Closure_Data *)0x0)->arity_code); jit_movr_p(JIT_R0, JIT_V1);
} }
/* Set up arguments; JIT_RUNSTACK and JIT_RUNSTACK_BASE must also be ready */ /* Set up arguments; JIT_RUNSTACK and JIT_RUNSTACK_BASE must also be ready */
jit_movr_p(JIT_R0, JIT_R2);
if (num_rands >= 0) { if (num_rands >= 0) {
jit_movi_i(JIT_R1, num_rands); jit_movi_i(JIT_R1, num_rands);
if (direct_native > 1) { /* => some_args_already_in_place */ if (direct_native > 1) { /* => some_args_already_in_place */
@ -342,10 +353,16 @@ int scheme_generate_tail_call(mz_jit_state *jitter, int num_rands, int direct_na
mz_get_local_p(JIT_R1, JIT_LOCAL2); mz_get_local_p(JIT_R1, JIT_LOCAL2);
} }
jit_movr_p(JIT_R2, JIT_RUNSTACK); jit_movr_p(JIT_R2, JIT_RUNSTACK);
if (need_set_rs) { if (need_set_rs && !direct_to_code) {
/* In case arity check fails, need to update runstack now: */ /* In case arity check fails, need to update runstack now: */
JIT_UPDATE_THREAD_RSPTR(); JIT_UPDATE_THREAD_RSPTR();
} }
if (direct_native && direct_to_code) {
__END_SHORT_JUMPS__(num_rands < 100);
(void)jit_jmpi(direct_to_code);
/* no slow path in this mode */
return 1;
}
/* Now jump: */ /* Now jump: */
jit_jmpr(JIT_V1); jit_jmpr(JIT_V1);
CHECK_LIMIT(); CHECK_LIMIT();
@ -359,7 +376,8 @@ int scheme_generate_tail_call(mz_jit_state *jitter, int num_rands, int direct_na
/* The slow way: */ /* The slow way: */
/* V1 and RUNSTACK must be intact! */ /* V1 and RUNSTACK must be intact! */
mz_patch_branch(ref5); if (ref5)
mz_patch_branch(ref5);
generate_pause_for_gc_and_retry(jitter, generate_pause_for_gc_and_retry(jitter,
num_rands < 100, /* in short jumps */ num_rands < 100, /* in short jumps */
JIT_V1, /* expose V1 to GC */ JIT_V1, /* expose V1 to GC */
@ -369,7 +387,8 @@ int scheme_generate_tail_call(mz_jit_state *jitter, int num_rands, int direct_na
mz_patch_branch(ref); mz_patch_branch(ref);
mz_patch_branch(ref2); mz_patch_branch(ref2);
} }
mz_patch_branch(ref4); if (ref4)
mz_patch_branch(ref4);
CHECK_LIMIT(); CHECK_LIMIT();
if (need_set_rs) { if (need_set_rs) {
JIT_UPDATE_THREAD_RSPTR(); JIT_UPDATE_THREAD_RSPTR();
@ -1093,7 +1112,7 @@ static int do_generate_shared_call(mz_jit_state *jitter, void *_data)
if (data->direct_prim) if (data->direct_prim)
ok = generate_direct_prim_tail_call(jitter, data->num_rands); ok = generate_direct_prim_tail_call(jitter, data->num_rands);
else else
ok = scheme_generate_tail_call(jitter, data->num_rands, data->direct_native, 1, 0); ok = scheme_generate_tail_call(jitter, data->num_rands, data->direct_native, 1, 0, NULL);
scheme_jit_register_helper_func(jitter, code); scheme_jit_register_helper_func(jitter, code);
@ -1200,6 +1219,7 @@ int scheme_generate_app(Scheme_App_Rec *app, Scheme_Object **alt_rands, int num_
{ {
int i, offset, need_safety = 0, apply_to_list = 0; int i, offset, need_safety = 0, apply_to_list = 0;
int direct_prim = 0, need_non_tail = 0, direct_native = 0, direct_self = 0, nontail_self = 0; int direct_prim = 0, need_non_tail = 0, direct_native = 0, direct_self = 0, nontail_self = 0;
void *inline_direct_native = NULL;
int proc_already_in_place = 0; int proc_already_in_place = 0;
Scheme_Object *rator, *v, *arg; Scheme_Object *rator, *v, *arg;
int reorder_ok = 0; int reorder_ok = 0;
@ -1290,6 +1310,28 @@ int scheme_generate_app(Scheme_App_Rec *app, Scheme_Object **alt_rands, int num_
direct_self = 1; direct_self = 1;
else if (jitter->self_nontail_code) else if (jitter->self_nontail_code)
nontail_self = 1; nontail_self = 1;
} else if (is_tail) {
Scheme_Closure *c = (Scheme_Closure *)rator;
if (ZERO_SIZED_CLOSUREP(c)) {
/* If we're calling a constant function in tail position, then
there's a good chance that this function is a wrapper to
get to a loop. Inline the jump to the potential loop,
absorbing the runstack and C stack checks into the check
for this function --- only works if we can JIT the target
of the call. */
Scheme_Native_Closure *nc;
nc = (Scheme_Native_Closure *)scheme_jit_closure((Scheme_Object *)data, NULL);
if (nc->code->code == scheme_on_demand_jit_code) {
if (nc->code->arity_code) { /* i.e., not in progress */
scheme_on_demand_generate_lambda(nc, 0, NULL);
}
}
if (nc->code->code != scheme_on_demand_jit_code) {
if (nc->code->max_let_depth > jitter->max_tail_depth)
jitter->max_tail_depth = nc->code->max_let_depth;
inline_direct_native = nc->code->u.tail_code;
}
}
} }
} }
reorder_ok = 1; reorder_ok = 1;
@ -1443,13 +1485,17 @@ int scheme_generate_app(Scheme_App_Rec *app, Scheme_Object **alt_rands, int num_
scheme_generate_non_tail(arg, jitter, 0, !need_non_tail, 0); /* sync'd below */ scheme_generate_non_tail(arg, jitter, 0, !need_non_tail, 0); /* sync'd below */
RESUME_JIT_DATA(); RESUME_JIT_DATA();
CHECK_LIMIT(); CHECK_LIMIT();
if ((i == num_rands - 1) && !direct_prim && !reorder_ok && !direct_self && !proc_already_in_place) { if ((i == num_rands - 1) && !direct_prim && !reorder_ok && !direct_self && !proc_already_in_place) {
/* Move rator back to register: */ /* Move rator back to register: */
mz_rs_ldxi(JIT_V1, i + offset); mz_rs_ldxi(JIT_V1, i + offset);
} }
if ((!direct_prim || (num_rands > 1) || (no_call == 2)) if ((!direct_prim || (num_rands > 1) || (no_call == 2))
&& (!direct_self || !is_tail || no_call || (i + 1 < num_rands))) { && (!direct_self || !is_tail || no_call || (i + 1 < num_rands))) {
int r0;
r0 = (mz_CURRENT_R0_STATUS_VALID() ? mz_CURRENT_R0_STATUS() : -1);
mz_rs_stxi(i + offset, JIT_R0); mz_rs_stxi(i + offset, JIT_R0);
if (r0 > -1) mz_RECORD_R0_STATUS(r0);
} }
} }
/* not sync'd... */ /* not sync'd... */
@ -1498,6 +1544,7 @@ int scheme_generate_app(Scheme_App_Rec *app, Scheme_Object **alt_rands, int num_
if (no_call) { if (no_call) {
/* leave actual call to inlining code */ /* leave actual call to inlining code */
} else if (!(direct_self && is_tail) } else if (!(direct_self && is_tail)
&& !inline_direct_native
&& (num_rands >= MAX_SHARED_CALL_RANDS)) { && (num_rands >= MAX_SHARED_CALL_RANDS)) {
LOG_IT(("<-many args\n")); LOG_IT(("<-many args\n"));
if (is_tail) { if (is_tail) {
@ -1509,7 +1556,7 @@ int scheme_generate_app(Scheme_App_Rec *app, Scheme_Object **alt_rands, int num_
jit_movi_l(JIT_R2, args_already_in_place); jit_movi_l(JIT_R2, args_already_in_place);
mz_set_local_p(JIT_R2, JIT_LOCAL2); mz_set_local_p(JIT_R2, JIT_LOCAL2);
} }
scheme_generate_tail_call(jitter, num_rands, direct_native, jitter->need_set_rs, 1); scheme_generate_tail_call(jitter, num_rands, direct_native, jitter->need_set_rs, 1, NULL);
} }
} else { } else {
if (direct_prim) if (direct_prim)
@ -1536,6 +1583,16 @@ int scheme_generate_app(Scheme_App_Rec *app, Scheme_Object **alt_rands, int num_
LOG_IT(("<-self\n")); LOG_IT(("<-self\n"));
generate_self_tail_call(rator, jitter, num_rands, code, args_already_in_place, app, alt_rands); generate_self_tail_call(rator, jitter, num_rands, code, args_already_in_place, app, alt_rands);
CHECK_LIMIT(); CHECK_LIMIT();
} else if (inline_direct_native) {
LOG_IT(("<-native-tail\n"));
scheme_mz_flostack_restore(jitter, 0, 0, 1, 1);
if (args_already_in_place) {
jit_movi_l(JIT_R2, args_already_in_place);
mz_set_local_p(JIT_R2, JIT_LOCAL2);
}
scheme_generate_tail_call(jitter, num_rands, direct_native, jitter->need_set_rs, 1,
inline_direct_native);
CHECK_LIMIT();
} else { } else {
scheme_mz_flostack_restore(jitter, 0, 0, 1, 1); scheme_mz_flostack_restore(jitter, 0, 0, 1, 1);
LOG_IT(("<-tail\n")); LOG_IT(("<-tail\n"));

View File

@ -2464,7 +2464,7 @@ static int more_common1(mz_jit_state *jitter, void *_data)
__END_SHORT_JUMPS__(1); __END_SHORT_JUMPS__(1);
scheme_generate_tail_call(jitter, -1, 0, 1, 0); scheme_generate_tail_call(jitter, -1, 0, 1, 0, NULL);
CHECK_LIMIT(); CHECK_LIMIT();
} }