From 6078013e3228fbbd5020361a8bc45c84d410699d Mon Sep 17 00:00:00 2001 From: Matthew Flatt Date: Sat, 23 Apr 2011 21:37:56 -0600 Subject: [PATCH] JIT improvement: more direct path through loop wrappers --- src/racket/src/jit.c | 9 +++- src/racket/src/jit.h | 7 +-- src/racket/src/jitcall.c | 99 ++++++++++++++++++++++++++++++-------- src/racket/src/jitcommon.c | 2 +- 4 files changed, 90 insertions(+), 27 deletions(-) diff --git a/src/racket/src/jit.c b/src/racket/src/jit.c index 6e4cc4cc38..6704357bd9 100644 --- a/src/racket/src/jit.c +++ b/src/racket/src/jit.c @@ -2949,7 +2949,7 @@ static int generate_function_getarg(mz_jit_state *jitter, int has_rest, int num_ typedef struct { Scheme_Closure_Data *data; void *arity_code, *code, *tail_code, *code_end, **patch_depth; - int max_extra, max_depth; + int max_extra, max_depth, max_tail_depth; Scheme_Native_Closure *nc; int argc; Scheme_Object **argv; @@ -3281,6 +3281,7 @@ static int do_generate_closure(mz_jit_state *jitter, void *_data) gdata->tail_code = tail_code; gdata->max_extra = jitter->max_extra_pushed; gdata->max_depth = jitter->max_depth; + gdata->max_tail_depth = jitter->max_tail_depth; gdata->code_end = code_end; gdata->patch_depth = jitter->patch_depth; } @@ -3307,9 +3308,11 @@ static void on_demand_generate_lambda(Scheme_Native_Closure *nc, int argc, Schem scheme_delay_load_closure(data); /* So, check again whether we still need to generate: */ - if (nc->code->code != scheme_on_demand_jit_code) + if (ndata->code != scheme_on_demand_jit_code) return; + ndata->arity_code = NULL; /* => in progress */ + scheme_generate_one(NULL, do_generate_closure, &gdata, 1, data->name, ndata); if (gdata.max_depth > data->max_let_depth) { @@ -3337,6 +3340,8 @@ static void on_demand_generate_lambda(Scheme_Native_Closure *nc, int argc, Schem /* Add a couple of extra slots to computed let-depth, in case we haven't quite computed right for inlined uses, etc. */ max_depth = WORDS_TO_BYTES(data->max_let_depth + gdata.max_extra + 2); + if (gdata.max_tail_depth > max_depth) + max_depth = gdata.max_tail_depth; /* max_let_depth is used for flags by generate_lambda: */ if (ndata->max_let_depth & 0x1) { diff --git a/src/racket/src/jit.h b/src/racket/src/jit.h index 2d2ee66125..3d5663b602 100644 --- a/src/racket/src/jit.h +++ b/src/racket/src/jit.h @@ -274,7 +274,7 @@ typedef struct { char *limit; int extra_pushed, max_extra_pushed; int depth; /* the position of the closure's first value on the stack */ - int max_depth; + int max_depth, max_tail_depth; int *mappings; /* For each element, case 0x1 bit: . 0 -> case 0x2 bit: @@ -1028,7 +1028,7 @@ static int past_limit(mz_jit_state *jitter) { if (((uintptr_t)jit_get_ip().ptr > (uintptr_t)jitter->limit + JIT_BUFFER_PAD_SIZE) || (jitter->retain_start)) { - printf("way past\n"); + printf("way past\n"); abort(); } return 0; } @@ -1136,7 +1136,8 @@ void *scheme_generate_shared_call(int num_rands, mz_jit_state *old_jitter, int m void scheme_ensure_retry_available(mz_jit_state *jitter, int multi_ok); int scheme_generate_app(Scheme_App_Rec *app, Scheme_Object **alt_rands, int num_rands, mz_jit_state *jitter, int is_tail, int multi_ok, int no_call); -int scheme_generate_tail_call(mz_jit_state *jitter, int num_rands, int direct_native, int need_set_rs, int is_inline); +int scheme_generate_tail_call(mz_jit_state *jitter, int num_rands, int direct_native, int need_set_rs, + int is_inline, void *direct_to_code); int scheme_generate_non_tail_call(mz_jit_state *jitter, int num_rands, int direct_native, int need_set_rs, int multi_ok, int nontail_self, int pop_and_jump, int is_inlined); int scheme_generate_finish_tail_call(mz_jit_state *jitter, int direct_native); diff --git a/src/racket/src/jitcall.c b/src/racket/src/jitcall.c index fbe44aa48d..663af0dbcd 100644 --- a/src/racket/src/jitcall.c +++ b/src/racket/src/jitcall.c @@ -248,7 +248,8 @@ static int generate_direct_prim_tail_call(mz_jit_state *jitter, int num_rands) return 1; } -int scheme_generate_tail_call(mz_jit_state *jitter, int num_rands, int direct_native, int need_set_rs, int is_inline) +int scheme_generate_tail_call(mz_jit_state *jitter, int num_rands, int direct_native, int need_set_rs, + int is_inline, void *direct_to_code) /* Proc is in V1, args are at RUNSTACK. If num_rands < 0, then argc is in LOCAL2 and arguments are already below RUNSTACK_BASE. If direct_native == 2, then some arguments are already in place (shallower in the runstack @@ -272,23 +273,29 @@ int scheme_generate_tail_call(mz_jit_state *jitter, int num_rands, int direct_na refagain = _jit.x.pc; /* Right kind of function. Extract data and check stack depth: */ - jit_ldxi_p(JIT_R0, JIT_V1, &((Scheme_Native_Closure *)0x0)->code); - jit_ldxi_i(JIT_R2, JIT_R0, &((Scheme_Native_Closure_Data *)0x0)->max_let_depth); - mz_tl_ldi_p(JIT_R1, tl_MZ_RUNSTACK_START); - jit_subr_ul(JIT_R1, JIT_RUNSTACK, JIT_R1); - ref4 = jit_bltr_ul(jit_forward(), JIT_R1, JIT_R2); - CHECK_LIMIT(); + if (!direct_to_code) { + jit_ldxi_p(JIT_R0, JIT_V1, &((Scheme_Native_Closure *)0x0)->code); + jit_ldxi_i(JIT_R2, JIT_R0, &((Scheme_Native_Closure_Data *)0x0)->max_let_depth); + mz_tl_ldi_p(JIT_R1, tl_MZ_RUNSTACK_START); + jit_subr_ul(JIT_R1, JIT_RUNSTACK, JIT_R1); + ref4 = jit_bltr_ul(jit_forward(), JIT_R1, JIT_R2); + CHECK_LIMIT(); + } else + ref4 = NULL; /* Fast jump ok (proc will check argc). At this point, V1 = closure and R0 = code. */ /* Check for thread swap: */ - (void)mz_tl_ldi_i(JIT_R2, tl_scheme_fuel_counter); - ref5 = jit_blei_i(jit_forward(), JIT_R2, 0); + if (!direct_to_code) { + (void)mz_tl_ldi_i(JIT_R2, tl_scheme_fuel_counter); + ref5 = jit_blei_i(jit_forward(), JIT_R2, 0); #ifndef FUEL_AUTODECEREMENTS - jit_subi_p(JIT_R2, JIT_R2, 0x1); - (void)mz_tl_sti_i(tl_scheme_fuel_counter, JIT_R2, JIT_R1); + jit_subi_p(JIT_R2, JIT_R2, 0x1); + (void)mz_tl_sti_i(tl_scheme_fuel_counter, JIT_R2, JIT_R1); #endif + } else + ref5 = NULL; CHECK_LIMIT(); /* Copy args to runstack base: */ @@ -324,14 +331,18 @@ int scheme_generate_tail_call(mz_jit_state *jitter, int num_rands, int direct_na /* RUNSTACK, RUNSTACK_BASE, V1, and R0 are ready */ /* Extract function and data: */ - jit_movr_p(JIT_R2, JIT_V1); - if (direct_native) { - jit_ldxi_p(JIT_V1, JIT_R0, &((Scheme_Native_Closure_Data *)0x0)->u.tail_code); + if (!direct_to_code) { + jit_movr_p(JIT_R2, JIT_V1); + if (direct_native) { + jit_ldxi_p(JIT_V1, JIT_R0, &((Scheme_Native_Closure_Data *)0x0)->u.tail_code); + } else { + jit_ldxi_p(JIT_V1, JIT_R0, &((Scheme_Native_Closure_Data *)0x0)->arity_code); + } + jit_movr_p(JIT_R0, JIT_R2); } else { - jit_ldxi_p(JIT_V1, JIT_R0, &((Scheme_Native_Closure_Data *)0x0)->arity_code); + jit_movr_p(JIT_R0, JIT_V1); } /* Set up arguments; JIT_RUNSTACK and JIT_RUNSTACK_BASE must also be ready */ - jit_movr_p(JIT_R0, JIT_R2); if (num_rands >= 0) { jit_movi_i(JIT_R1, num_rands); if (direct_native > 1) { /* => some_args_already_in_place */ @@ -342,10 +353,16 @@ int scheme_generate_tail_call(mz_jit_state *jitter, int num_rands, int direct_na mz_get_local_p(JIT_R1, JIT_LOCAL2); } jit_movr_p(JIT_R2, JIT_RUNSTACK); - if (need_set_rs) { + if (need_set_rs && !direct_to_code) { /* In case arity check fails, need to update runstack now: */ JIT_UPDATE_THREAD_RSPTR(); } + if (direct_native && direct_to_code) { + __END_SHORT_JUMPS__(num_rands < 100); + (void)jit_jmpi(direct_to_code); + /* no slow path in this mode */ + return 1; + } /* Now jump: */ jit_jmpr(JIT_V1); CHECK_LIMIT(); @@ -359,7 +376,8 @@ int scheme_generate_tail_call(mz_jit_state *jitter, int num_rands, int direct_na /* The slow way: */ /* V1 and RUNSTACK must be intact! */ - mz_patch_branch(ref5); + if (ref5) + mz_patch_branch(ref5); generate_pause_for_gc_and_retry(jitter, num_rands < 100, /* in short jumps */ JIT_V1, /* expose V1 to GC */ @@ -369,7 +387,8 @@ int scheme_generate_tail_call(mz_jit_state *jitter, int num_rands, int direct_na mz_patch_branch(ref); mz_patch_branch(ref2); } - mz_patch_branch(ref4); + if (ref4) + mz_patch_branch(ref4); CHECK_LIMIT(); if (need_set_rs) { JIT_UPDATE_THREAD_RSPTR(); @@ -1093,7 +1112,7 @@ static int do_generate_shared_call(mz_jit_state *jitter, void *_data) if (data->direct_prim) ok = generate_direct_prim_tail_call(jitter, data->num_rands); else - ok = scheme_generate_tail_call(jitter, data->num_rands, data->direct_native, 1, 0); + ok = scheme_generate_tail_call(jitter, data->num_rands, data->direct_native, 1, 0, NULL); scheme_jit_register_helper_func(jitter, code); @@ -1200,6 +1219,7 @@ int scheme_generate_app(Scheme_App_Rec *app, Scheme_Object **alt_rands, int num_ { int i, offset, need_safety = 0, apply_to_list = 0; int direct_prim = 0, need_non_tail = 0, direct_native = 0, direct_self = 0, nontail_self = 0; + void *inline_direct_native = NULL; int proc_already_in_place = 0; Scheme_Object *rator, *v, *arg; int reorder_ok = 0; @@ -1290,6 +1310,28 @@ int scheme_generate_app(Scheme_App_Rec *app, Scheme_Object **alt_rands, int num_ direct_self = 1; else if (jitter->self_nontail_code) nontail_self = 1; + } else if (is_tail) { + Scheme_Closure *c = (Scheme_Closure *)rator; + if (ZERO_SIZED_CLOSUREP(c)) { + /* If we're calling a constant function in tail position, then + there's a good chance that this function is a wrapper to + get to a loop. Inline the jump to the potential loop, + absorbing the runstack and C stack checks into the check + for this function --- only works if we can JIT the target + of the call. */ + Scheme_Native_Closure *nc; + nc = (Scheme_Native_Closure *)scheme_jit_closure((Scheme_Object *)data, NULL); + if (nc->code->code == scheme_on_demand_jit_code) { + if (nc->code->arity_code) { /* i.e., not in progress */ + scheme_on_demand_generate_lambda(nc, 0, NULL); + } + } + if (nc->code->code != scheme_on_demand_jit_code) { + if (nc->code->max_let_depth > jitter->max_tail_depth) + jitter->max_tail_depth = nc->code->max_let_depth; + inline_direct_native = nc->code->u.tail_code; + } + } } } reorder_ok = 1; @@ -1443,13 +1485,17 @@ int scheme_generate_app(Scheme_App_Rec *app, Scheme_Object **alt_rands, int num_ scheme_generate_non_tail(arg, jitter, 0, !need_non_tail, 0); /* sync'd below */ RESUME_JIT_DATA(); CHECK_LIMIT(); + if ((i == num_rands - 1) && !direct_prim && !reorder_ok && !direct_self && !proc_already_in_place) { /* Move rator back to register: */ mz_rs_ldxi(JIT_V1, i + offset); } if ((!direct_prim || (num_rands > 1) || (no_call == 2)) && (!direct_self || !is_tail || no_call || (i + 1 < num_rands))) { + int r0; + r0 = (mz_CURRENT_R0_STATUS_VALID() ? mz_CURRENT_R0_STATUS() : -1); mz_rs_stxi(i + offset, JIT_R0); + if (r0 > -1) mz_RECORD_R0_STATUS(r0); } } /* not sync'd... */ @@ -1498,6 +1544,7 @@ int scheme_generate_app(Scheme_App_Rec *app, Scheme_Object **alt_rands, int num_ if (no_call) { /* leave actual call to inlining code */ } else if (!(direct_self && is_tail) + && !inline_direct_native && (num_rands >= MAX_SHARED_CALL_RANDS)) { LOG_IT(("<-many args\n")); if (is_tail) { @@ -1509,7 +1556,7 @@ int scheme_generate_app(Scheme_App_Rec *app, Scheme_Object **alt_rands, int num_ jit_movi_l(JIT_R2, args_already_in_place); mz_set_local_p(JIT_R2, JIT_LOCAL2); } - scheme_generate_tail_call(jitter, num_rands, direct_native, jitter->need_set_rs, 1); + scheme_generate_tail_call(jitter, num_rands, direct_native, jitter->need_set_rs, 1, NULL); } } else { if (direct_prim) @@ -1536,6 +1583,16 @@ int scheme_generate_app(Scheme_App_Rec *app, Scheme_Object **alt_rands, int num_ LOG_IT(("<-self\n")); generate_self_tail_call(rator, jitter, num_rands, code, args_already_in_place, app, alt_rands); CHECK_LIMIT(); + } else if (inline_direct_native) { + LOG_IT(("<-native-tail\n")); + scheme_mz_flostack_restore(jitter, 0, 0, 1, 1); + if (args_already_in_place) { + jit_movi_l(JIT_R2, args_already_in_place); + mz_set_local_p(JIT_R2, JIT_LOCAL2); + } + scheme_generate_tail_call(jitter, num_rands, direct_native, jitter->need_set_rs, 1, + inline_direct_native); + CHECK_LIMIT(); } else { scheme_mz_flostack_restore(jitter, 0, 0, 1, 1); LOG_IT(("<-tail\n")); diff --git a/src/racket/src/jitcommon.c b/src/racket/src/jitcommon.c index eac6868574..c982324833 100644 --- a/src/racket/src/jitcommon.c +++ b/src/racket/src/jitcommon.c @@ -2464,7 +2464,7 @@ static int more_common1(mz_jit_state *jitter, void *_data) __END_SHORT_JUMPS__(1); - scheme_generate_tail_call(jitter, -1, 0, 1, 0); + scheme_generate_tail_call(jitter, -1, 0, 1, 0, NULL); CHECK_LIMIT(); }