diff --git a/collects/tests/racket/optimize.rktl b/collects/tests/racket/optimize.rktl index a8841f73ac..770a8be845 100644 --- a/collects/tests/racket/optimize.rktl +++ b/collects/tests/racket/optimize.rktl @@ -1979,6 +1979,53 @@ (for-each values numlist) (+ n1 n2))))) +;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Check JIT handling of unboxed arguments in loops, +;; including loops started in tail and non-tail positions. + +(let () + (define N 100000) + + (define (non-tail) + (define-values (a b) + (let loop ([n N] [x -1.0] [y 1.0]) + (cond + [(zero? n) (values x y)] + [else (loop (sub1 n) + (fl+ x -1.0) + (fl+ y 1.0))]))) + (values a b)) + + (define (non-tail2) + (for/fold ([v 0.0]) ([i (in-range N)]) + (define-values (a b) + (let loop ([n 10] [x -1.0] [y 1.0]) + (cond + [(zero? n) (values x y)] + [else (loop (sub1 n) + (fl+ x -1.0) + (fl+ y 1.0))]))) + (fl+ v (fl- a b)))) + + (define (tail) + (let loop ([n N] [x -1.0] [y 1.0]) + (cond + [(zero? n) (values x y)] + [else (loop (sub1 n) + (fl+ x -1.0) + (fl+ y 1.0))]))) + + (define x-tail #f) + (define x-non-tail #f) + (define x-non-tail2 #f) + (set! x-tail tail) + (set! x-non-tail non-tail) + (set! 
x-non-tail2 non-tail2) + + (test-values '(-100001.0 100001.0) non-tail) + (test -2200000.0 non-tail2) + (test-values '(-100001.0 100001.0) tail)) + ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (report-errs) diff --git a/src/racket/src/jit.c b/src/racket/src/jit.c index 5e22da74ee..78d8a87156 100644 --- a/src/racket/src/jit.c +++ b/src/racket/src/jit.c @@ -1089,8 +1089,6 @@ static int generate_flonum_local_boxing(mz_jit_state *jitter, int pos, int local int scheme_generate_flonum_local_unboxing(mz_jit_state *jitter, int push) /* Move FPR0 onto C stack */ { - int offset; - if (jitter->flostack_offset == jitter->flostack_space) { int space = FLOSTACK_SPACE_CHUNK * sizeof(double); jitter->flostack_space += FLOSTACK_SPACE_CHUNK; @@ -1102,8 +1100,7 @@ int scheme_generate_flonum_local_unboxing(mz_jit_state *jitter, int push) mz_runstack_flonum_pushed(jitter, jitter->flostack_offset); CHECK_LIMIT(); - offset = JIT_FRAME_FLONUM_OFFSET - (jitter->flostack_offset * sizeof(double)); - (void)jit_stxi_d_fppop(offset, JIT_FP, JIT_FPR0); + mz_st_fppop(jitter->flostack_offset, JIT_FPR0); return 1; } @@ -2278,7 +2275,7 @@ int scheme_generate(Scheme_Object *obj, mz_jit_state *jitter, int is_tail, int w if (is_tail) { if (!sjc.shared_tail_argc_code) { - sjc.shared_tail_argc_code = scheme_generate_shared_call(-1, jitter, 1, 1, 0, 0, 0); + sjc.shared_tail_argc_code = scheme_generate_shared_call(-1, jitter, 1, 1, 0, 0, 0, 0); } mz_set_local_p(JIT_R0, JIT_LOCAL2); (void)jit_jmpi(sjc.shared_tail_argc_code); @@ -2287,7 +2284,7 @@ int scheme_generate(Scheme_Object *obj, mz_jit_state *jitter, int is_tail, int w void *code; if (!sjc.shared_non_tail_argc_code[mo]) { scheme_ensure_retry_available(jitter, multi_ok); - code = scheme_generate_shared_call(-2, jitter, multi_ok, 0, 0, 0, 0); + code = scheme_generate_shared_call(-2, jitter, multi_ok, 0, 0, 0, 0, 0); sjc.shared_non_tail_argc_code[mo] = code; } code = sjc.shared_non_tail_argc_code[mo]; @@ -3295,10 +3292,10 @@ static int 
do_generate_closure(mz_jit_state *jitter, void *_data) GC_CAN_IGNORE jit_insn *zref; int f_offset; - /* In the case of an inline_direct_native call, the flonums are - already unpacked and JIT_SP is set up. Check whether JIT_SP - is already different than the 0 flonums. */ - f_offset = JIT_FRAME_FLONUM_OFFSET - (jitter->flostack_offset * sizeof(double)); + /* In the case of a direct native call, the flonums can be + already unpacked, in which case JIT_SP is set up. Check whether + JIT_SP is already different than the 0-flonums case. */ + f_offset = JIT_FRAME_FLONUM_OFFSET - (jitter->flostack_space * sizeof(double)); jit_subr_p(JIT_R1, JIT_SP, JIT_FP); zref = jit_bnei_l(jit_forward(), JIT_R1, f_offset); diff --git a/src/racket/src/jit.h b/src/racket/src/jit.h index 28e7bab2fb..d53fde2882 100644 --- a/src/racket/src/jit.h +++ b/src/racket/src/jit.h @@ -210,7 +210,7 @@ struct scheme_jit_common_record { #define MAX_SHARED_CALL_RANDS 25 void *shared_tail_code[4][MAX_SHARED_CALL_RANDS]; - void *shared_non_tail_code[4][MAX_SHARED_CALL_RANDS][2]; + void *shared_non_tail_code[5][MAX_SHARED_CALL_RANDS][2]; void *shared_non_tail_retry_code[2]; void *shared_non_tail_argc_code[2]; void *shared_tail_argc_code; @@ -683,13 +683,16 @@ int check_location; # define NEED_LOCAL4 #endif +#define mz_set_local_p(x, l) mz_set_local_p_x(x, l, JIT_FP) +#define mz_get_local_p(x, l) mz_get_local_p_x(x, l, JIT_FP) + #ifdef MZ_USE_JIT_PPC /* JIT_LOCAL1, JIT_LOCAL2, and JIT_LOCAL3 are offsets in the stack frame. */ # define JIT_LOCAL1 56 # define JIT_LOCAL2 60 # define JIT_LOCAL3 64 -# define mz_set_local_p(x, l) jit_stxi_p(l, JIT_FP, x) -# define mz_get_local_p(x, l) jit_ldxi_p(x, JIT_FP, l) +# define mz_set_local_p_x(x, l, FP) jit_stxi_p(l, FP, x) +# define mz_get_local_p_x(x, l, FP) jit_ldxi_p(x, FP, l) # define mz_patch_branch_at(a, v) (_jitl.long_jumps ? (void)jit_patch_movei(a-4, a-3, v) : (void)jit_patch_branch(a-1, v)) # define mz_patch_ucbranch_at(a, v) (_jitl.long_jumps ? 
(void)jit_patch_movei(a-4, a-3, v) : (void)jit_patch_ucbranch(a-1, v)) # define mz_prolog(x) (MFLRr(x), mz_set_local_p(x, JIT_LOCAL2)) @@ -741,8 +744,8 @@ void scheme_jit_prolog_again(mz_jit_state *jitter, int n, int ret_addr_reg) 1 word (for the return address) below alignment. */ # define JIT_LOCAL1 -(JIT_WORD_SIZE * 4) # define JIT_LOCAL2 -(JIT_WORD_SIZE * 5) -# define mz_set_local_p(x, l) jit_stxi_p((l), JIT_FP, (x)) -# define mz_get_local_p(x, l) jit_ldxi_p((x), JIT_FP, (l)) +# define mz_set_local_p_x(x, l, FP) jit_stxi_p((l), FP, (x)) +# define mz_get_local_p_x(x, l, FP) jit_ldxi_p((x), FP, (l)) # define mz_patch_branch_at(a, v) jit_patch_branch_at(a, v) # define mz_patch_ucbranch_at(a, v) jit_patch_ucbranch_at(a, v) /* The ABI for _CALL_DARWIN or JIT_X86_64 requires alignment. Even @@ -826,7 +829,23 @@ void scheme_jit_prolog_again(mz_jit_state *jitter, int n, int ret_addr_reg) # define mz_repush_threadlocal() /* empty */ #endif +#if 0 +static jit_insn *fp_tmpr; +# define check_fp_depth(i, FP) \ + (jit_addi_l(FP, FP, (JIT_FRAME_FLONUM_OFFSET - ((i) * sizeof(double)))), \ + fp_tmpr = jit_bger_l(0, FP, JIT_SP), \ + jit_ldi_p(FP, 0), \ + mz_patch_branch(fp_tmpr), \ + jit_subi_l(FP, FP, (JIT_FRAME_FLONUM_OFFSET - ((i) * sizeof(double))))) +#else +# define check_fp_depth(i, FP) (void)0 +#endif + #define FLOSTACK_SPACE_CHUNK 4 +# define mz_ld_fppush_x(r, i, FP) (check_fp_depth(i, FP), jit_ldxi_d_fppush(r, FP, (JIT_FRAME_FLONUM_OFFSET - ((i) * sizeof(double))))) +# define mz_ld_fppush(r, i) mz_ld_fppush_x(r, i, JIT_FP) +# define mz_st_fppop_x(i, r, FP) (check_fp_depth(i, FP), (void)jit_stxi_d_fppop((JIT_FRAME_FLONUM_OFFSET - ((i) * sizeof(double))), FP, r)) +# define mz_st_fppop(i, r) mz_st_fppop_x(i, r, JIT_FP) #define mz_patch_branch(a) mz_patch_branch_at(a, (_jit.x.pc)) #define mz_patch_ucbranch(a) mz_patch_ucbranch_at(a, (_jit.x.pc)) @@ -1198,14 +1217,14 @@ int scheme_generate_arith(mz_jit_state *jitter, Scheme_Object *rator, Scheme_Obj typedef struct 
jit_direct_arg jit_direct_arg; void *scheme_generate_shared_call(int num_rands, mz_jit_state *old_jitter, int multi_ok, int is_tail, - int direct_prim, int direct_native, int nontail_self); + int direct_prim, int direct_native, int nontail_self, int unboxed_args); void scheme_ensure_retry_available(mz_jit_state *jitter, int multi_ok); int scheme_generate_app(Scheme_App_Rec *app, Scheme_Object **alt_rands, int num_rands, mz_jit_state *jitter, int is_tail, int multi_ok, int no_call); int scheme_generate_tail_call(mz_jit_state *jitter, int num_rands, int direct_native, int need_set_rs, int is_inline, Scheme_Native_Closure *direct_to_code, jit_direct_arg *direct_arg); int scheme_generate_non_tail_call(mz_jit_state *jitter, int num_rands, int direct_native, int need_set_rs, - int multi_ok, int nontail_self, int pop_and_jump, int is_inlined); + int multi_ok, int nontail_self, int pop_and_jump, int is_inlined, int unboxed_args); int scheme_generate_finish_tail_call(mz_jit_state *jitter, int direct_native); void scheme_jit_register_sub_func(mz_jit_state *jitter, void *code, Scheme_Object *protocol); void scheme_jit_register_helper_func(mz_jit_state *jitter, void *code); @@ -1242,8 +1261,6 @@ int scheme_generate_non_tail_with_branch(Scheme_Object *obj, mz_jit_state *jitte int scheme_generate(Scheme_Object *obj, mz_jit_state *jitter, int tail_ok, int wcm_may_replace, int multi_ok, int target, Branch_Info *for_branch); int scheme_generate_unboxed(Scheme_Object *obj, mz_jit_state *jitter, int inlined_ok, int unbox_anyway); -void *scheme_generate_shared_call(int num_rands, mz_jit_state *old_jitter, int multi_ok, int is_tail, - int direct_prim, int direct_native, int nontail_self); #ifdef USE_FLONUM_UNBOXING int scheme_generate_flonum_local_unboxing(mz_jit_state *jitter, int push); diff --git a/src/racket/src/jitarith.c b/src/racket/src/jitarith.c index 43a94ba32a..dc86d7a66b 100644 --- a/src/racket/src/jitarith.c +++ b/src/racket/src/jitarith.c @@ -143,7 +143,8 @@ int 
scheme_can_unbox_inline(Scheme_Object *obj, int fuel, int regs, int unsafely just unbox it without using more than `regs' registers? There cannot be any errors or function calls, unless we've specifically instrumented them to save/pop floating-point values before - jumping. */ + jumping. If the result is true, then arguments must be evaluated in + order. */ { Scheme_Type t; @@ -932,11 +933,9 @@ int scheme_generate_arith(mz_jit_state *jitter, Scheme_Object *rator, Scheme_Obj if (!(inlined_flonum1 && inlined_flonum2)) { if ((can_direct1 || (unsafe_fl > 0)) && !inlined_flonum2) { #ifdef USE_FLONUM_UNBOXING - int aoffset; int fpr0; fpr0 = JIT_FPR_0(jitter->unbox_depth); - aoffset = JIT_FRAME_FLONUM_OFFSET - (jitter->flostack_offset * sizeof(double)); - jit_ldxi_d_fppush(fpr0, JIT_FP, aoffset); + mz_ld_fppush(fpr0, jitter->flostack_offset); scheme_mz_flostack_restore(jitter, flostack, flopos, 1, 1); CHECK_LIMIT(); jitter->unbox_depth++; diff --git a/src/racket/src/jitcall.c b/src/racket/src/jitcall.c index d2312940a1..18d1405cf7 100644 --- a/src/racket/src/jitcall.c +++ b/src/racket/src/jitcall.c @@ -667,11 +667,16 @@ static int generate_clear_slow_previous_args(mz_jit_state *jitter) } int scheme_generate_non_tail_call(mz_jit_state *jitter, int num_rands, int direct_native, int need_set_rs, - int multi_ok, int nontail_self, int pop_and_jump, int is_inlined) + int multi_ok, int nontail_self, int pop_and_jump, int is_inlined, + int unboxed_args) { /* Non-tail call. Proc is in V1, args are at RUNSTACK. If nontail_self, then R0 has proc pointer, and R2 has max_let_depth. + If unboxed_args, LOCAL3 holds address with argument-copying code, + where R2 is set before jumping to the old FP, and R1 holds + return address back here, and V1 and R0 must be preserved; + num_rands >= 0 in this case, and the "slow path" returns NULL. If num_rands < 0, then argc is in R0, and need to pop runstack before returning. If num_rands == -1, skip prolog. 
*/ GC_CAN_IGNORE jit_insn *ref, *ref2, *ref4, *ref5, *ref6, *ref7, *ref8, *ref9; @@ -738,6 +743,9 @@ int scheme_generate_non_tail_call(mz_jit_state *jitter, int num_rands, int direc jit_movr_p(JIT_R2, JIT_V1); jit_movr_p(JIT_V1, JIT_R0); } + if (unboxed_args) { + jit_movr_p(JIT_R2, JIT_FP); /* save old FP */ + } jit_shuffle_saved_regs(); /* maybe copies V registers to be restored */ #ifdef MZ_USE_JIT_I386 /* keep call & ret paired by jumping to where we really @@ -752,7 +760,7 @@ int scheme_generate_non_tail_call(mz_jit_state *jitter, int num_rands, int direc if (num_rands >= 0) { if (nontail_self) { jit_movr_p(JIT_R1, JIT_R0); } jit_movr_p(JIT_R0, JIT_V1); /* closure */ - if (!nontail_self) { + if (!nontail_self && !unboxed_args) { /* nontail_self is only enabled when there are no rest args: */ jit_movi_i(JIT_R1, num_rands); /* argc */ jit_movr_p(JIT_R2, JIT_RUNSTACK); /* argv */ @@ -769,8 +777,20 @@ int scheme_generate_non_tail_call(mz_jit_state *jitter, int num_rands, int direc jit_movr_p(JIT_R2, JIT_RUNSTACK); /* argv */ } CHECK_LIMIT(); + if (unboxed_args) { + /* old FP is still in R2 */ + mz_get_local_p_x(JIT_V1, JIT_LOCAL3, JIT_R2); + } mz_push_locals(); mz_repush_threadlocal(); + if (unboxed_args) { + GC_CAN_IGNORE jit_insn *refrr; + refrr = jit_patchable_movi_p(JIT_R1, jit_forward()); + jit_jmpr(JIT_V1); + jit_patch_movi(refrr, _jit.x.pc); + jit_movi_i(JIT_R1, num_rands); /* argc */ + jit_movr_p(JIT_R2, JIT_RUNSTACK); /* argv */ + } if (!nontail_self) { jit_ldxi_p(JIT_V1, JIT_R0, &((Scheme_Native_Closure *)0x0)->code); if (direct_native) { @@ -925,10 +945,13 @@ int scheme_generate_non_tail_call(mz_jit_state *jitter, int num_rands, int direc /* The slow way: */ mz_patch_branch(ref9); - generate_pause_for_gc_and_retry(jitter, - 1, /* in short jumps */ - JIT_V1, /* expose V1 to GC */ - refagain); /* retry code pointer */ + if (!unboxed_args) { + generate_pause_for_gc_and_retry(jitter, + 1, /* in short jumps */ + JIT_V1, /* expose V1 to GC */ + refagain); /* 
retry code pointer */ + } + CHECK_LIMIT(); if (!direct_native) { mz_patch_branch(ref); @@ -939,24 +962,34 @@ #ifndef FUEL_AUTODECEREMENTS mz_patch_branch(ref11); #endif - if (need_set_rs) { - JIT_UPDATE_THREAD_RSPTR(); - } - if (num_rands >= 0) { - jit_movi_i(JIT_R0, num_rands); - } - mz_prepare(3); - CHECK_LIMIT(); - jit_pusharg_p(JIT_RUNSTACK); - jit_pusharg_i(JIT_R0); - jit_pusharg_p(JIT_V1); - if (num_rands < 0) { jit_movr_p(JIT_V1, JIT_R0); } /* save argc to manually pop runstack */ - if (multi_ok) { - (void)mz_finish_lwe(x_ts__scheme_apply_multi_from_native, refrts); + if (unboxed_args) { + /* no slow path here; return NULL so the caller boxes the arguments and falls back to the generic path */ + jit_movi_p(JIT_R0, NULL); + if (pop_and_jump) { + mz_epilog(JIT_V1); + } } else { - (void)mz_finish_lwe(x_ts__scheme_apply_from_native, refrts); + /* normal slow path: */ + if (need_set_rs) { + JIT_UPDATE_THREAD_RSPTR(); + } + if (num_rands >= 0) { + jit_movi_i(JIT_R0, num_rands); + } + mz_prepare(3); + CHECK_LIMIT(); + jit_pusharg_p(JIT_RUNSTACK); + jit_pusharg_i(JIT_R0); + jit_pusharg_p(JIT_V1); + if (num_rands < 0) { jit_movr_p(JIT_V1, JIT_R0); } /* save argc to manually pop runstack */ + if (multi_ok) { + (void)mz_finish_lwe(x_ts__scheme_apply_multi_from_native, refrts); + } else { + (void)mz_finish_lwe(x_ts__scheme_apply_from_native, refrts); + } + CHECK_LIMIT(); } - CHECK_LIMIT(); + mz_patch_ucbranch(ref5); if (!direct_native) { mz_patch_ucbranch(ref8); @@ -1032,13 +1065,11 @@ static int generate_self_tail_call(Scheme_Object *rator, mz_jit_state *jitter, i int is_flonum, already_unboxed = 0; if ((SCHEME_CLOSURE_DATA_FLAGS(jitter->self_data) & CLOS_HAS_TYPED_ARGS) && CLOSURE_ARGUMENT_IS_FLONUM(jitter->self_data, i + args_already_in_place)) { - int aoffset; is_flonum = 1; rand = (alt_rands ? 
alt_rands[i+1+args_already_in_place] : app->args[i+1+args_already_in_place]); - aoffset = JIT_FRAME_FLONUM_OFFSET - (arg_tmp_offset * sizeof(double)); - jit_ldxi_d_fppush(JIT_FPR0, JIT_FP, aoffset); + mz_ld_fppush(JIT_FPR0, arg_tmp_offset); --arg_tmp_offset; already_unboxed = 1; if (!already_loaded && !SAME_TYPE(SCHEME_TYPE(rand), scheme_local_type)) { @@ -1053,11 +1084,9 @@ static int generate_self_tail_call(Scheme_Object *rator, mz_jit_state *jitter, i jit_stxi_p(WORDS_TO_BYTES(i + closure_size + args_already_in_place), JIT_R2, JIT_R0); #ifdef USE_FLONUM_UNBOXING if (is_flonum) { - int aoffset; if (!already_unboxed) jit_ldxi_d_fppush(JIT_FPR0, JIT_R0, &((Scheme_Double *)0x0)->double_val); - aoffset = JIT_FRAME_FLONUM_OFFSET - (arg_offset * sizeof(double)); - (void)jit_stxi_d_fppop(aoffset, JIT_FP, JIT_FPR0); + mz_st_fppop(arg_offset, JIT_FPR0); arg_offset++; } #endif @@ -1143,9 +1172,7 @@ static int generate_self_tail_call(Scheme_Object *rator, mz_jit_state *jitter, i iref = jit_bnei_p(jit_forward(), JIT_R0, NULL); __END_TINY_JUMPS__(1); { - int aoffset; - aoffset = JIT_FRAME_FLONUM_OFFSET - (arg_tmp_offset * sizeof(double)); - jit_ldxi_d_fppush(JIT_FPR0, JIT_FP, aoffset); + mz_ld_fppush(JIT_FPR0, arg_tmp_offset); (void)jit_calli(sjc.box_flonum_from_stack_code); mz_ld_runstack_base_alt(JIT_R2); jit_subi_p(JIT_R2, JIT_RUNSTACK_BASE_OR_ALT(JIT_R2), WORDS_TO_BYTES(num_rands + closure_size + args_already_in_place)); @@ -1186,7 +1213,7 @@ typedef struct { mz_jit_state *old_jitter; int multi_ok; int is_tail; - int direct_prim, direct_native, nontail_self; + int direct_prim, direct_native, nontail_self, unboxed_args; } Generate_Call_Data; void scheme_jit_register_sub_func(mz_jit_state *jitter, void *code, Scheme_Object *protocol) @@ -1243,7 +1270,7 @@ static int do_generate_shared_call(mz_jit_state *jitter, void *_data) ok = generate_direct_prim_non_tail_call(jitter, data->num_rands, data->multi_ok, 1); else ok = scheme_generate_non_tail_call(jitter, data->num_rands, 
data->direct_native, 1, - data->multi_ok, data->nontail_self, 1, 0); + data->multi_ok, data->nontail_self, 1, 0, data->unboxed_args); scheme_jit_register_sub_func(jitter, code, scheme_false); @@ -1252,7 +1279,7 @@ static int do_generate_shared_call(mz_jit_state *jitter, void *_data) } void *scheme_generate_shared_call(int num_rands, mz_jit_state *old_jitter, int multi_ok, int is_tail, - int direct_prim, int direct_native, int nontail_self) + int direct_prim, int direct_native, int nontail_self, int unboxed_args) { Generate_Call_Data data; @@ -1263,6 +1290,7 @@ void *scheme_generate_shared_call(int num_rands, mz_jit_state *old_jitter, int m data.direct_prim = direct_prim; data.direct_native = direct_native; data.nontail_self = nontail_self; + data.unboxed_args = unboxed_args; return scheme_generate_one(old_jitter, do_generate_shared_call, &data, 0, NULL, NULL); } @@ -1272,7 +1300,7 @@ void scheme_ensure_retry_available(mz_jit_state *jitter, int multi_ok) int mo = multi_ok ? 1 : 0; if (!sjc.shared_non_tail_retry_code[mo]) { void *code; - code = scheme_generate_shared_call(-1, jitter, multi_ok, 0, 0, 0, 0); + code = scheme_generate_shared_call(-1, jitter, multi_ok, 0, 0, 0, 0, 0); sjc.shared_non_tail_retry_code[mo] = code; } } @@ -1410,17 +1438,14 @@ static jit_direct_arg *check_special_direct_args(Scheme_App_Rec *app, Scheme_Obj return inline_direct_args; } +#ifdef USE_FLONUM_UNBOXING -int generate_fp_argument_shuffle(int direct_flostack_offset, mz_jit_state *jitter) +static int generate_fp_argument_shuffle(int direct_flostack_offset, mz_jit_state *jitter) { int i, j; /* Copy unboxed flonums into place where the target code expects them, which is shifted and reverse of the order that we pushed. 
*/ - -# define mz_ld_fppush(r, i) jit_ldxi_d_fppush(r, JIT_FP, (JIT_FRAME_FLONUM_OFFSET - ((i) * sizeof(double)))) -# define mz_st_fppop(i, r) (void)jit_stxi_d_fppop((JIT_FRAME_FLONUM_OFFSET - ((i) * sizeof(double))), JIT_FP, r) - if (direct_flostack_offset && ((direct_flostack_offset > 1) || (direct_flostack_offset != jitter->flostack_offset))) { @@ -1472,6 +1497,78 @@ int generate_fp_argument_shuffle(int direct_flostack_offset, mz_jit_state *jitte return 1; } +static int generate_call_path_with_unboxes(mz_jit_state *jitter, int direct_flostack_offset, void *unboxed_code, + GC_CAN_IGNORE jit_insn **_refdone, + int num_rands, Scheme_Closure_Data *direct_data, Scheme_Object *rator) +{ + GC_CAN_IGNORE jit_insn *refdone, *refgo, *refcopy, *ref; + int i, k, offset; + + refgo = jit_jmpi(jit_forward()); + refcopy = _jit.x.pc; + + /* Callback code to copy unboxed arguments. + R1 has the return address, R2 holds the old FP */ + + offset = FLOSTACK_SPACE_CHUNK * ((direct_flostack_offset + (FLOSTACK_SPACE_CHUNK - 1)) + / FLOSTACK_SPACE_CHUNK); + jit_subi_l(JIT_SP, JIT_SP, offset * sizeof(double)); + + for (i = 0; i < direct_flostack_offset; i++) { + int i_pos, a_pos; + i_pos = jitter->flostack_offset - direct_flostack_offset + i + 1; + a_pos = direct_flostack_offset - i; + mz_ld_fppush_x(JIT_FPR0, i_pos, JIT_R2); + mz_st_fppop(a_pos, JIT_FPR0); + CHECK_LIMIT(); + } + + jit_jmpr(JIT_R1); + + mz_patch_ucbranch(refgo); + + /* install callback pointer and jump to shared code: */ + + (void)jit_patchable_movi_p(JIT_R1, refcopy); + mz_set_local_p(JIT_R1, JIT_LOCAL3); + + (void)jit_calli(unboxed_code); + + refdone = jit_bnei_p(jit_forward(), JIT_R0, NULL); + *_refdone = refdone; + + CHECK_LIMIT(); + + /* box arguments for slow path */ + for (i = 0, k = 0; i < num_rands; i++) { + if ((SCHEME_CLOSURE_DATA_FLAGS(direct_data) & CLOS_HAS_TYPED_ARGS) + && (CLOSURE_ARGUMENT_IS_FLONUM(direct_data, i))) { + k++; + offset = jitter->flostack_offset - direct_flostack_offset + k; + offset = 
JIT_FRAME_FLONUM_OFFSET - (offset * sizeof(double)); + jit_ldxi_p(JIT_R0, JIT_RUNSTACK, WORDS_TO_BYTES(i)); + __START_TINY_JUMPS__(1); + ref = jit_bnei_p(jit_forward(), JIT_R0, NULL); + __END_TINY_JUMPS__(1); + CHECK_LIMIT(); + jit_movi_l(JIT_R0, offset); + (void)jit_calli(sjc.box_flonum_from_stack_code); + jit_stxi_p(WORDS_TO_BYTES(i), JIT_RUNSTACK, JIT_R0); + __START_TINY_JUMPS__(1); + mz_patch_branch(ref); + __END_TINY_JUMPS__(1); + } + } + + /* Reset V1 to rator for slow path: */ + scheme_generate(rator, jitter, 0, 0, 0, JIT_V1, NULL); + mz_rs_sync(); + + return 1; +} + +#endif + int scheme_generate_app(Scheme_App_Rec *app, Scheme_Object **alt_rands, int num_rands, mz_jit_state *jitter, int is_tail, int multi_ok, int no_call) /* de-sync'd ok @@ -1482,7 +1579,7 @@ int scheme_generate_app(Scheme_App_Rec *app, Scheme_Object **alt_rands, int num_ int direct_prim = 0, need_non_tail = 0, direct_native = 0, direct_self = 0, nontail_self = 0; Scheme_Native_Closure *inline_direct_native = NULL; Scheme_Closure_Data *direct_data = NULL; - int direct_flostack_offset = 0; + int direct_flostack_offset = 0, unboxed_non_tail_args = 0; jit_direct_arg *inline_direct_args = NULL; int proc_already_in_place = 0; Scheme_Object *rator, *v, *arg; @@ -1570,7 +1667,7 @@ int scheme_generate_app(Scheme_App_Rec *app, Scheme_Object **alt_rands, int num_ direct_self = 1; else if (jitter->self_nontail_code) nontail_self = 1; - } else if (is_tail) { + } else { Scheme_Closure *c = (Scheme_Closure *)rator; if (ZERO_SIZED_CLOSUREP(c)) { /* If we're calling a constant function in tail position, then @@ -1587,12 +1684,19 @@ int scheme_generate_app(Scheme_App_Rec *app, Scheme_Object **alt_rands, int num_ } } if (nc->code->start_code != scheme_on_demand_jit_code) { - if (nc->code->max_let_depth > jitter->max_tail_depth) - jitter->max_tail_depth = nc->code->max_let_depth; - - direct_data = data; /* for flonum handling */ - - inline_direct_native = nc; + if (is_tail) { + if (nc->code->max_let_depth > 
jitter->max_tail_depth) + jitter->max_tail_depth = nc->code->max_let_depth; + + direct_data = data; /* for flonum handling */ + + inline_direct_native = nc; + } else { + if (num_rands < MAX_SHARED_CALL_RANDS) { + direct_data = data; + unboxed_non_tail_args = 1; + } + } } } } @@ -1844,16 +1948,18 @@ int scheme_generate_app(Scheme_App_Rec *app, Scheme_Object **alt_rands, int num_ generate_nontail_self_setup(jitter); } scheme_generate_non_tail_call(jitter, num_rands, direct_native, jitter->need_set_rs, - multi_ok, nontail_self, 0, 1); + multi_ok, nontail_self, 0, 1, 0); } } } else { - /* Jump to code to implement a tail call for num_rands arguments */ + /* Jump to code to implement a [tail-]call for `num_rands' arguments */ void *code; int dp = (direct_prim ? 1 : (direct_native ? (1 + direct_native + (nontail_self ? 1 : 0)) : 0)); + /* if unboxed_non_tail_args, then we'll also use index 4 in place of dp */ + if (is_tail) { if (!sjc.shared_tail_code[dp][num_rands]) { - code = scheme_generate_shared_call(num_rands, jitter, multi_ok, is_tail, direct_prim, direct_native, 0); + code = scheme_generate_shared_call(num_rands, jitter, multi_ok, is_tail, direct_prim, direct_native, 0, 0); sjc.shared_tail_code[dp][num_rands] = code; } code = sjc.shared_tail_code[dp][num_rands]; @@ -1897,10 +2003,24 @@ int scheme_generate_app(Scheme_App_Rec *app, Scheme_Object **alt_rands, int num_ } } else { int mo = (multi_ok ? 
1 : 0); + void *unboxed_code; + + if (unboxed_non_tail_args && !direct_flostack_offset) + unboxed_non_tail_args = 0; + + if (unboxed_non_tail_args) { + if (!sjc.shared_non_tail_code[4][num_rands][mo]) { + scheme_ensure_retry_available(jitter, multi_ok); + code = scheme_generate_shared_call(num_rands, jitter, multi_ok, is_tail, direct_prim, direct_native, nontail_self, 1); + sjc.shared_non_tail_code[4][num_rands][mo] = code; + } + unboxed_code = sjc.shared_non_tail_code[4][num_rands][mo]; + } else + unboxed_code = NULL; if (!sjc.shared_non_tail_code[dp][num_rands][mo]) { scheme_ensure_retry_available(jitter, multi_ok); - code = scheme_generate_shared_call(num_rands, jitter, multi_ok, is_tail, direct_prim, direct_native, nontail_self); + code = scheme_generate_shared_call(num_rands, jitter, multi_ok, is_tail, direct_prim, direct_native, nontail_self, 0); sjc.shared_non_tail_code[dp][num_rands][mo] = code; } LOG_IT(("<-non-tail %d %d %d\n", dp, num_rands, mo)); @@ -1917,7 +2037,21 @@ int scheme_generate_app(Scheme_App_Rec *app, Scheme_Object **alt_rands, int num_ else (void)jit_calli(sjc.apply_to_list_code); } else { + GC_CAN_IGNORE jit_insn *refdone; + +#ifdef USE_FLONUM_UNBOXING + if (unboxed_code) { + generate_call_path_with_unboxes(jitter, direct_flostack_offset, unboxed_code, &refdone, + num_rands, direct_data, rator); + CHECK_LIMIT(); + } else +#endif + refdone = NULL; + (void)jit_calli(code); + + if (refdone) + mz_patch_branch(refdone); } if (direct_prim) { diff --git a/src/racket/src/jitcommon.c b/src/racket/src/jitcommon.c index 1e5713e0a8..b2fa5444eb 100644 --- a/src/racket/src/jitcommon.c +++ b/src/racket/src/jitcommon.c @@ -971,7 +971,7 @@ static int generate_apply_proxy(mz_jit_state *jitter, int setter) CHECK_LIMIT(); JIT_UPDATE_THREAD_RSPTR(); __END_SHORT_JUMPS__(1); - scheme_generate_non_tail_call(jitter, 3, 0, 0, 0, 0, 0, 1); + scheme_generate_non_tail_call(jitter, 3, 0, 0, 0, 0, 0, 1, 0); __START_SHORT_JUMPS__(1); CHECK_LIMIT(); if (setter) { @@ 
-2844,7 +2844,7 @@ static int more_common0(mz_jit_state *jitter, void *_data) mz_rs_sync(); __END_SHORT_JUMPS__(1); - scheme_generate_non_tail_call(jitter, 2, 0, 1, 0, 0, 0, 0); + scheme_generate_non_tail_call(jitter, 2, 0, 1, 0, 0, 0, 0, 0); CHECK_LIMIT(); __START_SHORT_JUMPS__(1); @@ -3279,7 +3279,7 @@ static int more_common1(mz_jit_state *jitter, void *_data) __END_SHORT_JUMPS__(1); - scheme_generate_non_tail_call(jitter, -1, 0, 1, multi_ok, 0, 1, 0); + scheme_generate_non_tail_call(jitter, -1, 0, 1, multi_ok, 0, 1, 0, 0); scheme_jit_register_sub_func(jitter, code, scheme_false); } diff --git a/src/racket/src/lightning/i386/fp-sse.h b/src/racket/src/lightning/i386/fp-sse.h index 015b2bcae0..170741d37d 100644 --- a/src/racket/src/lightning/i386/fp-sse.h +++ b/src/racket/src/lightning/i386/fp-sse.h @@ -98,7 +98,7 @@ #define jit_ldxr_d(f0, r0, r1) MOVSDmr(0, r0, r1, _SCL1, f0) -#define jit_ldxi_d(f0, r0, i0) MOVSDmr(i0, r0, _NOREG, _SCL1, f0); +#define jit_ldxi_d(f0, r0, i0) MOVSDmr(i0, r0, _NOREG, _SCL1, f0) #define jit_str_d(r0, f0) MOVSDrm(f0, 0, r0, _NOREG, _SCL1)