diff --git a/collects/scribblings/guide/guide.scrbl b/collects/scribblings/guide/guide.scrbl index 13ae4b46f8..75bb5e5aad 100644 --- a/collects/scribblings/guide/guide.scrbl +++ b/collects/scribblings/guide/guide.scrbl @@ -106,15 +106,7 @@ printed output. @subsection[#:tag "ephemerons"]{Ephemerons} @; ---------------------------------------------------------------------- -@section[#:tag "performance"]{Performance} - -Every definition or expression is compiled to an internal bytecode -format. Standard optimizations are applied when compiling the -bytecode. For example, in an environment where @scheme[+] has its -usual binding, the expression @scheme[(let ([x 1][y (lambda () 4)]) (+ -1 (y)))] is compiled the same as the constant @scheme[5] due to -constant propagation, constant folding, and inlining optimizations. - +@include-section["performance.scrbl"] @; ---------------------------------------------------------------------- @section[#:tag "ffi"]{Foreign-Function Interface@aux-elem{ (FFI)}} diff --git a/collects/scribblings/guide/performance.scrbl b/collects/scribblings/guide/performance.scrbl new file mode 100644 index 0000000000..1d2e0bcd12 --- /dev/null +++ b/collects/scribblings/guide/performance.scrbl @@ -0,0 +1,318 @@ +#lang scribble/doc +@(require scribble/manual + "guide-utils.ss") + +@title[#:tag "performance"]{Performance} + +Alan Perlis famously quipped ``Lisp programmers know the value of +everything and the cost of nothing.'' A Scheme programmer knows, for +example, that a @scheme[lambda] anywhere in a program produces a value +that is closed over it lexical environment---but how much does +allocating that value cost? While most programmers have a reasonable +grasp of the cost of various operations and data structures at the +machine level, the gap between the Scheme language model and the +underlying computing machinery can be quite large. + +In this chapter, we narrow the gap by explaining details of the PLT +Scheme compiler and run-time system and how they affect the run-time +and memory performance of Scheme code. + +@; ---------------------------------------------------------------------- + +@section{The Bytecode and Just-in-Time (JIT) Compilers} + +Every definition or expression to be evaluated by Scheme is compiled +to an internal bytecode format. In interactive mode, this compilation +occurs automatically and on-the-fly. Tools like @exec{setup-plt} and +@scheme[compile-file] marshal compiled bytecode to a file. Most of the +time required to compile a file is actually in macro expansion; +generating bytecode from fully expanded code is relatively fast. + +The bytecode compiler applies all standard optimizations, such as +constant propagation, constant folding, inlining, and dead-code +elimination. For example, in an environment where @scheme[+] has its +usual binding, the expression @scheme[(let ([x 1][y (lambda () 4)]) (+ +1 (y)))] is compiled the same as the constant @scheme[5]. + +On some platforms, bytecode is further compiled to native code via a +@deftech{jut-in-time} or @deftech{JIT} compiler. The @tech{JIT} +compiler substantially speed programs that execute tight loops, +arithmetic on small integers, and arithmetic on inexact real +numbers. Currently, @tech{JIT} compilation is supported for x86, +x86_64 (a.k.a. AMD64), and 32-bit PowerPC processors. The @tech{JIT} +compiler can be disabled via the @scheme[eval-jit-enabled] parameter +or the @DFlag{no-jit}/@Flag{j} command-line flag. + +The @tech{JIT} compiler works incrementally as functions are applied, +but the @tech{JIT} compiler makes only limited use of run-time +information when compiling procedures, since the code for a given +module body or @scheme[lambda] abstraction is compiled only once. The +@tech{JIT}'s granularity of compilation is a single procedure body, +not counting the bodies of any lexically nested procedures. The +overhead for @tech{JIT} compilation is normally so small that it is +difficult to detect. + +@; ---------------------------------------------------------------------- + +@section{Modules and Performance} + +The module system aids optimization by helping to ensure that +identifiers have the usual bindings. That is, the @scheme[+] provided +by @schememodname[scheme/base] can be recognized by the compiler and +inlined, which is especially imported for @tech{JIT}-compiled code. +In contrast, in a traditional interactive Scheme system, the top-level +@scheme[+] binding might be redefined, so the compiler cannot assume a +fixed @scheme[+] binding (unless special flags or declarations +act as a poor-man's module system to indicate otherwise). + +Even in the top-level environment, importing with @scheme[require] +enables some inlining optimizations. Although a @scheme[+] definition +at the top level might shadow an imported @scheme[+], the shadowing +definition applies only to expressions evaluated later. + +Within a module, inlining and constant-propagation optimizations take +additional advantage of the fact that definitions within a module +cannot be mutated when no @scheme[set!] is visable at compile +time. Such optimizations are unavailable in the top-level +environment. Although this optimization within modules is important +for performance, it hinders some forms of interactive development and +exploration. The @scheme[compile-enforce-module-constants] parameter +disables the @tech{JIT} compiler's assumptions about module +definitions when interactive exploration is more important. See +@secref["module-set"] for more information. + +Currently, the compiler does not attempt to inline or propagate +constant across module boundary, except for exports of the built-in +modules (such as the one that originally provides @scheme[+]). + +The later section @secref["letrec-performance"] provides some +additional caveats concerning inlining of module bindings. + +@; ---------------------------------------------------------------------- + +@section[#:tag "func-call-performance"]{Function-Call Optimizations} + +When the compiler detects a function call to an immediately visible +function, it generates more efficient code than for a generic call, +especially for tail calls. For example, given the program + +@schemeblock[ +(letrec ([odd (lambda (x) + (if (zero? x) + #f + (even (sub1 x))))] + [even (lambda (x) + (if (zero? x) + #t + (odd (sub1 x))))]) + (odd 40000000)) +] + +the compiler can detect the @scheme[odd]--@scheme[even] loop and +produce code that runs much faster via loop unrolling and related +optimizations. + +Within a module form, @scheme[define]d variables are lexically scoped +like @scheme[letrec] bindings, and definitions within a module +therefore permit call optimizations, so + +@schemeblock[ +(define (odd x) ....) +(define (even x) ....) +] + +within a module would perform the same as the @scheme[letrec] version. + +Primitive operations like @scheme[pair?], @scheme[car], and +@scheme[cdr] are inlined at the machine-code level by the @tech{JIT} +compiler. See also the later section @secref["fixnums+flonums"] for +information about inlined arithmetic operations. + +@; ---------------------------------------------------------------------- + +@section{Mutation and Performance} + +Using @scheme[set!] to mutate a variable can lead to bad +performance. For example, the microbenchmark + +@schememod[ +scheme/base + +(define (subtract-one x) + (set! x (sub1 x)) + x) + +(time + (let loop ([n 4000000]) + (if (zero? n) + 'done + (loop (subtract-one n))))) +] + +runs much more slowly than the equivalent + +@schememod[ +scheme/base + +(define (subtract-one x) + (sub1 x)) + +(time + (let loop ([n 4000000]) + (if (zero? n) + 'done + (loop (subtract-one n))))) +] + +In the first variant, a new location is allocated for @scheme[x] on +every iteration, leading to poor performance. A more clever compiler +could unravel the use of @scheme[set!] in the first example, but since +mutation is discouraged (see @secref["using-set!"]), the compiler's +effort is spent elsewhere. + +More significantly, mutation can obscure bindings where inlining and +constant-propagation might otherwise apply. For example, in + +@schemeblock[ +(let ([minus1 #f]) + (set! minus1 sub1) + (let loop ([n 4000000]) + (if (zero? n) + 'done + (loop (minus1 n))))) +] + +the @scheme[set!] obscures the fact that @scheme[minus1] is just +another name for the built-in @scheme[sub1]. + +@; ---------------------------------------------------------------------- + +@section[#:tag "letrec-performance"]{@scheme[letrec] Performance} + +When @scheme[letrec] is used to bind only procedures and literals, +then the compiler can treat the bindings in an optimal manner, +compiling uses of the bindings efficiently. When other kinds of +bindings are mixed with procedures, the compiler may be less able to +determine the control flow. + +For example, + +@schemeblock[ +(letrec ([loop (lambda (x) + (if (zero? x) + 'done + (loop (next x))))] + [junk (display loop)] + [next (lambda (x) (sub1 x))]) + (loop 40000000)) +] + +likely compiles to less efficient code than + +@schemeblock[ +(letrec ([loop (lambda (x) + (if (zero? x) + 'done + (loop (next x))))] + [next (lambda (x) (sub1 x))]) + (loop 40000000)) +] + +In the first case, the compiler likely does not know that +@scheme[display] does not call @scheme[loop]. If it did, then +@scheme[loop] might refer to @scheme[next] before the binding is +available. + +This caveat about @scheme[letrec] also applies to definitions of +functions and constants within modules. A definition sequence in a +module body is analogous to a sequence of @scheme[letrec] bindings, +and non-constant expressions in a module body can interfere with the +optimization of references to later bindings. + +@; ---------------------------------------------------------------------- + +@section[#:tag "fixnums+flonums"]{Fixnum and Flonum Optimizations} + +A @deftech{fixnum} is a small exact integer. In this case, ``small'' +depends on the platform. For a 32-bit machine, numbers that can be +expressed in 30 bits plus a sign bit are represented as fixnums. On a +64-bit machine, 62 bits plus a sign bit are available. + +A @deftech{flonum} is used to represent any inexact real number. They +correspond to 64-bit IEEE floating-point numbers on all platforms. + +Inlined fixnum and flonum arithmetic operations are among the most +important advantages of the @tech{JIT} compiler. For example, when +@scheme[+] is applied to two arguments, the generated machine code +tests whether the two arguments are fixnums, and if so, it uses the +machine's instruction to add the numbers (and check for overflow). If +the two numbers are not fixnums, then the next check whether whether +both are flonums; in that case, the machine's floating-point +operations are used directly. For functions that take any number of +arguments, such as @scheme[+], inlining is applied only for the +two-argument case (except for @scheme[-], whose one-argument case is +also inlined). + +Flonums are @defterm{boxed}, which means that memory is allocated to +hold every result of a flonum computation. Fortunately, the +generational garbage collector (described later in @secref["gc-perf"]) +makes allocation for short-lived results reasonably cheap. Fixnums, in +contrast are never boxed, so they are especially cheap to use. + +@; ---------------------------------------------------------------------- + +@section[#:tag "gc-perf"]{Memory Management} + +PLT Scheme is available in two variants: @deftech{3m} and +@deftech{CGC}. The @tech{3m} variant uses a modern, +@deftech{generational garbage collector} that makes allocation +relatively cheap for short-lived objects. The @tech{CGC} variant uses +a @deftech{conservative garbage collector} which facilitates +interaction with C code at the expense of both precision and speed for +Scheme memory management. The 3m variant is the standard one. + +Although memory allocation is reasonably cheap, avoiding allocation +altogether is normally faster. One particular place where allocation +can be avoided sometimes is in @deftech{closures}, which are the +run-time representation of functions that contain free variables. +For example, + +@schemeblock[ +(let loop ([n 40000000][prev-thunk (lambda () #f)]) + (if (zero? n) + (prev-thunk) + (loop (sub1 n) + (lambda () n)))) +] + +allocates a closure on every iteration, since @scheme[(lambda () n)] +effectively saves @scheme[n]. + +The compiler can eliminate many closures automatically. For example, +in + +@schemeblock[ +(let loop ([n 40000000][prev-val #f]) + (let ([prev-thunk (lambda () n)]) + (if (zero? n) + prev-val + (loop (sub1 n) (prev-thunk))))) +] + +no closure is ever allocated for @scheme[prev-thunk], because its only +application is visible, and so it is inlined. Similarly, in + +@schemeblock[ +(let n-loop ([n 400000]) + (if (zero? n) + 'done + (let m-loop ([m 100]) + (if (zero? m) + (n-loop (sub1 n)) + (m-loop (sub1 m)))))) +] + +then the expansion of the @scheme[let] form to implement +@scheme[m-loop] involves a closure over @scheme[n], but the compiler +automatically converts the closure to pass itself @scheme[n] as an +argument instead. diff --git a/collects/scribblings/guide/set.scrbl b/collects/scribblings/guide/set.scrbl index c0e9c0ace3..50a4f963cb 100644 --- a/collects/scribblings/guide/set.scrbl +++ b/collects/scribblings/guide/set.scrbl @@ -45,7 +45,7 @@ greeted ] @;------------------------------------------------------------------------ -@section{Guidelines for Using Assignment} +@section[#:tag "using-set!"]{Guidelines for Using Assignment} Although using @scheme[set!] is sometimes appropriate, Scheme style generally discourages the use of @scheme[set!]. The following diff --git a/collects/tests/mzscheme/benchmarks/common/auto.ss b/collects/tests/mzscheme/benchmarks/common/auto.ss index e82a1d21ed..18ad55046b 100755 --- a/collects/tests/mzscheme/benchmarks/common/auto.ss +++ b/collects/tests/mzscheme/benchmarks/common/auto.ss @@ -179,6 +179,13 @@ exec mzscheme -qu "$0" ${1+"$@"} extract-mzscheme-times clean-up-nothing mutable-pair-progs) + (make-impl 'mz-old + mk-mzscheme + (lambda (bm) + (system (format "mz-old -u ~a.ss" bm))) + extract-mzscheme-times + clean-up-nothing + mutable-pair-progs) (make-impl 'mzschemecgc mk-mzscheme (lambda (bm) diff --git a/collects/tests/mzscheme/benchmarks/shootout/mandelbrot.ss b/collects/tests/mzscheme/benchmarks/shootout/mandelbrot.ss index b12521f837..e046ce3e1c 100644 --- a/collects/tests/mzscheme/benchmarks/shootout/mandelbrot.ss +++ b/collects/tests/mzscheme/benchmarks/shootout/mandelbrot.ss @@ -39,7 +39,7 @@ (ziq (* zi zi))) (cond ((> (+ zrq ziq) +limit-sqr+) 0) - (else (loop (add1 i) (+ (- zrq ziq) cr) (+ (* 2.0 zr zi) ci))))))))) + (else (loop (add1 i) (+ (- zrq ziq) cr) (+ (* 2.0 (* zr zi)) ci))))))))) ;; ------------------------------- diff --git a/collects/tests/mzscheme/optimize.ss b/collects/tests/mzscheme/optimize.ss index 5195534f56..09fd0b5327 100644 --- a/collects/tests/mzscheme/optimize.ss +++ b/collects/tests/mzscheme/optimize.ss @@ -65,12 +65,23 @@ (bin0 v op arg1 arg2))] [bin (lambda (v op arg1 arg2) (bin-exact v op arg1 arg2) - (let ([iv (if (number? v) - (exact->inexact v) - v)]) + (let* ([iv (if (number? v) + (exact->inexact v) + v)] + [iv0 (if (and (memq op '(* /)) (zero? iv)) + 0 + iv)]) (bin0 iv op (exact->inexact arg1) arg2) - (bin0 iv op arg1 (exact->inexact arg2)) - (bin0 iv op (exact->inexact arg1) (exact->inexact arg2))))] + (bin0 iv0 op arg1 (exact->inexact arg2)) + (bin0 iv op (exact->inexact arg1) (exact->inexact arg2))) + (let ([iv (if (number? v) + (if (eq? op '*) + (/ v (* 33333 33333)) + (if (eq? op '/) + v + (/ v 33333))) + v)]) + (bin0 iv op (/ arg1 33333) (/ arg2 33333))))] [tri0 (lambda (v op get-arg1 arg2 arg3 check-effect) ;; (printf "Trying ~a ~a ~a\n" op (get-arg1) arg2 arg3); (let ([name `(,op ,get-arg1 ,arg2, arg3)]) @@ -201,6 +212,19 @@ (un -5 'sub1 -4) (un (- (expt 2 30)) 'sub1 (- 1 (expt 2 30))) + (un -1 '- 1) + (un 1 '- -1) + (un (- (expt 2 30)) '- (expt 2 30)) + (un (expt 2 30) '- (- (expt 2 30))) + (un -0.0 '- 0.0) + (un 0.0 '- -0.0) + + (un 0 'abs 0) + (un 1 'abs 1) + (un 1 'abs -1) + (un (sub1 (expt 2 31)) 'abs (sub1 (expt 2 31))) + (un (sub1 (expt 2 31)) 'abs (add1 (expt -2 31))) + (bin 11 '+ 4 7) (bin -3 '+ 4 -7) (bin (expt 2 30) '+ (expt 2 29) (expt 2 29)) @@ -213,6 +237,21 @@ (bin (- (expt 2 30)) '- (- (expt 2 29)) (expt 2 29)) (bin (- 2 (expt 2 31)) '- (- 1 (expt 2 30)) (sub1 (expt 2 30))) + (bin 4 '* 1 4) + (bin 0 '* 0 4) + (bin 12 '* 3 4) + (bin -12 '* -3 4) + (bin -12 '* 3 -4) + (bin 12 '* -3 -4) + + (bin 0 '/ 0 4) + (bin 1/4 '/ 1 4) + (bin 4 '/ 4 1) + (bin 4 '/ 16 4) + (bin -4 '/ -16 4) + (bin -4 '/ 16 -4) + (bin 4 '/ -16 -4) + (bin 3 'min 3 300) (bin -300 'min 3 -300) (bin -400 'min -400 -300) diff --git a/src/mzscheme/src/jit.c b/src/mzscheme/src/jit.c index 90343fa6d6..0ed8efca98 100644 --- a/src/mzscheme/src/jit.c +++ b/src/mzscheme/src/jit.c @@ -59,6 +59,10 @@ END_XFORM_ARITH; #endif +#ifdef MZ_USE_JIT_I386 +# define JIT_USE_FP_OPS +#endif + #ifdef MZ_USE_JIT_X86_64 # define MZ_USE_JIT_I386 # define JIT_X86_64 @@ -1099,6 +1103,21 @@ static int inline_alloc(mz_jit_state *jitter, int amt, Scheme_Type ty, int keep_ } #endif +#ifdef JIT_USE_FP_OPS +# define INLINE_FP_COMP +# ifdef CAN_INLINE_ALLOC +# define INLINE_FP_OPS +# endif +#endif + +#if defined(INLINE_FP_OPS) && !defined(CAN_INLINE_ALLOC) +static double double_result; +static void *malloc_double(void) +{ + return scheme_make_double(double_result); +} +#endif + /*========================================================================*/ /* bytecode properties */ /*========================================================================*/ @@ -2325,6 +2344,9 @@ static jit_insn *generate_arith_slow_path(mz_jit_state *jitter, Scheme_Object *r jit_insn **_ref, jit_insn **_ref4, jit_insn **for_branch, int orig_args, int reversed, int arith, int use_v, int v) +/* *_ref is place to set for where to jump (for true case, if for_branch) after completing; + *_ref4 is place to set for where to jump for false if for_branch; + result is place to jump to start slow path if fixnum attempt fails */ { jit_insn *ref, *ref4, *refslow; @@ -2394,11 +2416,210 @@ static jit_insn *generate_arith_slow_path(mz_jit_state *jitter, Scheme_Object *r # define SCHEME_INT_SMALL_ENOUGH(rand2) 1 #endif +static int can_fast_double(int arith, int cmp, int two_args) +{ +#ifdef INLINE_FP_OPS + if ((arith == 1) + || (arith == -1) + || (arith == 2) + || (arith == -2) + || (arith == 11)) + return 1; +#endif +#ifdef INLINE_FP_COMP + if (!arith + || ((arith == 9) /* min */ && two_args) + || ((arith == 10) /* max */ && two_args)) + return 1; +#endif + + return 0; +} + +static int generate_double_arith(mz_jit_state *jitter, int arith, int cmp, int reversed, int two_args, int second_const, + jit_insn **_refd, jit_insn **_refdt, + int branch_short) +{ +#if defined(INLINE_FP_OPS) || defined(INLINE_FP_COMP) + GC_CAN_IGNORE jit_insn *ref8, *ref9, *ref10, *refd, *refdt, *refskip = NULL; + int no_alloc = 0; + + /* Maybe they're doubles */ + __START_SHORT_JUMPS__(1); + if (two_args) { + jit_orr_ul(JIT_R2, JIT_R0, JIT_R1); + ref8 = jit_bmsi_ul(jit_forward(), JIT_R2, 0x1); + } else + ref8 = NULL; + jit_ldxi_s(JIT_R2, JIT_R0, &((Scheme_Object *)0x0)->type); + ref9 = jit_bnei_p(jit_forward(), JIT_R2, scheme_double_type); + if (two_args) { + jit_ldxi_s(JIT_R2, JIT_R1, &((Scheme_Object *)0x0)->type); + ref10 = jit_bnei_p(jit_forward(), JIT_R2, scheme_double_type); + } else + ref10 = NULL; + CHECK_LIMIT(); + + if (!two_args && !second_const && ((arith == 2) || ((arith == -2) && reversed))) { + /* Special case: multiplication by exact 0 */ + jit_movi_p(JIT_R0, scheme_make_integer(0)); + } else { + __END_SHORT_JUMPS__(1); + + /* Yes, they're doubles. */ + jit_ldxi_d(JIT_FPR0, JIT_R0, &((Scheme_Double *)0x0)->double_val); + if (two_args) { + jit_ldxi_d(JIT_FPR1, JIT_R1, &((Scheme_Double *)0x0)->double_val); + } else if ((arith == -1) && !second_const && reversed) { + reversed = 0; + } else { + double d = second_const; + jit_movi_d(JIT_FPR1, d); + reversed = !reversed; + cmp = -cmp; + } + if (arith) { + switch (arith) { + case 1: + jit_addr_d(JIT_FPR0, JIT_FPR0, JIT_FPR1); + break; + case 2: + jit_mulr_d(JIT_FPR0, JIT_FPR0, JIT_FPR1); + break; + case -2: + if (reversed) + jit_divr_d(JIT_FPR0, JIT_FPR0, JIT_FPR1); + else + jit_divr_d(JIT_FPR0, JIT_FPR1, JIT_FPR0); + break; + case -1: + { + if (!two_args && !second_const && !reversed) { + /* Need a special case to make sure that (- 0.0) => -0.0 */ + jit_negr_d(JIT_FPR0, JIT_FPR0); + } else if (reversed) + jit_subr_d(JIT_FPR0, JIT_FPR0, JIT_FPR1); + else + jit_subr_d(JIT_FPR0, JIT_FPR1, JIT_FPR0); + } + break; + case 9: /* min */ + case 10: /* max */ + { + GC_CAN_IGNORE jit_insn *refc; + __START_SHORT_JUMPS__(1); + if (arith == 9) { + refc = jit_bler_d(jit_forward(), JIT_FPR0, JIT_FPR1); + } else { + refc = jit_bger_d(jit_forward(), JIT_FPR0, JIT_FPR1); + } + jit_movr_p(JIT_R0, JIT_R1); + mz_patch_branch(refc); + __END_SHORT_JUMPS__(1); + no_alloc = 1; + } + break; + case 11: /* abs */ + __START_SHORT_JUMPS__(1); + refskip = jit_bger_d(jit_forward(), JIT_FPR0, JIT_FPR1); + jit_subr_d(JIT_FPR0, JIT_FPR1, JIT_FPR0); + __END_SHORT_JUMPS__(1); + break; + default: + break; + } + CHECK_LIMIT(); + + if (!no_alloc) { +#ifdef INLINE_FP_OPS +# ifdef CAN_INLINE_ALLOC + inline_alloc(jitter, sizeof(Scheme_Double), scheme_double_type, 0); + CHECK_LIMIT(); + jit_addi_p(JIT_R0, JIT_V1, sizeof(long)); +# else + (void)jit_sti_d(&double_result, JIT_FPR0); + JIT_UPDATE_THREAD_RSPTR_IF_NEEDED(); + mz_prepare(0); + (void)mz_finish(malloc_double); + jit_retval(JIT_R0); +# endif +#endif + CHECK_LIMIT(); + + (void)jit_stxi_d(&((Scheme_Double *)0x0)->double_val, JIT_R0, JIT_FPR0); + + if (refskip) { + __START_SHORT_JUMPS__(1); + mz_patch_branch(refskip); + __END_SHORT_JUMPS__(1); + } + } + } else { + __START_SHORT_JUMPS__(branch_short); + switch (cmp) { + case -2: + refdt = jit_bltr_d(jit_forward(), JIT_FPR1, JIT_FPR0); + break; + case -1: + refdt = jit_bler_d(jit_forward(), JIT_FPR1, JIT_FPR0); + break; + case 0: + refdt = jit_beqr_d(jit_forward(), JIT_FPR1, JIT_FPR0); + break; + case 1: + refdt = jit_bger_d(jit_forward(), JIT_FPR1, JIT_FPR0); + break; + case 2: + refdt = jit_bgtr_d(jit_forward(), JIT_FPR1, JIT_FPR0); + break; + default: + refdt = NULL; + break; + } + __END_SHORT_JUMPS__(branch_short); + *_refdt = refdt; + } + + __START_SHORT_JUMPS__(1); + } + + /* Jump to return result or false branch: */ + refd = jit_jmpi(jit_forward()); + *_refd = refd; + + /* No, they're not both doubles. */ + if (two_args) { + mz_patch_branch(ref8); + mz_patch_branch(ref10); + } + mz_patch_branch(ref9); + __END_SHORT_JUMPS__(1); +#endif + + return 1; +} + static int generate_arith(mz_jit_state *jitter, Scheme_Object *rator, Scheme_Object *rand, Scheme_Object *rand2, int orig_args, int arith, int cmp, int v, jit_insn **for_branch, int branch_short) +/* Either arith is non-zero or it's a cmp; the value of each determines the operation: + arith = 1 -> + or add1 (if !rand2) + arith = -1 -> - or sub1 + arith = 2 -> * + arith = 3 -> bitwise-and + arith = 4 -> bitwise-ior + arith = 5 -> bitwise-xor + arith = 6 -> arithmetic-shift + arith = 7 -> bitwise-not + arith = 9 -> min + arith = 10 -> max + arith = 11 -> abs + cmp = 0 -> = or zero? + cmp = +/-1 -> >=/<= + cmp = +/-2 -> >/< or positive/negative? + */ { - GC_CAN_IGNORE jit_insn *ref, *ref2, *ref3, *ref4, *refslow; - int skipped, simple_rand, reversed = 0; + GC_CAN_IGNORE jit_insn *ref, *ref2, *ref3, *ref4, *refd = NULL, *refdt = NULL, *refslow; + int skipped, simple_rand, reversed = 0, has_fixnum_fast = 1; LOG_IT(("inlined %s\n", ((Scheme_Primitive_Proc *)rator)->name)); @@ -2437,6 +2658,11 @@ static int generate_arith(mz_jit_state *jitter, Scheme_Object *rator, Scheme_Obj } } + if ((arith == -1) && (orig_args == 1) && !v) { + /* Unary subtract */ + reversed = 1; + } + if (rand2) { simple_rand = (SAME_TYPE(SCHEME_TYPE(rand), scheme_local_type) || SCHEME_INTP(rand)); @@ -2462,6 +2688,16 @@ static int generate_arith(mz_jit_state *jitter, Scheme_Object *rator, Scheme_Obj generate_non_tail(rand2 ? rand2 : rand, jitter, 0, 1); CHECK_LIMIT(); + if (arith == 2) { + if (rand2 || ((v != 0) && (v != 1))) + has_fixnum_fast = 0; + } else if (arith == -2) { + if (rand2 || (v != 1) || reversed) + has_fixnum_fast = 0; + } + + /* rand2 in R0, and rand in R1 unless it's simple */ + if (simple_rand) { int pos, va; @@ -2471,6 +2707,7 @@ static int generate_arith(mz_jit_state *jitter, Scheme_Object *rator, Scheme_Obj } else { pos = mz_remap(SCHEME_LOCAL_POS(rand)); jit_ldxi_p(JIT_R1, JIT_RUNSTACK, WORDS_TO_BYTES(pos)); + /* check both fixnum bits at once by ANDing into R2: */ jit_andr_ul(JIT_R2, JIT_R0, JIT_R1); va = JIT_R2; } @@ -2479,41 +2716,92 @@ static int generate_arith(mz_jit_state *jitter, Scheme_Object *rator, Scheme_Obj ref2 = jit_bmsi_ul(jit_forward(), va, 0x1); __END_SHORT_JUMPS__(1); + if (!SCHEME_INTP(rand) && can_fast_double(arith, cmp, 1)) { + /* Maybe they're both doubles... */ + generate_double_arith(jitter, arith, cmp, reversed, 1, 0, &refd, &refdt, branch_short); + CHECK_LIMIT(); + } + + if (!has_fixnum_fast) { + __START_SHORT_JUMPS__(1); + mz_patch_branch(ref2); + __END_SHORT_JUMPS__(1); + } + /* Slow path */ refslow = generate_arith_slow_path(jitter, rator, &ref, &ref4, for_branch, orig_args, reversed, arith, 0, 0); - __START_SHORT_JUMPS__(1); - mz_patch_branch(ref2); - __END_SHORT_JUMPS__(1); + if (has_fixnum_fast) { + __START_SHORT_JUMPS__(1); + mz_patch_branch(ref2); + __END_SHORT_JUMPS__(1); + } CHECK_LIMIT(); } else if (rand2) { + /* Move rand result back into R1 */ jit_ldr_p(JIT_R1, JIT_RUNSTACK); jit_addi_p(JIT_RUNSTACK, JIT_RUNSTACK, WORDS_TO_BYTES(1)); mz_runstack_popped(jitter, 1); + /* check both fixnum bits at once by ANDing into R2: */ jit_andr_ul(JIT_R2, JIT_R0, JIT_R1); __START_SHORT_JUMPS__(1); ref2 = jit_bmsi_ul(jit_forward(), JIT_R2, 0x1); __END_SHORT_JUMPS__(1); + CHECK_LIMIT(); + + if (can_fast_double(arith, cmp, 1)) { + /* Maybe they're both doubles... */ + generate_double_arith(jitter, arith, cmp, reversed, 1, 0, &refd, &refdt, branch_short); + CHECK_LIMIT(); + } + + if (!has_fixnum_fast) { + __START_SHORT_JUMPS__(1); + mz_patch_branch(ref2); + __END_SHORT_JUMPS__(1); + } /* Slow path */ refslow = generate_arith_slow_path(jitter, rator, &ref, &ref4, for_branch, orig_args, reversed, arith, 0, 0); - __START_SHORT_JUMPS__(1); - mz_patch_branch(ref2); - __END_SHORT_JUMPS__(1); + if (has_fixnum_fast) { + /* Fixnum branch: */ + __START_SHORT_JUMPS__(1); + mz_patch_branch(ref2); + __END_SHORT_JUMPS__(1); + } CHECK_LIMIT(); } else { + /* Only one argument: */ __START_SHORT_JUMPS__(1); ref2 = jit_bmsi_ul(jit_forward(), JIT_R0, 0x1); __END_SHORT_JUMPS__(1); + if ((orig_args != 2) /* <- heuristic: we could generate code when an exact argument is + given, but the extra FP code is probably not worthwhile. */ + && can_fast_double(arith, cmp, 0) + /* watch out: divide by 0 is special: */ + && ((arith != -2) || v || reversed)) { + /* Maybe it's a double... */ + generate_double_arith(jitter, arith, cmp, reversed, 0, v, &refd, &refdt, branch_short); + CHECK_LIMIT(); + } + + if (!has_fixnum_fast) { + __START_SHORT_JUMPS__(1); + mz_patch_branch(ref2); + __END_SHORT_JUMPS__(1); + } + /* Slow path */ refslow = generate_arith_slow_path(jitter, rator, &ref, &ref4, for_branch, orig_args, reversed, arith, 1, v); - __START_SHORT_JUMPS__(1); - mz_patch_branch(ref2); - __END_SHORT_JUMPS__(1); + if (has_fixnum_fast) { + __START_SHORT_JUMPS__(1); + mz_patch_branch(ref2); + __END_SHORT_JUMPS__(1); + } } CHECK_LIMIT(); @@ -2538,6 +2826,16 @@ static int generate_arith(mz_jit_state *jitter, Scheme_Object *rator, Scheme_Obj (void)jit_bosubr_l(refslow, JIT_R2, JIT_R0); } jit_ori_ul(JIT_R0, JIT_R2, 0x1); + } else if (arith == 2) { + if (has_fixnum_fast) { + /* No fast path for fixnum multiplication, yet */ + (void)jit_jmpi(refslow); + } + } else if (arith == -2) { + if (has_fixnum_fast) { + /* No fast path for fixnum division, yet */ + (void)jit_jmpi(refslow); + } } else if (arith == 3) { /* and */ jit_andr_ul(JIT_R0, JIT_R1, JIT_R0); @@ -2563,6 +2861,7 @@ static int generate_arith(mz_jit_state *jitter, Scheme_Object *rator, Scheme_Obj jit_notr_l(JIT_V1, v2); jit_rshi_l(JIT_V1, JIT_V1, 0x1); jit_addi_l(JIT_V1, JIT_V1, 0x1); + CHECK_LIMIT(); #ifdef MZ_USE_JIT_I386 /* Can't shift from _ECX */ jit_movr_l(JIT_R2, v1); @@ -2572,6 +2871,7 @@ static int generate_arith(mz_jit_state *jitter, Scheme_Object *rator, Scheme_Obj #endif jit_ori_l(JIT_R0, JIT_R2, 0x1); refc = jit_jmpi(jit_forward()); + CHECK_LIMIT(); /* Left shift */ mz_patch_branch(refi); @@ -2585,6 +2885,7 @@ static int generate_arith(mz_jit_state *jitter, Scheme_Object *rator, Scheme_Obj #else jit_lshr_l(JIT_R2, v1, JIT_V1); #endif + CHECK_LIMIT(); /* If shifting back right produces a different result, that's overflow... */ jit_rshr_l(JIT_V1, JIT_R2, JIT_V1); /* !! In case we go refslow, it nseed to add back tag to v1 !! */ @@ -2622,6 +2923,28 @@ static int generate_arith(mz_jit_state *jitter, Scheme_Object *rator, Scheme_Obj (void)jit_bosubi_l(refslow, JIT_R2, v << 1); jit_movr_p(JIT_R0, JIT_R2); } + } else if (arith == 2) { + if (v == 1) { + /* R0 already is the answer */ + } else if (v == 0) { + jit_movi_p(JIT_R0, scheme_make_integer(0)); + } else { + if (has_fixnum_fast) { + /* No general fast path for fixnum multiplication, yet */ + jit_movi_p(JIT_R1, scheme_make_integer(v)); + (void)jit_jmpi(refslow); + } + } + } else if (arith == -2) { + if ((v == 1) && !reversed) { + /* R0 already is the answer */ + } else { + if (has_fixnum_fast) { + /* No general fast path for fixnum division, yet */ + jit_movi_p(JIT_R1, scheme_make_integer(v)); + (void)jit_jmpi(refslow); + } + } } else { if (arith == 3) { /* and */ @@ -2665,9 +2988,22 @@ static int generate_arith(mz_jit_state *jitter, Scheme_Object *rator, Scheme_Obj refc = jit_bgti_l(jit_forward(), JIT_R0, (long)scheme_make_integer(v)); jit_movi_l(JIT_R0, (long)scheme_make_integer(v)); mz_patch_branch(refc); + } else if (arith == 11) { + /* abs */ + jit_insn *refc; + refc = jit_bgti_l(jit_forward(), JIT_R0, (long)scheme_make_integer(0)); + jit_rshi_l(JIT_R0, JIT_R0, 1); + jit_movi_l(JIT_R1, 0); + jit_subr_l(JIT_R0, JIT_R1, JIT_R0); + jit_lshi_l(JIT_R0, JIT_R0, 1); + jit_ori_l(JIT_R0, JIT_R0, 0x1); + mz_patch_branch(refc); + CHECK_LIMIT(); } } } + if (refd) + mz_patch_ucbranch(refd); jit_patch_movi(ref, (_jit.x.pc)); } else { /* If second is constant, first arg is in JIT_R0. */ @@ -2711,14 +3047,20 @@ static int generate_arith(mz_jit_state *jitter, Scheme_Object *rator, Scheme_Obj break; } + if (refdt) + mz_patch_ucbranch(refdt); + if (for_branch) { for_branch[0] = ref3; + for_branch[1] = refd; for_branch[2] = ref; jit_patch_movi(ref4, (_jit.x.pc)); } else { (void)jit_movi_p(JIT_R0, scheme_true); ref2 = jit_jmpi(jit_forward()); mz_patch_branch(ref3); + if (refd) + mz_patch_ucbranch(refd); (void)jit_movi_p(JIT_R0, scheme_false); mz_patch_ucbranch(ref2); jit_patch_movi(ref, (_jit.x.pc)); @@ -3175,6 +3517,12 @@ static int generate_inlined_unary(mz_jit_state *jitter, Scheme_App2_Rec *app, in } else if (IS_NAMED_PRIM(rator, "sub1")) { generate_arith(jitter, rator, app->rand, NULL, 1, -1, 0, 1, NULL, 1); return 1; + } else if (IS_NAMED_PRIM(rator, "-")) { + generate_arith(jitter, rator, app->rand, NULL, 1, -1, 0, 0, NULL, 1); + return 1; + } else if (IS_NAMED_PRIM(rator, "abs")) { + generate_arith(jitter, rator, app->rand, NULL, 1, 11, 0, 0, NULL, 1); + return 1; } else if (IS_NAMED_PRIM(rator, "bitwise-not")) { generate_arith(jitter, rator, app->rand, NULL, 1, 7, 0, 9, NULL, 1); return 1; @@ -3455,6 +3803,12 @@ static int generate_inlined_binary(mz_jit_state *jitter, Scheme_App3_Rec *app, i } else if (IS_NAMED_PRIM(rator, "-")) { generate_arith(jitter, rator, app->rand1, app->rand2, 2, -1, 0, 0, NULL, 1); return 1; + } else if (IS_NAMED_PRIM(rator, "*")) { + generate_arith(jitter, rator, app->rand1, app->rand2, 2, 2, 0, 0, NULL, 1); + return 1; + } else if (IS_NAMED_PRIM(rator, "/")) { + generate_arith(jitter, rator, app->rand1, app->rand2, 2, -2, 0, 0, NULL, 1); + return 1; } else if (IS_NAMED_PRIM(rator, "min")) { generate_arith(jitter, rator, app->rand1, app->rand2, 2, 9, 0, 0, NULL, 1); return 1; diff --git a/src/mzscheme/src/lightning/i386/asm.h b/src/mzscheme/src/lightning/i386/asm.h index 6cefd93794..cd5784faec 100644 --- a/src/mzscheme/src/lightning/i386/asm.h +++ b/src/mzscheme/src/lightning/i386/asm.h @@ -583,6 +583,8 @@ typedef _uc jit_insn; #define JNLEm(D,B,I,S) JCCim(0xf,0xe,D,B,I,S) #define JGm(D,B,I,S) JCCim(0xf,0xe,D,B,I,S) +#define JCm(D,B,I,S) JBm(D,B,I,S) +#define JNCm(D,B,I,S) JNBm(D,B,I,S) #define JMPSm(D,B,I,S) ((_r0P(B) && _r0P(I)) ? _O_D8 (0xeb ,(long)(D) ) : \ JITFAIL("illegal mode in short jump")) diff --git a/src/mzscheme/src/lightning/i386/fp.h b/src/mzscheme/src/lightning/i386/fp.h index 0d27255635..9394d5dead 100644 --- a/src/mzscheme/src/lightning/i386/fp.h +++ b/src/mzscheme/src/lightning/i386/fp.h @@ -49,8 +49,8 @@ #define JIT_FPR_NUM 6 #define JIT_FPR(i) (i) -#define jit_fxch(rs, op) (((rs) != 0 ? FXCHr(rs) : 0), \ - op, ((rs) != 0 ? FXCHr(rs) : 0)) +#define jit_fxch(rs, op) (((rs) != 0 ? FXCHr(rs) : (void)0), \ + op, ((rs) != 0 ? FXCHr(rs) : (void)0)) #define jit_fp_unary(rd, s1, op) \ ((rd) == (s1) ? jit_fxch ((rd), op) \ @@ -62,7 +62,7 @@ ((s2) == 0 ? opr(0, (rd)) \ : (s2) == (s1) ? jit_fxch((rd), op(0, 0)) \ : jit_fxch((rd), op((s2), 0))) \ - : (rd) == (s2) ? jit_fxch((s1), opr(0, (rd) == 0 ? (s1) : (rd))) \ + : (rd) == (s2) ? jit_fxch((s1), op(0, (rd) == 0 ? (s1) : (rd))) \ : (FLDr (s1), op(0, (s2)+1), FSTPr((rd)+1))) #define jit_addr_d(rd,s1,s2) jit_fp_binary((rd),(s1),(s2),FADDrr,FADDrr) @@ -255,7 +255,7 @@ union jit_double_imm { ((_and) ? ANDLir ((_and), _EAX) : 0), \ ((cmp) ? CMPLir ((cmp), _AL) : 0), \ POPLr(_EAX), \ - res ((d), 0, 0, 0)) + res ((d), 0, 0, 0), _jit.x.pc) #define jit_nothing_needed(x) diff --git a/src/mzscheme/src/numarith.c b/src/mzscheme/src/numarith.c index 45a181bc2a..96f884c8a7 100644 --- a/src/mzscheme/src/numarith.c +++ b/src/mzscheme/src/numarith.c @@ -54,24 +54,22 @@ void scheme_init_numarith(Scheme_Env *env) scheme_add_global_constant("+", p, env); p = scheme_make_folding_prim(minus, "-", 1, -1, 1); - SCHEME_PRIM_PROC_FLAGS(p) |= SCHEME_PRIM_IS_BINARY_INLINED; + SCHEME_PRIM_PROC_FLAGS(p) |= (SCHEME_PRIM_IS_BINARY_INLINED + | SCHEME_PRIM_IS_UNARY_INLINED); scheme_add_global_constant("-", p, env); - scheme_add_global_constant("*", - scheme_make_folding_prim(mult, - "*", - 0, -1, 1), - env); - scheme_add_global_constant("/", - scheme_make_folding_prim(div_prim, - "/", - 1, -1, 1), - env); - scheme_add_global_constant("abs", - scheme_make_folding_prim(scheme_abs, - "abs", - 1, 1, 1), - env); + p = scheme_make_folding_prim(mult, "*", 0, -1, 1); + SCHEME_PRIM_PROC_FLAGS(p) |= SCHEME_PRIM_IS_BINARY_INLINED; + scheme_add_global_constant("*", p, env); + + p = scheme_make_folding_prim(div_prim, "/", 1, -1, 1); + SCHEME_PRIM_PROC_FLAGS(p) |= SCHEME_PRIM_IS_BINARY_INLINED; + scheme_add_global_constant("/", p, env); + + p = scheme_make_folding_prim(scheme_abs, "abs", 1, 1, 1); + SCHEME_PRIM_PROC_FLAGS(p) |= SCHEME_PRIM_IS_UNARY_INLINED; + scheme_add_global_constant("abs", p, env); + scheme_add_global_constant("quotient", scheme_make_folding_prim(quotient, "quotient",