From 540c58bbe8d6dc0f8be86c8d717d1e2343fee056 Mon Sep 17 00:00:00 2001 From: Matthew Flatt Date: Sat, 11 Jan 2020 09:28:38 -0700 Subject: [PATCH] use POPCNT instruction when available on x86_64 On x86_64, a POPCNT instruction is usually available, and it can speed up `fxpopcount` operations by a factor of 2-3. Since POPCNT isn't always available, code using `fxpopcount` is compiled to a call to a generic implementation. The linker substitutes a POPCNT instruction when it determines at runtime that POPCNT is available. Some measurements on a 2018 MacBook Pro (2.7 GHz Core i7) using the program below: popcnt = this implementation, POPCNT discovered nocnt = this implementation, POPCNT considered unavailable optcnt = compile to use POPCNT directly (no linker work) cpcnt = compile to inlined generic (no linker work, no POPCNT) Since the generic implementation is always a 64-bit popcount, it's not as good as an inlined version for `fxpopcount32`, but otherwise the link-edit approach to POPCNT works well: fxpopcount fxpopcount32 popcnt: 0.098s nocnt: 0.284s optcnt 0.109s [slower means noise?] 
cpcnt: 0.279s 0.188s (optimize-level 3) (time (let loop ([v #f] [i 100000000]) (if (fx= i 0) v (loop (fxpopcount i) (fx- i 1))))) original commit: 5f090e509f8fe5edc777ed9f0463b20c2e571336 --- c/externs.h | 3 ++ c/fasl.c | 53 +++++++++++++++--- c/prim.c | 4 ++ makefiles/Mf-install.in | 2 +- s/a6fb.def | 2 +- s/a6le.def | 2 +- s/a6nb.def | 2 +- s/a6nt.def | 2 +- s/a6ob.def | 2 +- s/a6osx.def | 2 +- s/a6s2.def | 2 +- s/cmacros.ss | 6 ++- s/compile.ss | 10 +++- s/cpnanopass.ss | 34 +++++++++++- s/library.ss | 2 + s/mkheader.ss | 4 ++ s/np-languages.ss | 1 + s/ta6fb.def | 2 +- s/ta6le.def | 2 +- s/ta6nb.def | 2 +- s/ta6nt.def | 2 +- s/ta6ob.def | 2 +- s/ta6osx.def | 2 +- s/ta6s2.def | 2 +- s/x86_64.ss | 115 ++++++++++++++++++++++++++++++++++------ 25 files changed, 221 insertions(+), 41 deletions(-) diff --git a/c/externs.h b/c/externs.h index b66cfacab3..9fb275714b 100644 --- a/c/externs.h +++ b/c/externs.h @@ -118,6 +118,9 @@ extern void S_set_code_obj PROTO((char *who, IFASLCODE typ, ptr p, iptr n, extern ptr S_get_code_obj PROTO((IFASLCODE typ, ptr p, iptr n, iptr o)); extern int S_fasl_stream_read PROTO((void *stream, octet *dest, iptr n)); extern int S_fasl_intern_rtd(ptr *x); +#ifdef X86_64 +extern void x86_64_set_popcount_present PROTO((ptr code)); +#endif /* vfasl.c */ extern ptr S_to_vfasl PROTO((ptr v)); diff --git a/c/fasl.c b/c/fasl.c index 5326bdb8b3..18e0f04933 100644 --- a/c/fasl.c +++ b/c/fasl.c @@ -241,6 +241,7 @@ static uptr ppc32_get_jump PROTO((void *address)); #ifdef X86_64 static void x86_64_set_jump PROTO((void *address, uptr item, IBOOL callp)); static uptr x86_64_get_jump PROTO((void *address)); +static void x86_64_set_popcount PROTO((void *address, uptr item)); #endif /* X86_64 */ #ifdef SPARC64 static INT extract_reg_from_sethi PROTO((void *address)); @@ -1293,6 +1294,9 @@ void S_set_code_obj(who, typ, p, n, x, o) char *who; IFASLCODE typ; iptr n, o; p case reloc_x86_64_call: x86_64_set_jump(address, item, 1); break; + case 
reloc_x86_64_popcount: + x86_64_set_popcount(address, item); + break; #endif /* X86_64 */ #ifdef SPARC64 case reloc_sparc64abs: @@ -1364,6 +1368,9 @@ ptr S_get_code_obj(typ, p, n, o) IFASLCODE typ; iptr n, o; ptr p; { case reloc_x86_64_call: item = x86_64_get_jump(address); break; + case reloc_x86_64_popcount: /* target is not decoded from the site, which may hold an inline POPCNT instead of a call */ + item = (uptr)Svector_ref(S_G.library_entry_vector, library_popcount_slow) + o; + break; #endif /* X86_64 */ #ifdef SPARC64 case reloc_sparc64abs: @@ -1509,18 +1516,20 @@ static uptr ppc32_get_jump(void *address) { #endif /* PPC32 */ #ifdef X86_64 + static void x86_64_set_jump(void *address, uptr item, IBOOL callp) { I64 disp = (I64)item - ((I64)address + 5); /* 5 = size of call instruction */ if ((I32)disp == disp) { *(octet *)address = callp ? 0xE8 : 0xE9; /* call or jmp disp32 opcode */ *(I32 *)((uptr)address + 1) = (I32)disp; - *((octet *)address + 5) = 0x90; /* nop */ - *((octet *)address + 6) = 0x90; /* nop */ - *((octet *)address + 7) = 0x90; /* nop */ - *((octet *)address + 8) = 0x90; /* nop */ - *((octet *)address + 9) = 0x90; /* nop */ - *((octet *)address + 10) = 0x90; /* nop */ - *((octet *)address + 11) = 0x90; /* nop */ + /* 7-byte nop: */ + *((octet *)address + 5) = 0x0F; + *((octet *)address + 6) = 0x1F; + *((octet *)address + 7) = 0x80; + *((octet *)address + 8) = 0x00; + *((octet *)address + 9) = 0x00; + *((octet *)address + 10) = 0x00; + *((octet *)address + 11) = 0x00; } else { *(octet *)address = 0x48; /* REX w/REX.w set */ *((octet *)address + 1)= 0xB8; /* MOV imm64 to RAX */ @@ -1538,6 +1547,36 @@ static uptr x86_64_get_jump(void *address) { /* must be short form: call/jmp */ return ((uptr)address + 5) + *(I32 *)((uptr)address + 1); } + +static int popcount_present; /* 0 until x86_64_set_popcount_present sets it to 1 on seeing the POPCNT feature bit */ + +static void x86_64_set_popcount(void *address, uptr item) { /* fills the 12-byte site at address: a call to item when POPCNT is unavailable, else an inline POPCNT of RDI into RAX */ + if (!popcount_present) { + x86_64_set_jump(address, item, 1); /* callp = 1: call form, not jmp */ + } else { + *((octet *)address + 0) = 0x48; /* REX w/REX.W set */ + *((octet *)address + 1) = 0x31; /* XOR RAX, RAX - avoid false dependency */ + *((octet 
*)address + 2) = 0xc0; /* ModRM for the XOR: RAX, RAX */ + *((octet *)address + 3) = 0xF3; /* prefix byte that is part of the POPCNT encoding */ + *((octet *)address + 4) = 0x48; /* REX w/REX.W set: 64-bit operands */ + *((octet *)address + 5) = 0x0F; /* POPCNT */ + *((octet *)address + 6) = 0xB8; /* second POPCNT opcode byte */ + *((octet *)address + 7) = 0xC7; /* RDI -> RAX */ + /* 4-byte nop: pads the 8 bytes above out to the 12-byte site */ + *((octet *)address + 8) = 0x0F; + *((octet *)address + 9) = 0x1F; + *((octet *)address + 10) = 0x40; + *((octet *)address + 11) = 0x00; + } +} + +void x86_64_set_popcount_present(ptr code) { /* code: code object of the cpu-features library entry; its machine code starts at code_data_disp */ + /* cpu_features returns ECX after CPUID for function 1 */ + int (*cpu_features)() = (int (*)())((uptr)code + code_data_disp); + if (cpu_features() & (1 << 23)) /* ECX bit 23 = POPCNT supported */ + popcount_present = 1; +} + #endif /* X86_64 */ #ifdef SPARC64 diff --git a/c/prim.c b/c/prim.c index 91afee40e9..ec25d2ccba 100644 --- a/c/prim.c +++ b/c/prim.c @@ -42,6 +42,10 @@ static void install_library_entry(n, x) ptr n, x; { S_G.nonprocedure_code = x; S_retrofit_nonprocedure_code(); } +#ifdef X86_64 + if (n == FIX(library_cpu_features)) + x86_64_set_popcount_present(x); +#endif } ptr S_lookup_library_entry(n, errorp) iptr n; IBOOL errorp; { diff --git a/makefiles/Mf-install.in b/makefiles/Mf-install.in index 69945027ca..90a424ab62 100644 --- a/makefiles/Mf-install.in +++ b/makefiles/Mf-install.in @@ -62,7 +62,7 @@ InstallLZ4Target= # no changes should be needed below this point # ############################################################################### -Version=csv9.5.3.10 +Version=csv9.5.3.11 Include=boot/$m PetiteBoot=boot/$m/petite.boot SchemeBoot=boot/$m/scheme.boot diff --git a/s/a6fb.def b/s/a6fb.def index 095a4065fd..6f62f5e5e9 100644 --- a/s/a6fb.def +++ b/s/a6fb.def @@ -45,7 +45,7 @@ (define-constant unaligned-floats #t) (define-constant unaligned-integers #t) (define-constant integer-divide-instruction #t) -(define-constant popcount-instruction #f) +(define-constant popcount-instruction #t) (define-constant software-floating-point #f) (define-constant segment-table-levels 3) (features iconv expeditor) diff --git a/s/a6le.def b/s/a6le.def index 
39b55f44e5..d1d38dce01 100644 --- a/s/a6le.def +++ b/s/a6le.def @@ -45,7 +45,7 @@ (define-constant unaligned-floats #t) (define-constant unaligned-integers #t) (define-constant integer-divide-instruction #t) -(define-constant popcount-instruction #f) +(define-constant popcount-instruction #t) (define-constant software-floating-point #f) (define-constant segment-table-levels 3) (features iconv expeditor) diff --git a/s/a6nb.def b/s/a6nb.def index b7b8dd9e5a..a0d51758d3 100644 --- a/s/a6nb.def +++ b/s/a6nb.def @@ -45,7 +45,7 @@ (define-constant unaligned-floats #t) (define-constant unaligned-integers #t) (define-constant integer-divide-instruction #t) -(define-constant popcount-instruction #f) +(define-constant popcount-instruction #t) (define-constant software-floating-point #f) (define-constant segment-table-levels 3) (features iconv expeditor) diff --git a/s/a6nt.def b/s/a6nt.def index 122a60ae4f..858ec529a9 100644 --- a/s/a6nt.def +++ b/s/a6nt.def @@ -45,7 +45,7 @@ (define-constant unaligned-floats #t) (define-constant unaligned-integers #t) (define-constant integer-divide-instruction #t) -(define-constant popcount-instruction #f) +(define-constant popcount-instruction #t) (define-constant software-floating-point #f) (define-constant segment-table-levels 3) (features iconv expeditor windows) diff --git a/s/a6ob.def b/s/a6ob.def index fb9b7a1c61..a1bac80483 100644 --- a/s/a6ob.def +++ b/s/a6ob.def @@ -45,7 +45,7 @@ (define-constant unaligned-floats #t) (define-constant unaligned-integers #t) (define-constant integer-divide-instruction #t) -(define-constant popcount-instruction #f) +(define-constant popcount-instruction #t) (define-constant software-floating-point #f) (define-constant segment-table-levels 3) (features iconv expeditor) diff --git a/s/a6osx.def b/s/a6osx.def index ea26168a19..dbf65de27d 100644 --- a/s/a6osx.def +++ b/s/a6osx.def @@ -45,7 +45,7 @@ (define-constant unaligned-floats #t) (define-constant unaligned-integers #t) (define-constant 
integer-divide-instruction #t) -(define-constant popcount-instruction #f) +(define-constant popcount-instruction #t) (define-constant software-floating-point #f) (define-constant segment-table-levels 3) (features iconv expeditor) diff --git a/s/a6s2.def b/s/a6s2.def index efa47f7245..85342c1769 100644 --- a/s/a6s2.def +++ b/s/a6s2.def @@ -45,7 +45,7 @@ (define-constant unaligned-floats #t) (define-constant unaligned-integers #t) (define-constant integer-divide-instruction #t) -(define-constant popcount-instruction #f) +(define-constant popcount-instruction #t) (define-constant software-floating-point #f) (define-constant segment-table-levels 3) (features iconv expeditor) diff --git a/s/cmacros.ss b/s/cmacros.ss index 472d6fa7ff..f9b1b16448 100644 --- a/s/cmacros.ss +++ b/s/cmacros.ss @@ -328,7 +328,7 @@ [(_ foo e1 e2) e1] ... [(_ bar e1 e2) e2]))))]))) -(define-constant scheme-version #x0905030A) +(define-constant scheme-version #x0905030B) (define-syntax define-machine-types (lambda (x) @@ -500,7 +500,7 @@ (sparc reloc-sparcabs reloc-sparcrel) (sparc64 reloc-sparc64abs reloc-sparc64rel) (ppc reloc-ppccall reloc-ppcload) - (x86_64 reloc-x86_64-call reloc-x86_64-jump) + (x86_64 reloc-x86_64-call reloc-x86_64-jump reloc-x86_64-popcount) (arm32 reloc-arm32-abs reloc-arm32-call reloc-arm32-jump) (ppc32 reloc-ppc32-abs reloc-ppc32-call reloc-ppc32-jump)) @@ -2716,6 +2716,8 @@ ($wrapper-apply #f 0 #f #f) (wrapper-apply #f 0 #f #f) (arity-wrapper-apply #f 0 #f #f) + (popcount-slow #f 0 #f #t) + (cpu-features #f 0 #f #t) )) (let () diff --git a/s/compile.ss b/s/compile.ss index e37a4ace5f..7fbac0b5e9 100644 --- a/s/compile.ss +++ b/s/compile.ss @@ -204,6 +204,10 @@ (let ([a1 (fx- a 12)] [x* (cons (mkcode x) x*)]) (let ([r ($reloc (constant reloc-x86_64-call) n (fx- a1 ra))]) (mkc0 (cdr c*) a (cons r r*) a1 x*)))] + [(x86_64-popcount) (n x) + (let ([a1 (fx- a 12)] [x* (cons (mkcode x) x*)]) + (let ([r ($reloc (constant reloc-x86_64-popcount) n (fx- a1 ra))]) + (mkc0 (cdr 
c*) a (cons r r*) a1 x*)))] [else (c-assembler-output-error c)])] [else (c-assembler-output-error c)])])))) p))] @@ -258,7 +262,7 @@ [else (void)])] [(x86_64) (record-case x - [(x86_64-jump x86_64-call) (n x) (build x d)] + [(x86_64-jump x86_64-call x86_64-popcount) (n x) (build x d)] [else (void)])] [(arm32) (record-case x @@ -394,6 +398,10 @@ (let ([a1 (fx- a 12)]) ; 10-byte moviq followed by 2-byte call (let ([r ($reloc (constant reloc-x86_64-call) n (fx- a1 ra))]) (prf0 (cdr c*) a (cons r r*) a1 (cons x x*))))] + [(x86_64-popcount) (n x) + (let ([a1 (fx- a 12)]) ; like a call, for worst case + (let ([r ($reloc (constant reloc-x86_64-popcount) n (fx- a1 ra))]) + (prf0 (cdr c*) a (cons r r*) a1 (cons x x*))))] [else (c-assembler-output-error c)])] [else (c-assembler-output-error c)])]))))])) diff --git a/s/cpnanopass.ss b/s/cpnanopass.ss index 020ce18f10..3912b559d2 100644 --- a/s/cpnanopass.ss +++ b/s/cpnanopass.ss @@ -995,6 +995,11 @@ (sealed #t) (fields offset)) + (define-record-type info-inline (nongenerative) + (parent info) + (sealed #t) + (fields)) + (module () (record-writer (record-type-descriptor info-load) (lambda (x p wr) @@ -5723,7 +5728,8 @@ (define hand-coded-closure? (lambda (name) (not (memq name '(nuate nonprocedure-code error-invoke invoke - $wrapper-apply wrapper-apply arity-wrapper-apply))))) + $wrapper-apply wrapper-apply arity-wrapper-apply + popcount-slow cpu-features))))) (define-inline 2 $hand-coded [(name) (nanopass-case (L7 Expr) name @@ -13384,6 +13390,32 @@ (in %ac0 %ac1 scheme-args) (out %cp %xp %yp %ts %td extra-regs)) (goto ,Lexit))))] + [(popcount-slow) + `(lambda ,(make-info "popcount-slow" '()) 0 () + ,(constant-case architecture + [(x86_64) + `(seq + (set! ,%rax (inline ,(make-info-inline) ,%popcount ,%rdi)) + (asm-c-return ,null-info ,%rdi ,%rax))] + [else + ;; Generate anything, since this should not get called + `(seq + (set! 
,%ac0 (immediate 0)) + (jump ,%ref-ret (,%ac0)))]))] + [(cpu-features) + `(lambda ,(make-info "cpu-features" '()) 0 () + ,(constant-case architecture + [(x86_64) + (%seq + (set! ,%rdi ,%rbx) ; %rbx must be preserved + (set! ,%rax (inline ,(make-info-kill* (reg-list %rbx %rcx %rdx)) ,%cpuid)) + (set! ,%rbx ,%rdi) + (asm-c-return ,null-info ,%rax ,%rbx))] + [else + ;; Generate anything, since this should not get called + `(seq + (set! ,%ac0 (immediate 0)) + (jump ,%ref-ret (,%ac0)))]))] [else ($oops who "unrecognized hand-coded name ~s" sym)])])) (define-pass np-expose-allocation-pointer : L13.5 (ir) -> L14 () diff --git a/s/library.ss b/s/library.ss index d7e65b4dbb..9ec8805292 100644 --- a/s/library.ss +++ b/s/library.ss @@ -131,6 +131,8 @@ (define-hand-coded-library-entry $wrapper-apply) (define-hand-coded-library-entry wrapper-apply) (define-hand-coded-library-entry arity-wrapper-apply) +(define-hand-coded-library-entry popcount-slow) ; before fxpopcount use +(define-hand-coded-library-entry cpu-features) ; before fxpopcount use (define $instantiate-code-object ($hand-coded '$instantiate-code-object)) diff --git a/s/mkheader.ss b/s/mkheader.ss index 02d9260351..f113ac5e59 100644 --- a/s/mkheader.ss +++ b/s/mkheader.ss @@ -1027,6 +1027,10 @@ (libspec-index (lookup-libspec nonprocedure-code))) (def "library_dounderflow" (libspec-index (lookup-libspec dounderflow))) + (def "library_popcount_slow" + (libspec-index (lookup-libspec popcount-slow))) + (def "library_cpu_features" + (libspec-index (lookup-libspec cpu-features))) ))) ) diff --git a/s/np-languages.ss b/s/np-languages.ss index d92f5d4564..c9e762fcd8 100644 --- a/s/np-languages.ss +++ b/s/np-languages.ss @@ -561,6 +561,7 @@ (declare-primitive -/ovfl value #f) (declare-primitive -/eq value #f) (declare-primitive asmlibcall value #f) + (declare-primitive cpuid value #t) ; x86_64 only, actually side-effects ebx/ecx/edx (declare-primitive fstpl value #f) ; x86 only (declare-primitive fstps value #f) ; x86 only 
(declare-primitive get-double value #t) ; x86_64 diff --git a/s/ta6fb.def b/s/ta6fb.def index 68fe863bdf..f92d222abe 100644 --- a/s/ta6fb.def +++ b/s/ta6fb.def @@ -45,7 +45,7 @@ (define-constant unaligned-floats #t) (define-constant unaligned-integers #t) (define-constant integer-divide-instruction #t) -(define-constant popcount-instruction #f) +(define-constant popcount-instruction #t) (define-constant software-floating-point #f) (define-constant segment-table-levels 3) (features iconv expeditor pthreads) diff --git a/s/ta6le.def b/s/ta6le.def index 985b626bb2..af06b6a07f 100644 --- a/s/ta6le.def +++ b/s/ta6le.def @@ -45,7 +45,7 @@ (define-constant unaligned-floats #t) (define-constant unaligned-integers #t) (define-constant integer-divide-instruction #t) -(define-constant popcount-instruction #f) +(define-constant popcount-instruction #t) (define-constant software-floating-point #f) (define-constant segment-table-levels 3) (features iconv expeditor pthreads) diff --git a/s/ta6nb.def b/s/ta6nb.def index 59f1164c81..9917918934 100644 --- a/s/ta6nb.def +++ b/s/ta6nb.def @@ -45,7 +45,7 @@ (define-constant unaligned-floats #t) (define-constant unaligned-integers #t) (define-constant integer-divide-instruction #t) -(define-constant popcount-instruction #f) +(define-constant popcount-instruction #t) (define-constant software-floating-point #f) (define-constant segment-table-levels 3) (features iconv expeditor pthreads) diff --git a/s/ta6nt.def b/s/ta6nt.def index 75a948e67a..567f30463b 100644 --- a/s/ta6nt.def +++ b/s/ta6nt.def @@ -45,7 +45,7 @@ (define-constant unaligned-floats #t) (define-constant unaligned-integers #t) (define-constant integer-divide-instruction #t) -(define-constant popcount-instruction #f) +(define-constant popcount-instruction #t) (define-constant software-floating-point #f) (define-constant segment-table-levels 3) (features iconv expeditor pthreads windows) diff --git a/s/ta6ob.def b/s/ta6ob.def index 8195e342f9..3fe1a6c169 100644 --- 
a/s/ta6ob.def +++ b/s/ta6ob.def @@ -45,7 +45,7 @@ (define-constant unaligned-floats #t) (define-constant unaligned-integers #t) (define-constant integer-divide-instruction #t) -(define-constant popcount-instruction #f) +(define-constant popcount-instruction #t) (define-constant software-floating-point #f) (define-constant segment-table-levels 3) (features iconv expeditor pthreads) diff --git a/s/ta6osx.def b/s/ta6osx.def index 357c102c73..b0fba2c935 100644 --- a/s/ta6osx.def +++ b/s/ta6osx.def @@ -45,7 +45,7 @@ (define-constant unaligned-floats #t) (define-constant unaligned-integers #t) (define-constant integer-divide-instruction #t) -(define-constant popcount-instruction #f) +(define-constant popcount-instruction #t) (define-constant software-floating-point #f) (define-constant segment-table-levels 3) (features iconv expeditor pthreads) diff --git a/s/ta6s2.def b/s/ta6s2.def index df3757c67d..326db66e1e 100644 --- a/s/ta6s2.def +++ b/s/ta6s2.def @@ -45,7 +45,7 @@ (define-constant unaligned-floats #t) (define-constant unaligned-integers #t) (define-constant integer-divide-instruction #t) -(define-constant popcount-instruction #f) +(define-constant popcount-instruction #t) (define-constant software-floating-point #f) (define-constant segment-table-levels 3) (features iconv expeditor pthreads) diff --git a/s/x86_64.ss b/s/x86_64.ss index ca36278060..1a12b14a58 100644 --- a/s/x86_64.ss +++ b/s/x86_64.ss @@ -700,7 +700,18 @@ `(set! ,(make-live-info) ,z ,t)))]) (define-instruction value popcount - [(op (z ur) (x ur mem)) `(set! ,(make-live-info) ,z (asm ,info ,asm-popcount ,x))]) + [(op (z ur) (x ur mem)) + ;; Direct POPCNT instruction variant, works with corresponding `popcount-op`: + #; + `(set! ,(make-live-info) ,z (asm ,info ,(asm-popcount (info-inline? info)) ,x)) + + ;; Link-editable variant, for corresponding `popcount-op`: + (let ([urdi (make-precolored-unspillable 'urdi %rdi)] + [urax (make-precolored-unspillable 'urax %rax)]) + (seq + `(set! 
,(make-live-info) ,urdi ,x) + `(set! ,(make-live-info) ,urax (asm ,info ,(asm-popcount (info-inline? info)) ,urdi)) + `(set! ,(make-live-info) ,z ,urax)))]) (define-instruction value move [(op (z mem) (x ur imm32)) @@ -969,6 +980,16 @@ (safe-assert (and (info-kill*? info) (memq %rdx (info-kill*-kill* info)))) `(set! ,(make-live-info) ,z (asm ,info ,asm-read-time-stamp-counter))]) + ;; currently returns ECX from CPUID function 1 + (define-instruction value cpuid + [(op (z ur)) + (safe-assert (eq? z %rax)) + (safe-assert (and (info-kill*? info) + (memq %rbx (info-kill*-kill* info)) + (memq %rcx (info-kill*-kill* info)) + (memq %rdx (info-kill*-kill* info)))) + `(set! ,(make-live-info) ,z (asm ,info ,asm-cpuid))]) + ; NB: shouldn't need to list (info-kill*-live*-live* info) ... here, since we've already ; NB: computed spillable/register live sets (define-instruction effect (c-call) @@ -1007,6 +1028,7 @@ asm-enter asm-foreign-call asm-foreign-callable asm-inc-profile-counter asm-inc-cc-counter asm-read-time-stamp-counter asm-read-performance-monitoring-counter + asm-cpuid ; threaded version specific asm-get-tc asm-activate-thread asm-deactivate-thread asm-unactivate-thread ; machine dependent exports @@ -1143,6 +1165,8 @@ (define-op popcount (*) popcount-op) + (define-op cpuid two-byte-op #b1111 #b10100010) + ; also do inc-reg dec-reg ; the following are forms of the call instruction and push the return address @@ -1582,17 +1606,68 @@ [3 op-code] [0 (fxlogand (ax-ea-reg-code reg) 7)])))))) + ;; Direct POPCNT instruction variant: + #; (define popcount-op - (lambda (op size dest-reg src-ea code*) - (begin - (emit-code (op dest-reg src-ea code*) - (build byte #xF3) - (ax-ea-rex (if (eq? size 'quad) 1 0) src-ea dest-reg size) - (build byte #x0F) - (build byte #xB8) - (ax-ea-modrm-reg src-ea dest-reg) - (ax-ea-sib src-ea) - (ax-ea-addr-disp src-ea))))) + (lambda (op size dest-reg src-ea inline? 
code*) + (let ([code* (emit-code (op src-ea dest-reg code*) + (build byte #xF3) + (ax-ea-rex (if (eq? size 'quad) 1 0) src-ea dest-reg size) + (build byte #x0F) + (build byte #xB8) + (ax-ea-modrm-reg src-ea dest-reg) + (ax-ea-sib src-ea) + (ax-ea-addr-disp src-ea))]) + (if (not (and (ax-register? src-ea) + (fx= (ax-ea-reg-code src-ea) + (ax-ea-reg-code dest-reg)))) + (emit xor dest-reg dest-reg code*) ; avoid false dependency + code*)))) + + ;; Link-editable variant: + (define popcount-op + (let ([target `(x86_64-popcount ,(constant code-data-disp) (library ,(lookup-libspec popcount-slow)))]) ; relocation entry naming the popcount-slow library entry + (lambda (op size dest-rax src-rdi inline? code*) ; operands must be %rax/%rdi (asserted below); inline? selects the generic body over the relocatable call + (safe-assert (and (ax-register? dest-rax) (ax-register? src-rdi))) + (record-case dest-rax + [(reg) dest-rax + (record-case src-rdi + [(reg) src-rdi + (safe-assert (and (eq? dest-rax %rax) (eq? src-rdi %rdi))) + (cond + [(not inline?) + ;; Set up a call to `popcount-slow`, which the linker + ;; can replace with a POPCNT instruction: + (asm-helper-call code* target dest-rax)] + [else + ;; Used for the body of `popcount-slow`. + ;; This is the sequence generated by LLVM's + ;; __builtin_popcountl() intrinsic, but with pushes and pops + ;; to save used registers other than the result register %rax. 
+ (emit-literal-code (op dest-rax src-rdi code*) + 57 ; push %rdi + 51 ; push %rcx + 48 89 f8 ; movq %rdi, %rax + 48 d1 e8 ; shrq %rax + 48 b9 55 55 55 55 55 55 55 55 ; movabsq $6148914691236517205, %rcx + 48 21 c1 ; andq %rax, %rcx + 48 29 cf ; subq %rcx, %rdi + 48 b8 33 33 33 33 33 33 33 33 ; movabsq $3689348814741910323, %rax + 48 89 f9 ; movq %rdi, %rcx + 48 21 c1 ; andq %rax, %rcx + 48 c1 ef 02 ; shrq $2, %rdi + 48 21 c7 ; andq %rax, %rdi + 48 01 cf ; addq %rcx, %rdi + 48 89 f8 ; movq %rdi, %rax + 48 c1 e8 04 ; shrq $4, %rax + 48 8d 04 38 ; leaq (%rax,%rdi), %rax + 48 b9 0f 0f 0f 0f 0f 0f 0f 0f ; movabsq $1085102592571150095, %rcx + 48 21 c1 ; andq %rax, %rcx + 48 b8 01 01 01 01 01 01 01 01 ; movabsq $72340172838076673, %rax + 48 0f af c1 ; imulq %rcx, %rax + 48 c1 e8 38 ; shrq $56, %rax + 59 ; pop %rcx + 5f)])])])))) ; pop %rdi (define-syntax emit-code (lambda (x) @@ -1801,7 +1876,7 @@ (define asm-size (lambda (x) (case (car x) - [(asm x86_64-jump x86_64-call) 0] + [(asm x86_64-jump x86_64-call x86_64-popcount) 0] [(byte) 1] [(word) 2] [(long) 4] @@ -1984,9 +2059,10 @@ (emit mulsi src1 src0 dest code*)))) (define asm-popcount - (lambda (code* dest src) - (Trivit (src) - (emit popcount (cons 'reg dest) src code*)))) + (lambda (inline?) + (lambda (code* dest src) + (Trivit (dest src) + (emit popcount dest src inline? code*))))) (define-who asm-addop (lambda (op) @@ -2018,6 +2094,15 @@ (safe-assert (eq? dest %rax)) (emit rdtsc code*))) + (define asm-cpuid + (lambda (code* dest) + ; rbx/rcx/rdx is an implied dest and included in info's kill list + (safe-assert (eq? dest %rax)) + (emit movi '(imm 1) (cons 'reg %rax) + (emit cpuid + (emit mov (cons 'reg %rcx) (cons 'reg %rax) + code*))))) + (define asm-inc-profile-counter (lambda (code* dest src) (Trivit (dest src)