use POPCNT instruction when available on x86_64
On x86_64, a POPCNT instruction is usually available, and it can speed up `fxpopcount` operations by a factor of 2-3. Since POPCNT isn't always available, code using `fxpopcount` is compiled to a call to a generic implementation. The linker substitutes a POPCNT instruction when it determines at runtime that POPCNT is available.

Some measurements on a 2018 MacBook Pro (2.7 GHz Core i7) using the program below:

  popcnt = this implementation, POPCNT discovered
  nocnt  = this implementation, POPCNT considered unavailable
  optcnt = compile to use POPCNT directly (no linker work)
  cpcnt  = compile to inlined generic (no linker work, no POPCNT)

Since the generic implementation is always a 64-bit popcount, it's not as good as an inlined version for `fxpopcount32`, but otherwise the link-edit approach to POPCNT works well:

            fxpopcount   fxpopcount32
  popcnt:     0.098s
  nocnt:      0.284s
  optcnt:     0.109s  [slower means noise?]
  cpcnt:      0.279s       0.188s

  (optimize-level 3)
  (time
   (let loop ([v #f] [i 100000000])
     (if (fx= i 0)
         v
         (loop (fxpopcount i) (fx- i 1)))))

original commit: 5f090e509f8fe5edc777ed9f0463b20c2e571336
This commit is contained in:
parent
f88714210b
commit
540c58bbe8
|
@ -118,6 +118,9 @@ extern void S_set_code_obj PROTO((char *who, IFASLCODE typ, ptr p, iptr n,
|
|||
extern ptr S_get_code_obj PROTO((IFASLCODE typ, ptr p, iptr n, iptr o));
|
||||
extern int S_fasl_stream_read PROTO((void *stream, octet *dest, iptr n));
|
||||
extern int S_fasl_intern_rtd(ptr *x);
|
||||
#ifdef X86_64
|
||||
extern void x86_64_set_popcount_present PROTO((ptr code));
|
||||
#endif
|
||||
|
||||
/* vfasl.c */
|
||||
extern ptr S_to_vfasl PROTO((ptr v));
|
||||
|
|
53
c/fasl.c
53
c/fasl.c
|
@ -241,6 +241,7 @@ static uptr ppc32_get_jump PROTO((void *address));
|
|||
#ifdef X86_64
|
||||
static void x86_64_set_jump PROTO((void *address, uptr item, IBOOL callp));
|
||||
static uptr x86_64_get_jump PROTO((void *address));
|
||||
static void x86_64_set_popcount PROTO((void *address, uptr item));
|
||||
#endif /* X86_64 */
|
||||
#ifdef SPARC64
|
||||
static INT extract_reg_from_sethi PROTO((void *address));
|
||||
|
@ -1293,6 +1294,9 @@ void S_set_code_obj(who, typ, p, n, x, o) char *who; IFASLCODE typ; iptr n, o; p
|
|||
case reloc_x86_64_call:
|
||||
x86_64_set_jump(address, item, 1);
|
||||
break;
|
||||
case reloc_x86_64_popcount:
|
||||
x86_64_set_popcount(address, item);
|
||||
break;
|
||||
#endif /* X86_64 */
|
||||
#ifdef SPARC64
|
||||
case reloc_sparc64abs:
|
||||
|
@ -1364,6 +1368,9 @@ ptr S_get_code_obj(typ, p, n, o) IFASLCODE typ; iptr n, o; ptr p; {
|
|||
case reloc_x86_64_call:
|
||||
item = x86_64_get_jump(address);
|
||||
break;
|
||||
case reloc_x86_64_popcount:
|
||||
item = (uptr)Svector_ref(S_G.library_entry_vector, library_popcount_slow) + o;
|
||||
break;
|
||||
#endif /* X86_64 */
|
||||
#ifdef SPARC64
|
||||
case reloc_sparc64abs:
|
||||
|
@ -1509,18 +1516,20 @@ static uptr ppc32_get_jump(void *address) {
|
|||
#endif /* PPC32 */
|
||||
|
||||
#ifdef X86_64
|
||||
|
||||
static void x86_64_set_jump(void *address, uptr item, IBOOL callp) {
|
||||
I64 disp = (I64)item - ((I64)address + 5); /* 5 = size of call instruction */
|
||||
if ((I32)disp == disp) {
|
||||
*(octet *)address = callp ? 0xE8 : 0xE9; /* call or jmp disp32 opcode */
|
||||
*(I32 *)((uptr)address + 1) = (I32)disp;
|
||||
*((octet *)address + 5) = 0x90; /* nop */
|
||||
*((octet *)address + 6) = 0x90; /* nop */
|
||||
*((octet *)address + 7) = 0x90; /* nop */
|
||||
*((octet *)address + 8) = 0x90; /* nop */
|
||||
*((octet *)address + 9) = 0x90; /* nop */
|
||||
*((octet *)address + 10) = 0x90; /* nop */
|
||||
*((octet *)address + 11) = 0x90; /* nop */
|
||||
/* 7-byte nop: */
|
||||
*((octet *)address + 5) = 0x0F;
|
||||
*((octet *)address + 6) = 0x1F;
|
||||
*((octet *)address + 7) = 0x80;
|
||||
*((octet *)address + 8) = 0x00;
|
||||
*((octet *)address + 9) = 0x00;
|
||||
*((octet *)address + 10) = 0x00;
|
||||
*((octet *)address + 11) = 0x00;
|
||||
} else {
|
||||
*(octet *)address = 0x48; /* REX w/REX.w set */
|
||||
*((octet *)address + 1)= 0xB8; /* MOV imm64 to RAX */
|
||||
|
@ -1538,6 +1547,36 @@ static uptr x86_64_get_jump(void *address) {
|
|||
/* must be short form: call/jmp */
|
||||
return ((uptr)address + 5) + *(I32 *)((uptr)address + 1);
|
||||
}
|
||||
|
||||
static int popcount_present;
|
||||
|
||||
/* Link-edit the 12-byte `reloc_x86_64_popcount` site at `address`.
 *
 * If POPCNT has been detected (`popcount_present` is set), the site is
 * overwritten with an inline sequence that counts the bits of RDI into RAX.
 * Otherwise the site becomes a call to `item`, installed through
 * x86_64_set_jump() with callp = 1.
 */
static void x86_64_set_popcount(void *address, uptr item) {
  static const octet popcnt_site[12] = {
    0x48, 0x31, 0xc0,             /* REX; XOR RAX, RAX - avoid false dependency */
    0xF3, 0x48, 0x0F, 0xB8, 0xC7, /* REX; POPCNT: RDI -> RAX */
    0x0F, 0x1F, 0x40, 0x00        /* 4-byte nop */
  };
  octet *site = (octet *)address;
  int i;

  if (popcount_present) {
    for (i = 0; i < 12; i++)
      site[i] = popcnt_site[i];
  } else {
    x86_64_set_jump(address, item, 1);
  }
}
|
||||
|
||||
/* Detect POPCNT support and remember it in `popcount_present`.
 *
 * `code` is a code object; the machine code found at offset `code_data_disp`
 * inside it is called as a no-argument C function.  That function,
 * cpu_features, returns ECX after CPUID for function 1, and bit 23 of that
 * value is the one tested here.  The flag is only ever set, never cleared.
 */
void x86_64_set_popcount_present(ptr code) {
  typedef int (*cpu_features_fn)();
  cpu_features_fn cpu_features = (cpu_features_fn)((uptr)code + code_data_disp);
  int ecx = cpu_features();

  if ((ecx & (1 << 23)) != 0)
    popcount_present = 1;
}
|
||||
|
||||
#endif /* X86_64 */
|
||||
|
||||
#ifdef SPARC64
|
||||
|
|
4
c/prim.c
4
c/prim.c
|
@ -42,6 +42,10 @@ static void install_library_entry(n, x) ptr n, x; {
|
|||
S_G.nonprocedure_code = x;
|
||||
S_retrofit_nonprocedure_code();
|
||||
}
|
||||
#ifdef X86_64
|
||||
if (n == FIX(library_cpu_features))
|
||||
x86_64_set_popcount_present(x);
|
||||
#endif
|
||||
}
|
||||
|
||||
ptr S_lookup_library_entry(n, errorp) iptr n; IBOOL errorp; {
|
||||
|
|
|
@ -62,7 +62,7 @@ InstallLZ4Target=
|
|||
# no changes should be needed below this point #
|
||||
###############################################################################
|
||||
|
||||
Version=csv9.5.3.10
|
||||
Version=csv9.5.3.11
|
||||
Include=boot/$m
|
||||
PetiteBoot=boot/$m/petite.boot
|
||||
SchemeBoot=boot/$m/scheme.boot
|
||||
|
|
|
@ -45,7 +45,7 @@
|
|||
(define-constant unaligned-floats #t)
|
||||
(define-constant unaligned-integers #t)
|
||||
(define-constant integer-divide-instruction #t)
|
||||
(define-constant popcount-instruction #f)
|
||||
(define-constant popcount-instruction #t)
|
||||
(define-constant software-floating-point #f)
|
||||
(define-constant segment-table-levels 3)
|
||||
(features iconv expeditor)
|
||||
|
|
|
@ -45,7 +45,7 @@
|
|||
(define-constant unaligned-floats #t)
|
||||
(define-constant unaligned-integers #t)
|
||||
(define-constant integer-divide-instruction #t)
|
||||
(define-constant popcount-instruction #f)
|
||||
(define-constant popcount-instruction #t)
|
||||
(define-constant software-floating-point #f)
|
||||
(define-constant segment-table-levels 3)
|
||||
(features iconv expeditor)
|
||||
|
|
|
@ -45,7 +45,7 @@
|
|||
(define-constant unaligned-floats #t)
|
||||
(define-constant unaligned-integers #t)
|
||||
(define-constant integer-divide-instruction #t)
|
||||
(define-constant popcount-instruction #f)
|
||||
(define-constant popcount-instruction #t)
|
||||
(define-constant software-floating-point #f)
|
||||
(define-constant segment-table-levels 3)
|
||||
(features iconv expeditor)
|
||||
|
|
|
@ -45,7 +45,7 @@
|
|||
(define-constant unaligned-floats #t)
|
||||
(define-constant unaligned-integers #t)
|
||||
(define-constant integer-divide-instruction #t)
|
||||
(define-constant popcount-instruction #f)
|
||||
(define-constant popcount-instruction #t)
|
||||
(define-constant software-floating-point #f)
|
||||
(define-constant segment-table-levels 3)
|
||||
(features iconv expeditor windows)
|
||||
|
|
|
@ -45,7 +45,7 @@
|
|||
(define-constant unaligned-floats #t)
|
||||
(define-constant unaligned-integers #t)
|
||||
(define-constant integer-divide-instruction #t)
|
||||
(define-constant popcount-instruction #f)
|
||||
(define-constant popcount-instruction #t)
|
||||
(define-constant software-floating-point #f)
|
||||
(define-constant segment-table-levels 3)
|
||||
(features iconv expeditor)
|
||||
|
|
|
@ -45,7 +45,7 @@
|
|||
(define-constant unaligned-floats #t)
|
||||
(define-constant unaligned-integers #t)
|
||||
(define-constant integer-divide-instruction #t)
|
||||
(define-constant popcount-instruction #f)
|
||||
(define-constant popcount-instruction #t)
|
||||
(define-constant software-floating-point #f)
|
||||
(define-constant segment-table-levels 3)
|
||||
(features iconv expeditor)
|
||||
|
|
|
@ -45,7 +45,7 @@
|
|||
(define-constant unaligned-floats #t)
|
||||
(define-constant unaligned-integers #t)
|
||||
(define-constant integer-divide-instruction #t)
|
||||
(define-constant popcount-instruction #f)
|
||||
(define-constant popcount-instruction #t)
|
||||
(define-constant software-floating-point #f)
|
||||
(define-constant segment-table-levels 3)
|
||||
(features iconv expeditor)
|
||||
|
|
|
@ -328,7 +328,7 @@
|
|||
[(_ foo e1 e2) e1] ...
|
||||
[(_ bar e1 e2) e2]))))])))
|
||||
|
||||
(define-constant scheme-version #x0905030A)
|
||||
(define-constant scheme-version #x0905030B)
|
||||
|
||||
(define-syntax define-machine-types
|
||||
(lambda (x)
|
||||
|
@ -500,7 +500,7 @@
|
|||
(sparc reloc-sparcabs reloc-sparcrel)
|
||||
(sparc64 reloc-sparc64abs reloc-sparc64rel)
|
||||
(ppc reloc-ppccall reloc-ppcload)
|
||||
(x86_64 reloc-x86_64-call reloc-x86_64-jump)
|
||||
(x86_64 reloc-x86_64-call reloc-x86_64-jump reloc-x86_64-popcount)
|
||||
(arm32 reloc-arm32-abs reloc-arm32-call reloc-arm32-jump)
|
||||
(ppc32 reloc-ppc32-abs reloc-ppc32-call reloc-ppc32-jump))
|
||||
|
||||
|
@ -2716,6 +2716,8 @@
|
|||
($wrapper-apply #f 0 #f #f)
|
||||
(wrapper-apply #f 0 #f #f)
|
||||
(arity-wrapper-apply #f 0 #f #f)
|
||||
(popcount-slow #f 0 #f #t)
|
||||
(cpu-features #f 0 #f #t)
|
||||
))
|
||||
|
||||
(let ()
|
||||
|
|
10
s/compile.ss
10
s/compile.ss
|
@ -204,6 +204,10 @@
|
|||
(let ([a1 (fx- a 12)] [x* (cons (mkcode x) x*)])
|
||||
(let ([r ($reloc (constant reloc-x86_64-call) n (fx- a1 ra))])
|
||||
(mkc0 (cdr c*) a (cons r r*) a1 x*)))]
|
||||
[(x86_64-popcount) (n x)
|
||||
(let ([a1 (fx- a 12)] [x* (cons (mkcode x) x*)])
|
||||
(let ([r ($reloc (constant reloc-x86_64-popcount) n (fx- a1 ra))])
|
||||
(mkc0 (cdr c*) a (cons r r*) a1 x*)))]
|
||||
[else (c-assembler-output-error c)])]
|
||||
[else (c-assembler-output-error c)])]))))
|
||||
p))]
|
||||
|
@ -258,7 +262,7 @@
|
|||
[else (void)])]
|
||||
[(x86_64)
|
||||
(record-case x
|
||||
[(x86_64-jump x86_64-call) (n x) (build x d)]
|
||||
[(x86_64-jump x86_64-call x86_64-popcount) (n x) (build x d)]
|
||||
[else (void)])]
|
||||
[(arm32)
|
||||
(record-case x
|
||||
|
@ -394,6 +398,10 @@
|
|||
(let ([a1 (fx- a 12)]) ; 10-byte moviq followed by 2-byte call
|
||||
(let ([r ($reloc (constant reloc-x86_64-call) n (fx- a1 ra))])
|
||||
(prf0 (cdr c*) a (cons r r*) a1 (cons x x*))))]
|
||||
[(x86_64-popcount) (n x)
|
||||
(let ([a1 (fx- a 12)]) ; like a call, for worst case
|
||||
(let ([r ($reloc (constant reloc-x86_64-popcount) n (fx- a1 ra))])
|
||||
(prf0 (cdr c*) a (cons r r*) a1 (cons x x*))))]
|
||||
[else (c-assembler-output-error c)])]
|
||||
[else (c-assembler-output-error c)])]))))]))
|
||||
|
||||
|
|
|
@ -995,6 +995,11 @@
|
|||
(sealed #t)
|
||||
(fields offset))
|
||||
|
||||
;; `info-inline` is a sealed, nongenerative subtype of `info` that adds no
;; fields of its own; it serves purely as a marker.  The `popcount-slow`
;; hand-coded entry builds one with `(make-info-inline)`, and the x86_64
;; `popcount` instruction tests `(info-inline? info)` to decide between
;; emitting the generic popcount body inline and emitting a link-editable
;; call to `popcount-slow`.
(define-record-type info-inline (nongenerative)
|
||||
(parent info)
|
||||
(sealed #t)
|
||||
(fields))
|
||||
|
||||
(module ()
|
||||
(record-writer (record-type-descriptor info-load)
|
||||
(lambda (x p wr)
|
||||
|
@ -5723,7 +5728,8 @@
|
|||
(define hand-coded-closure?
|
||||
(lambda (name)
|
||||
(not (memq name '(nuate nonprocedure-code error-invoke invoke
|
||||
$wrapper-apply wrapper-apply arity-wrapper-apply)))))
|
||||
$wrapper-apply wrapper-apply arity-wrapper-apply
|
||||
popcount-slow cpu-features)))))
|
||||
(define-inline 2 $hand-coded
|
||||
[(name)
|
||||
(nanopass-case (L7 Expr) name
|
||||
|
@ -13384,6 +13390,32 @@
|
|||
(in %ac0 %ac1 scheme-args)
|
||||
(out %cp %xp %yp %ts %td extra-regs))
|
||||
(goto ,Lexit))))]
|
||||
[(popcount-slow)
|
||||
`(lambda ,(make-info "popcount-slow" '()) 0 ()
|
||||
,(constant-case architecture
|
||||
[(x86_64)
|
||||
`(seq
|
||||
(set! ,%rax (inline ,(make-info-inline) ,%popcount ,%rdi))
|
||||
(asm-c-return ,null-info ,%rdi ,%rax))]
|
||||
[else
|
||||
;; Generate anything, since this should not get called
|
||||
`(seq
|
||||
(set! ,%ac0 (immediate 0))
|
||||
(jump ,%ref-ret (,%ac0)))]))]
|
||||
[(cpu-features)
|
||||
`(lambda ,(make-info "cpu-features" '()) 0 ()
|
||||
,(constant-case architecture
|
||||
[(x86_64)
|
||||
(%seq
|
||||
(set! ,%rdi ,%rbx) ; %rbx must be preserved
|
||||
(set! ,%rax (inline ,(make-info-kill* (reg-list %rbx %rcx %rdx)) ,%cpuid))
|
||||
(set! ,%rbx ,%rdi)
|
||||
(asm-c-return ,null-info ,%rax ,%rbx))]
|
||||
[else
|
||||
;; Generate anything, since this should not get called
|
||||
`(seq
|
||||
(set! ,%ac0 (immediate 0))
|
||||
(jump ,%ref-ret (,%ac0)))]))]
|
||||
[else ($oops who "unrecognized hand-coded name ~s" sym)])]))
|
||||
|
||||
(define-pass np-expose-allocation-pointer : L13.5 (ir) -> L14 ()
|
||||
|
|
|
@ -131,6 +131,8 @@
|
|||
(define-hand-coded-library-entry $wrapper-apply)
|
||||
(define-hand-coded-library-entry wrapper-apply)
|
||||
(define-hand-coded-library-entry arity-wrapper-apply)
|
||||
(define-hand-coded-library-entry popcount-slow) ; before fxpopcount use
|
||||
(define-hand-coded-library-entry cpu-features) ; before fxpopcount use
|
||||
|
||||
(define $instantiate-code-object ($hand-coded '$instantiate-code-object))
|
||||
|
||||
|
|
|
@ -1027,6 +1027,10 @@
|
|||
(libspec-index (lookup-libspec nonprocedure-code)))
|
||||
(def "library_dounderflow"
|
||||
(libspec-index (lookup-libspec dounderflow)))
|
||||
(def "library_popcount_slow"
|
||||
(libspec-index (lookup-libspec popcount-slow)))
|
||||
(def "library_cpu_features"
|
||||
(libspec-index (lookup-libspec cpu-features)))
|
||||
|
||||
)))
|
||||
)
|
||||
|
|
|
@ -561,6 +561,7 @@
|
|||
(declare-primitive -/ovfl value #f)
|
||||
(declare-primitive -/eq value #f)
|
||||
(declare-primitive asmlibcall value #f)
|
||||
(declare-primitive cpuid value #t) ; x86_64 only, actually side-effects ebx/ecx/edx
|
||||
(declare-primitive fstpl value #f) ; x86 only
|
||||
(declare-primitive fstps value #f) ; x86 only
|
||||
(declare-primitive get-double value #t) ; x86_64
|
||||
|
|
|
@ -45,7 +45,7 @@
|
|||
(define-constant unaligned-floats #t)
|
||||
(define-constant unaligned-integers #t)
|
||||
(define-constant integer-divide-instruction #t)
|
||||
(define-constant popcount-instruction #f)
|
||||
(define-constant popcount-instruction #t)
|
||||
(define-constant software-floating-point #f)
|
||||
(define-constant segment-table-levels 3)
|
||||
(features iconv expeditor pthreads)
|
||||
|
|
|
@ -45,7 +45,7 @@
|
|||
(define-constant unaligned-floats #t)
|
||||
(define-constant unaligned-integers #t)
|
||||
(define-constant integer-divide-instruction #t)
|
||||
(define-constant popcount-instruction #f)
|
||||
(define-constant popcount-instruction #t)
|
||||
(define-constant software-floating-point #f)
|
||||
(define-constant segment-table-levels 3)
|
||||
(features iconv expeditor pthreads)
|
||||
|
|
|
@ -45,7 +45,7 @@
|
|||
(define-constant unaligned-floats #t)
|
||||
(define-constant unaligned-integers #t)
|
||||
(define-constant integer-divide-instruction #t)
|
||||
(define-constant popcount-instruction #f)
|
||||
(define-constant popcount-instruction #t)
|
||||
(define-constant software-floating-point #f)
|
||||
(define-constant segment-table-levels 3)
|
||||
(features iconv expeditor pthreads)
|
||||
|
|
|
@ -45,7 +45,7 @@
|
|||
(define-constant unaligned-floats #t)
|
||||
(define-constant unaligned-integers #t)
|
||||
(define-constant integer-divide-instruction #t)
|
||||
(define-constant popcount-instruction #f)
|
||||
(define-constant popcount-instruction #t)
|
||||
(define-constant software-floating-point #f)
|
||||
(define-constant segment-table-levels 3)
|
||||
(features iconv expeditor pthreads windows)
|
||||
|
|
|
@ -45,7 +45,7 @@
|
|||
(define-constant unaligned-floats #t)
|
||||
(define-constant unaligned-integers #t)
|
||||
(define-constant integer-divide-instruction #t)
|
||||
(define-constant popcount-instruction #f)
|
||||
(define-constant popcount-instruction #t)
|
||||
(define-constant software-floating-point #f)
|
||||
(define-constant segment-table-levels 3)
|
||||
(features iconv expeditor pthreads)
|
||||
|
|
|
@ -45,7 +45,7 @@
|
|||
(define-constant unaligned-floats #t)
|
||||
(define-constant unaligned-integers #t)
|
||||
(define-constant integer-divide-instruction #t)
|
||||
(define-constant popcount-instruction #f)
|
||||
(define-constant popcount-instruction #t)
|
||||
(define-constant software-floating-point #f)
|
||||
(define-constant segment-table-levels 3)
|
||||
(features iconv expeditor pthreads)
|
||||
|
|
|
@ -45,7 +45,7 @@
|
|||
(define-constant unaligned-floats #t)
|
||||
(define-constant unaligned-integers #t)
|
||||
(define-constant integer-divide-instruction #t)
|
||||
(define-constant popcount-instruction #f)
|
||||
(define-constant popcount-instruction #t)
|
||||
(define-constant software-floating-point #f)
|
||||
(define-constant segment-table-levels 3)
|
||||
(features iconv expeditor pthreads)
|
||||
|
|
115
s/x86_64.ss
115
s/x86_64.ss
|
@ -700,7 +700,18 @@
|
|||
`(set! ,(make-live-info) ,z ,t)))])
|
||||
|
||||
(define-instruction value popcount
|
||||
[(op (z ur) (x ur mem)) `(set! ,(make-live-info) ,z (asm ,info ,asm-popcount ,x))])
|
||||
[(op (z ur) (x ur mem))
|
||||
;; Direct POPCNT instruction variant, works with corresponding `popcount-op`:
|
||||
#;
|
||||
`(set! ,(make-live-info) ,z (asm ,info ,(asm-popcount (info-inline? info)) ,x))
|
||||
|
||||
;; Link-editable variant, for corresponding `popcount-op`:
|
||||
(let ([urdi (make-precolored-unspillable 'urdi %rdi)]
|
||||
[urax (make-precolored-unspillable 'urax %rax)])
|
||||
(seq
|
||||
`(set! ,(make-live-info) ,urdi ,x)
|
||||
`(set! ,(make-live-info) ,urax (asm ,info ,(asm-popcount (info-inline? info)) ,urdi))
|
||||
`(set! ,(make-live-info) ,z ,urax)))])
|
||||
|
||||
(define-instruction value move
|
||||
[(op (z mem) (x ur imm32))
|
||||
|
@ -969,6 +980,16 @@
|
|||
(safe-assert (and (info-kill*? info) (memq %rdx (info-kill*-kill* info))))
|
||||
`(set! ,(make-live-info) ,z (asm ,info ,asm-read-time-stamp-counter))])
|
||||
|
||||
;; currently returns ECX from CPUID function 1
;; The result operand `z` must be %rax, and `info` must be an `info-kill*`
;; whose kill list contains %rbx, %rcx, and %rdx; both requirements are
;; checked with `safe-assert`.  The instruction itself is emitted by
;; `asm-cpuid`.
|
||||
(define-instruction value cpuid
|
||||
[(op (z ur))
|
||||
(safe-assert (eq? z %rax))
|
||||
(safe-assert (and (info-kill*? info)
|
||||
(memq %rbx (info-kill*-kill* info))
|
||||
(memq %rcx (info-kill*-kill* info))
|
||||
(memq %rdx (info-kill*-kill* info))))
|
||||
`(set! ,(make-live-info) ,z (asm ,info ,asm-cpuid))])
|
||||
|
||||
; NB: shouldn't need to list (info-kill*-live*-live* info) ... here, since we've already
|
||||
; NB: computed spillable/register live sets
|
||||
(define-instruction effect (c-call)
|
||||
|
@ -1007,6 +1028,7 @@
|
|||
asm-enter asm-foreign-call asm-foreign-callable
|
||||
asm-inc-profile-counter
|
||||
asm-inc-cc-counter asm-read-time-stamp-counter asm-read-performance-monitoring-counter
|
||||
asm-cpuid
|
||||
; threaded version specific
|
||||
asm-get-tc asm-activate-thread asm-deactivate-thread asm-unactivate-thread
|
||||
; machine dependent exports
|
||||
|
@ -1143,6 +1165,8 @@
|
|||
|
||||
(define-op popcount (*) popcount-op)
|
||||
|
||||
(define-op cpuid two-byte-op #b1111 #b10100010)
|
||||
|
||||
; also do inc-reg dec-reg
|
||||
|
||||
; the following are forms of the call instruction and push the return address
|
||||
|
@ -1582,17 +1606,68 @@
|
|||
[3 op-code]
|
||||
[0 (fxlogand (ax-ea-reg-code reg) 7)]))))))
|
||||
|
||||
;; Direct POPCNT instruction variant:
|
||||
#;
|
||||
(define popcount-op
|
||||
(lambda (op size dest-reg src-ea code*)
|
||||
(begin
|
||||
(emit-code (op dest-reg src-ea code*)
|
||||
(build byte #xF3)
|
||||
(ax-ea-rex (if (eq? size 'quad) 1 0) src-ea dest-reg size)
|
||||
(build byte #x0F)
|
||||
(build byte #xB8)
|
||||
(ax-ea-modrm-reg src-ea dest-reg)
|
||||
(ax-ea-sib src-ea)
|
||||
(ax-ea-addr-disp src-ea)))))
|
||||
(lambda (op size dest-reg src-ea inline? code*)
|
||||
(let ([code* (emit-code (op src-ea dest-reg code*)
|
||||
(build byte #xF3)
|
||||
(ax-ea-rex (if (eq? size 'quad) 1 0) src-ea dest-reg size)
|
||||
(build byte #x0F)
|
||||
(build byte #xB8)
|
||||
(ax-ea-modrm-reg src-ea dest-reg)
|
||||
(ax-ea-sib src-ea)
|
||||
(ax-ea-addr-disp src-ea))])
|
||||
(if (not (and (ax-register? src-ea)
|
||||
(fx= (ax-ea-reg-code src-ea)
|
||||
(ax-ea-reg-code dest-reg))))
|
||||
(emit xor dest-reg dest-reg code*) ; avoid false dependency
|
||||
code*))))
|
||||
|
||||
;; Link-editable variant:
|
||||
;; Emits the code for a `popcount` whose destination is %rax and whose
;; source is %rdi (both checked with `safe-assert`).
;;  - `inline?` false: emits a helper call whose target is tagged
;;    `x86_64-popcount` and names the `popcount-slow` library entry, so that
;;    the linker can replace the call with a POPCNT instruction.
;;  - `inline?` true: emits the generic bit-counting sequence itself; this is
;;    used for the body of `popcount-slow`.
;; `code*` is the code that follows; `op` and `size` are passed by the
;; `define-op` machinery.
(define popcount-op
  (let ([target `(x86_64-popcount ,(constant code-data-disp) (library ,(lookup-libspec popcount-slow)))])
    (lambda (op size dest-rax src-rdi inline? code*)
      ;; both operands must be registers (fixed: this previously referenced
      ;; the unbound name `src-dir` instead of `src-rdi`)
      (safe-assert (and (ax-register? dest-rax) (ax-register? src-rdi)))
      (record-case dest-rax
        [(reg) dest-rax
         (record-case src-rdi
           [(reg) src-rdi
            (safe-assert (and (eq? dest-rax %rax) (eq? src-rdi %rdi)))
            (cond
              [(not inline?)
               ;; Set up a call to `popcount-slow`, which the linker
               ;; can replace with a POPCNT instruction:
               (asm-helper-call code* target dest-rax)]
              [else
               ;; Used for the body of `popcount-slow`.
               ;; This is the sequence generated by LLVM's
               ;; __builtin_popcountl() intrinsic, but with pushes and pops
               ;; to save used registers other than the result register %rax.
               (emit-literal-code (op dest-rax src-rdi code*)
                 57                            ; push %rdi
                 51                            ; push %rcx
                 48 89 f8                      ; movq %rdi, %rax
                 48 d1 e8                      ; shrq %rax
                 48 b9 55 55 55 55 55 55 55 55 ; movabsq $6148914691236517205, %rcx
                 48 21 c1                      ; andq %rax, %rcx
                 48 29 cf                      ; subq %rcx, %rdi
                 48 b8 33 33 33 33 33 33 33 33 ; movabsq $3689348814741910323, %rax
                 48 89 f9                      ; movq %rdi, %rcx
                 48 21 c1                      ; andq %rax, %rcx
                 48 c1 ef 02                   ; shrq $2, %rdi
                 48 21 c7                      ; andq %rax, %rdi
                 48 01 cf                      ; addq %rcx, %rdi
                 48 89 f8                      ; movq %rdi, %rax
                 48 c1 e8 04                   ; shrq $4, %rax
                 48 8d 04 38                   ; leaq (%rax,%rdi), %rax
                 48 b9 0f 0f 0f 0f 0f 0f 0f 0f ; movabsq $1085102592571150095, %rcx
                 48 21 c1                      ; andq %rax, %rcx
                 48 b8 01 01 01 01 01 01 01 01 ; movabsq $72340172838076673, %rax
                 48 0f af c1                   ; imulq %rcx, %rax
                 48 c1 e8 38                   ; shrq $56, %rax
                 59                            ; pop %rcx
                 5f)])])]))))                  ; pop %rdi
|
||||
|
||||
(define-syntax emit-code
|
||||
(lambda (x)
|
||||
|
@ -1801,7 +1876,7 @@
|
|||
(define asm-size
|
||||
(lambda (x)
|
||||
(case (car x)
|
||||
[(asm x86_64-jump x86_64-call) 0]
|
||||
[(asm x86_64-jump x86_64-call x86_64-popcount) 0]
|
||||
[(byte) 1]
|
||||
[(word) 2]
|
||||
[(long) 4]
|
||||
|
@ -1984,9 +2059,10 @@
|
|||
(emit mulsi src1 src0 dest code*))))
|
||||
|
||||
(define asm-popcount
|
||||
(lambda (code* dest src)
|
||||
(Trivit (src)
|
||||
(emit popcount (cons 'reg dest) src code*))))
|
||||
(lambda (inline?)
|
||||
(lambda (code* dest src)
|
||||
(Trivit (dest src)
|
||||
(emit popcount dest src inline? code*)))))
|
||||
|
||||
(define-who asm-addop
|
||||
(lambda (op)
|
||||
|
@ -2018,6 +2094,15 @@
|
|||
(safe-assert (eq? dest %rax))
|
||||
(emit rdtsc code*)))
|
||||
|
||||
;; Emits, ahead of `code*`: load immediate 1 into %rax, CPUID, then a move
;; between %rcx and %rax (operands passed to `mov` as %rcx, %rax).
;; `dest` must be %rax.
(define asm-cpuid
  (lambda (code* dest)
    ; rbx/rcx/rdx is an implied dest and included in info's kill list
    (safe-assert (eq? dest %rax))
    ; each `emit` places its instruction ahead of the code passed as its last
    ; argument, so the sequence is assembled here from last instruction to first
    (let* ([after-cpuid (emit mov (cons 'reg %rcx) (cons 'reg %rax) code*)]
           [from-cpuid (emit cpuid after-cpuid)])
      (emit movi '(imm 1) (cons 'reg %rax) from-cpuid))))
|
||||
|
||||
(define asm-inc-profile-counter
|
||||
(lambda (code* dest src)
|
||||
(Trivit (dest src)
|
||||
|
|
Loading…
Reference in New Issue
Block a user