use POPCNT instruction when available on x86_64

On x86_64, a POPCNT instruction is usually available, and it can speed
up `fxpopcount` operations by a factor of 2-3.

Since POPCNT isn't always available, code using `fxpopcount` is
compiled to a call to a generic implementation. The linker substitutes
a POPCNT instruction when it determines at runtime that POPCNT is
available.

Some measurements on a 2018 MacBook Pro (2.7 GHz Core i7) using the
program below:

 popcnt = this implementation, POPCNT discovered
 nocnt  = this implementation, POPCNT considered unavailable
 optcnt = compile to use POPCNT directly (no linker work)
 cpcnt  = compile to inlined generic (no linker work, no POPCNT)

Since the generic implementation is always a 64-bit popcount, it's not
as good as an inlined version for `fxpopcount32`, but otherwise the
link-edit approach to POPCNT works well:

            fxpopcount      fxpopcount32
 popcnt:       0.098s
 nocnt:        0.284s
 optcnt:       0.109s  [slower than popcnt; presumably noise?]
 cpcnt:        0.279s         0.188s

 (optimize-level 3)
 (time
  (let loop ([v #f] [i 100000000])
    (if (fx= i 0)
        v
        (loop (fxpopcount i) (fx- i 1)))))

original commit: 5f090e509f8fe5edc777ed9f0463b20c2e571336
This commit is contained in:
Matthew Flatt 2020-01-11 09:28:38 -07:00
parent f88714210b
commit 540c58bbe8
25 changed files with 221 additions and 41 deletions

View File

@ -118,6 +118,9 @@ extern void S_set_code_obj PROTO((char *who, IFASLCODE typ, ptr p, iptr n,
extern ptr S_get_code_obj PROTO((IFASLCODE typ, ptr p, iptr n, iptr o));
extern int S_fasl_stream_read PROTO((void *stream, octet *dest, iptr n));
extern int S_fasl_intern_rtd(ptr *x);
#ifdef X86_64
extern void x86_64_set_popcount_present PROTO((ptr code));
#endif
/* vfasl.c */
extern ptr S_to_vfasl PROTO((ptr v));

View File

@ -241,6 +241,7 @@ static uptr ppc32_get_jump PROTO((void *address));
#ifdef X86_64
static void x86_64_set_jump PROTO((void *address, uptr item, IBOOL callp));
static uptr x86_64_get_jump PROTO((void *address));
static void x86_64_set_popcount PROTO((void *address, uptr item));
#endif /* X86_64 */
#ifdef SPARC64
static INT extract_reg_from_sethi PROTO((void *address));
@ -1293,6 +1294,9 @@ void S_set_code_obj(who, typ, p, n, x, o) char *who; IFASLCODE typ; iptr n, o; p
case reloc_x86_64_call:
x86_64_set_jump(address, item, 1);
break;
case reloc_x86_64_popcount:
x86_64_set_popcount(address, item);
break;
#endif /* X86_64 */
#ifdef SPARC64
case reloc_sparc64abs:
@ -1364,6 +1368,9 @@ ptr S_get_code_obj(typ, p, n, o) IFASLCODE typ; iptr n, o; ptr p; {
case reloc_x86_64_call:
item = x86_64_get_jump(address);
break;
case reloc_x86_64_popcount:
item = (uptr)Svector_ref(S_G.library_entry_vector, library_popcount_slow) + o;
break;
#endif /* X86_64 */
#ifdef SPARC64
case reloc_sparc64abs:
@ -1509,18 +1516,20 @@ static uptr ppc32_get_jump(void *address) {
#endif /* PPC32 */
#ifdef X86_64
static void x86_64_set_jump(void *address, uptr item, IBOOL callp) {
I64 disp = (I64)item - ((I64)address + 5); /* 5 = size of call instruction */
if ((I32)disp == disp) {
*(octet *)address = callp ? 0xE8 : 0xE9; /* call or jmp disp32 opcode */
*(I32 *)((uptr)address + 1) = (I32)disp;
*((octet *)address + 5) = 0x90; /* nop */
*((octet *)address + 6) = 0x90; /* nop */
*((octet *)address + 7) = 0x90; /* nop */
*((octet *)address + 8) = 0x90; /* nop */
*((octet *)address + 9) = 0x90; /* nop */
*((octet *)address + 10) = 0x90; /* nop */
*((octet *)address + 11) = 0x90; /* nop */
/* 7-byte nop: */
*((octet *)address + 5) = 0x0F;
*((octet *)address + 6) = 0x1F;
*((octet *)address + 7) = 0x80;
*((octet *)address + 8) = 0x00;
*((octet *)address + 9) = 0x00;
*((octet *)address + 10) = 0x00;
*((octet *)address + 11) = 0x00;
} else {
*(octet *)address = 0x48; /* REX w/REX.w set */
*((octet *)address + 1)= 0xB8; /* MOV imm64 to RAX */
@ -1538,6 +1547,36 @@ static uptr x86_64_get_jump(void *address) {
/* must be short form: call/jmp */
return ((uptr)address + 5) + *(I32 *)((uptr)address + 1);
}
/* Nonzero once x86_64_set_popcount_present has seen the POPCNT feature bit;
   consulted every time a popcount relocation is written. */
static int popcount_present;

/* Writes the 12-byte site of a reloc_x86_64_popcount relocation at `address`.
   Without POPCNT, the site becomes a call to `item` (the generic popcount
   routine). With POPCNT, the site is overwritten with an inline
   XOR/POPCNT pair, padded with a nop to the same 12 bytes. */
static void x86_64_set_popcount(void *address, uptr item) {
  static const octet popcnt_site[12] = {
    0x48, 0x31, 0xC0,             /* REX; XOR RAX, RAX - avoid false dependency */
    0xF3, 0x48, 0x0F, 0xB8, 0xC7, /* REX; POPCNT, RDI -> RAX */
    0x0F, 0x1F, 0x40, 0x00        /* 4-byte nop */
  };
  octet *site = (octet *)address;
  int i;

  if (!popcount_present) {
    /* fall back to an ordinary call to the generic implementation */
    x86_64_set_jump(address, item, 1);
    return;
  }

  for (i = 0; i < (int)sizeof(popcnt_site); i++)
    site[i] = popcnt_site[i];
}
/* Probes the CPU for POPCNT support and records the answer in
   popcount_present. `code` is the code object of the cpu-features library
   entry; its machine code starts code_data_disp bytes past the pointer and
   is called here as a plain C function. The flag is only ever set, never
   cleared. */
void x86_64_set_popcount_present(ptr code) {
  /* cpu_features returns ECX after CPUID for function 1 */
  int (*read_cpu_features)() = (int (*)())((uptr)code + code_data_disp);
  int feature_bits = read_cpu_features();

  /* bit 23 of that ECX value is the POPCNT feature flag */
  if ((feature_bits & (1 << 23)) != 0)
    popcount_present = 1;
}
#endif /* X86_64 */
#ifdef SPARC64

View File

@ -42,6 +42,10 @@ static void install_library_entry(n, x) ptr n, x; {
S_G.nonprocedure_code = x;
S_retrofit_nonprocedure_code();
}
#ifdef X86_64
if (n == FIX(library_cpu_features))
x86_64_set_popcount_present(x);
#endif
}
ptr S_lookup_library_entry(n, errorp) iptr n; IBOOL errorp; {

View File

@ -62,7 +62,7 @@ InstallLZ4Target=
# no changes should be needed below this point #
###############################################################################
Version=csv9.5.3.10
Version=csv9.5.3.11
Include=boot/$m
PetiteBoot=boot/$m/petite.boot
SchemeBoot=boot/$m/scheme.boot

View File

@ -45,7 +45,7 @@
(define-constant unaligned-floats #t)
(define-constant unaligned-integers #t)
(define-constant integer-divide-instruction #t)
(define-constant popcount-instruction #f)
(define-constant popcount-instruction #t)
(define-constant software-floating-point #f)
(define-constant segment-table-levels 3)
(features iconv expeditor)

View File

@ -45,7 +45,7 @@
(define-constant unaligned-floats #t)
(define-constant unaligned-integers #t)
(define-constant integer-divide-instruction #t)
(define-constant popcount-instruction #f)
(define-constant popcount-instruction #t)
(define-constant software-floating-point #f)
(define-constant segment-table-levels 3)
(features iconv expeditor)

View File

@ -45,7 +45,7 @@
(define-constant unaligned-floats #t)
(define-constant unaligned-integers #t)
(define-constant integer-divide-instruction #t)
(define-constant popcount-instruction #f)
(define-constant popcount-instruction #t)
(define-constant software-floating-point #f)
(define-constant segment-table-levels 3)
(features iconv expeditor)

View File

@ -45,7 +45,7 @@
(define-constant unaligned-floats #t)
(define-constant unaligned-integers #t)
(define-constant integer-divide-instruction #t)
(define-constant popcount-instruction #f)
(define-constant popcount-instruction #t)
(define-constant software-floating-point #f)
(define-constant segment-table-levels 3)
(features iconv expeditor windows)

View File

@ -45,7 +45,7 @@
(define-constant unaligned-floats #t)
(define-constant unaligned-integers #t)
(define-constant integer-divide-instruction #t)
(define-constant popcount-instruction #f)
(define-constant popcount-instruction #t)
(define-constant software-floating-point #f)
(define-constant segment-table-levels 3)
(features iconv expeditor)

View File

@ -45,7 +45,7 @@
(define-constant unaligned-floats #t)
(define-constant unaligned-integers #t)
(define-constant integer-divide-instruction #t)
(define-constant popcount-instruction #f)
(define-constant popcount-instruction #t)
(define-constant software-floating-point #f)
(define-constant segment-table-levels 3)
(features iconv expeditor)

View File

@ -45,7 +45,7 @@
(define-constant unaligned-floats #t)
(define-constant unaligned-integers #t)
(define-constant integer-divide-instruction #t)
(define-constant popcount-instruction #f)
(define-constant popcount-instruction #t)
(define-constant software-floating-point #f)
(define-constant segment-table-levels 3)
(features iconv expeditor)

View File

@ -328,7 +328,7 @@
[(_ foo e1 e2) e1] ...
[(_ bar e1 e2) e2]))))])))
(define-constant scheme-version #x0905030A)
(define-constant scheme-version #x0905030B)
(define-syntax define-machine-types
(lambda (x)
@ -500,7 +500,7 @@
(sparc reloc-sparcabs reloc-sparcrel)
(sparc64 reloc-sparc64abs reloc-sparc64rel)
(ppc reloc-ppccall reloc-ppcload)
(x86_64 reloc-x86_64-call reloc-x86_64-jump)
(x86_64 reloc-x86_64-call reloc-x86_64-jump reloc-x86_64-popcount)
(arm32 reloc-arm32-abs reloc-arm32-call reloc-arm32-jump)
(ppc32 reloc-ppc32-abs reloc-ppc32-call reloc-ppc32-jump))
@ -2716,6 +2716,8 @@
($wrapper-apply #f 0 #f #f)
(wrapper-apply #f 0 #f #f)
(arity-wrapper-apply #f 0 #f #f)
(popcount-slow #f 0 #f #t)
(cpu-features #f 0 #f #t)
))
(let ()

View File

@ -204,6 +204,10 @@
(let ([a1 (fx- a 12)] [x* (cons (mkcode x) x*)])
(let ([r ($reloc (constant reloc-x86_64-call) n (fx- a1 ra))])
(mkc0 (cdr c*) a (cons r r*) a1 x*)))]
[(x86_64-popcount) (n x)
(let ([a1 (fx- a 12)] [x* (cons (mkcode x) x*)])
(let ([r ($reloc (constant reloc-x86_64-popcount) n (fx- a1 ra))])
(mkc0 (cdr c*) a (cons r r*) a1 x*)))]
[else (c-assembler-output-error c)])]
[else (c-assembler-output-error c)])]))))
p))]
@ -258,7 +262,7 @@
[else (void)])]
[(x86_64)
(record-case x
[(x86_64-jump x86_64-call) (n x) (build x d)]
[(x86_64-jump x86_64-call x86_64-popcount) (n x) (build x d)]
[else (void)])]
[(arm32)
(record-case x
@ -394,6 +398,10 @@
(let ([a1 (fx- a 12)]) ; 10-byte moviq followed by 2-byte call
(let ([r ($reloc (constant reloc-x86_64-call) n (fx- a1 ra))])
(prf0 (cdr c*) a (cons r r*) a1 (cons x x*))))]
[(x86_64-popcount) (n x)
(let ([a1 (fx- a 12)]) ; like a call, for worst case
(let ([r ($reloc (constant reloc-x86_64-popcount) n (fx- a1 ra))])
(prf0 (cdr c*) a (cons r r*) a1 (cons x x*))))]
[else (c-assembler-output-error c)])]
[else (c-assembler-output-error c)])]))))]))

View File

@ -995,6 +995,11 @@
(sealed #t)
(fields offset))
;; Marker info record with no fields of its own. The x86_64 `popcount`
;; instruction tests `info-inline?` on its info to choose between emitting the
;; generic popcount sequence inline (as done for the body of `popcount-slow`)
;; and emitting a link-editable call site.
(define-record-type info-inline (nongenerative)
(parent info)
(sealed #t)
(fields))
(module ()
(record-writer (record-type-descriptor info-load)
(lambda (x p wr)
@ -5723,7 +5728,8 @@
(define hand-coded-closure?
(lambda (name)
(not (memq name '(nuate nonprocedure-code error-invoke invoke
$wrapper-apply wrapper-apply arity-wrapper-apply)))))
$wrapper-apply wrapper-apply arity-wrapper-apply
popcount-slow cpu-features)))))
(define-inline 2 $hand-coded
[(name)
(nanopass-case (L7 Expr) name
@ -13384,6 +13390,32 @@
(in %ac0 %ac1 scheme-args)
(out %cp %xp %yp %ts %td extra-regs))
(goto ,Lexit))))]
;; Hand-coded library entry holding the generic (non-POPCNT) popcount.
;; On x86_64 the argument is taken in %rdi and the count is left in %rax;
;; passing an `info-inline` record makes the `popcount` instruction emit the
;; generic sequence itself rather than a call back to this entry.
[(popcount-slow)
`(lambda ,(make-info "popcount-slow" '()) 0 ()
,(constant-case architecture
[(x86_64)
`(seq
(set! ,%rax (inline ,(make-info-inline) ,%popcount ,%rdi))
(asm-c-return ,null-info ,%rdi ,%rax))]
[else
;; Generate anything, since this should not get called
`(seq
(set! ,%ac0 (immediate 0))
(jump ,%ref-ret (,%ac0)))]))]
;; Hand-coded library entry that the C runtime calls as a plain function to
;; query CPU features. On x86_64 it executes `cpuid` and returns the result
;; in %rax (currently ECX from CPUID function 1).
;; NOTE(review): %rdi is used as scratch to hold %rbx and is not restored;
;; confirm that is acceptable under every C ABI this entry is called from
;; (the Windows x64 convention treats RDI as callee-saved).
[(cpu-features)
`(lambda ,(make-info "cpu-features" '()) 0 ()
,(constant-case architecture
[(x86_64)
(%seq
(set! ,%rdi ,%rbx) ; %rbx must be preserved
;; the kill list names every register besides %rax that `cpuid` writes
(set! ,%rax (inline ,(make-info-kill* (reg-list %rbx %rcx %rdx)) ,%cpuid))
(set! ,%rbx ,%rdi)
(asm-c-return ,null-info ,%rax ,%rbx))]
[else
;; Generate anything, since this should not get called
`(seq
(set! ,%ac0 (immediate 0))
(jump ,%ref-ret (,%ac0)))]))]
[else ($oops who "unrecognized hand-coded name ~s" sym)])]))
(define-pass np-expose-allocation-pointer : L13.5 (ir) -> L14 ()

View File

@ -131,6 +131,8 @@
(define-hand-coded-library-entry $wrapper-apply)
(define-hand-coded-library-entry wrapper-apply)
(define-hand-coded-library-entry arity-wrapper-apply)
(define-hand-coded-library-entry popcount-slow) ; before fxpopcount use
(define-hand-coded-library-entry cpu-features) ; before fxpopcount use
(define $instantiate-code-object ($hand-coded '$instantiate-code-object))

View File

@ -1027,6 +1027,10 @@
(libspec-index (lookup-libspec nonprocedure-code)))
(def "library_dounderflow"
(libspec-index (lookup-libspec dounderflow)))
(def "library_popcount_slow"
(libspec-index (lookup-libspec popcount-slow)))
(def "library_cpu_features"
(libspec-index (lookup-libspec cpu-features)))
)))
)

View File

@ -561,6 +561,7 @@
(declare-primitive -/ovfl value #f)
(declare-primitive -/eq value #f)
(declare-primitive asmlibcall value #f)
(declare-primitive cpuid value #t) ; x86_64 only, actually side-effects ebx/ecx/edx
(declare-primitive fstpl value #f) ; x86 only
(declare-primitive fstps value #f) ; x86 only
(declare-primitive get-double value #t) ; x86_64

View File

@ -45,7 +45,7 @@
(define-constant unaligned-floats #t)
(define-constant unaligned-integers #t)
(define-constant integer-divide-instruction #t)
(define-constant popcount-instruction #f)
(define-constant popcount-instruction #t)
(define-constant software-floating-point #f)
(define-constant segment-table-levels 3)
(features iconv expeditor pthreads)

View File

@ -45,7 +45,7 @@
(define-constant unaligned-floats #t)
(define-constant unaligned-integers #t)
(define-constant integer-divide-instruction #t)
(define-constant popcount-instruction #f)
(define-constant popcount-instruction #t)
(define-constant software-floating-point #f)
(define-constant segment-table-levels 3)
(features iconv expeditor pthreads)

View File

@ -45,7 +45,7 @@
(define-constant unaligned-floats #t)
(define-constant unaligned-integers #t)
(define-constant integer-divide-instruction #t)
(define-constant popcount-instruction #f)
(define-constant popcount-instruction #t)
(define-constant software-floating-point #f)
(define-constant segment-table-levels 3)
(features iconv expeditor pthreads)

View File

@ -45,7 +45,7 @@
(define-constant unaligned-floats #t)
(define-constant unaligned-integers #t)
(define-constant integer-divide-instruction #t)
(define-constant popcount-instruction #f)
(define-constant popcount-instruction #t)
(define-constant software-floating-point #f)
(define-constant segment-table-levels 3)
(features iconv expeditor pthreads windows)

View File

@ -45,7 +45,7 @@
(define-constant unaligned-floats #t)
(define-constant unaligned-integers #t)
(define-constant integer-divide-instruction #t)
(define-constant popcount-instruction #f)
(define-constant popcount-instruction #t)
(define-constant software-floating-point #f)
(define-constant segment-table-levels 3)
(features iconv expeditor pthreads)

View File

@ -45,7 +45,7 @@
(define-constant unaligned-floats #t)
(define-constant unaligned-integers #t)
(define-constant integer-divide-instruction #t)
(define-constant popcount-instruction #f)
(define-constant popcount-instruction #t)
(define-constant software-floating-point #f)
(define-constant segment-table-levels 3)
(features iconv expeditor pthreads)

View File

@ -45,7 +45,7 @@
(define-constant unaligned-floats #t)
(define-constant unaligned-integers #t)
(define-constant integer-divide-instruction #t)
(define-constant popcount-instruction #f)
(define-constant popcount-instruction #t)
(define-constant software-floating-point #f)
(define-constant segment-table-levels 3)
(features iconv expeditor pthreads)

View File

@ -700,7 +700,18 @@
`(set! ,(make-live-info) ,z ,t)))])
(define-instruction value popcount
[(op (z ur) (x ur mem)) `(set! ,(make-live-info) ,z (asm ,info ,asm-popcount ,x))])
[(op (z ur) (x ur mem))
;; Direct POPCNT instruction variant, works with corresponding `popcount-op`:
#;
`(set! ,(make-live-info) ,z (asm ,info ,(asm-popcount (info-inline? info)) ,x))
;; Link-editable variant, for corresponding `popcount-op`:
(let ([urdi (make-precolored-unspillable 'urdi %rdi)]
[urax (make-precolored-unspillable 'urax %rax)])
(seq
`(set! ,(make-live-info) ,urdi ,x)
`(set! ,(make-live-info) ,urax (asm ,info ,(asm-popcount (info-inline? info)) ,urdi))
`(set! ,(make-live-info) ,z ,urax)))])
(define-instruction value move
[(op (z mem) (x ur imm32))
@ -969,6 +980,16 @@
(safe-assert (and (info-kill*? info) (memq %rdx (info-kill*-kill* info))))
`(set! ,(make-live-info) ,z (asm ,info ,asm-read-time-stamp-counter))])
;; currently returns ECX from CPUID function 1
;; The destination must be %rax, and `info` must be an `info-kill*` whose
;; kill list contains %rbx, %rcx, and %rdx; both conditions are checked
;; with `safe-assert`.
(define-instruction value cpuid
[(op (z ur))
(safe-assert (eq? z %rax))
(safe-assert (and (info-kill*? info)
(memq %rbx (info-kill*-kill* info))
(memq %rcx (info-kill*-kill* info))
(memq %rdx (info-kill*-kill* info))))
`(set! ,(make-live-info) ,z (asm ,info ,asm-cpuid))])
; NB: shouldn't need to list (info-kill*-live*-live* info) ... here, since we've already
; NB: computed spillable/register live sets
(define-instruction effect (c-call)
@ -1007,6 +1028,7 @@
asm-enter asm-foreign-call asm-foreign-callable
asm-inc-profile-counter
asm-inc-cc-counter asm-read-time-stamp-counter asm-read-performance-monitoring-counter
asm-cpuid
; threaded version specific
asm-get-tc asm-activate-thread asm-deactivate-thread asm-unactivate-thread
; machine dependent exports
@ -1143,6 +1165,8 @@
(define-op popcount (*) popcount-op)
(define-op cpuid two-byte-op #b1111 #b10100010)
; also do inc-reg dec-reg
; the following are forms of the call instruction and push the return address
@ -1582,17 +1606,68 @@
[3 op-code]
[0 (fxlogand (ax-ea-reg-code reg) 7)]))))))
;; Direct POPCNT instruction variant:
#;
(define popcount-op
(lambda (op size dest-reg src-ea code*)
(begin
(emit-code (op dest-reg src-ea code*)
(build byte #xF3)
(ax-ea-rex (if (eq? size 'quad) 1 0) src-ea dest-reg size)
(build byte #x0F)
(build byte #xB8)
(ax-ea-modrm-reg src-ea dest-reg)
(ax-ea-sib src-ea)
(ax-ea-addr-disp src-ea)))))
(lambda (op size dest-reg src-ea inline? code*)
(let ([code* (emit-code (op src-ea dest-reg code*)
(build byte #xF3)
(ax-ea-rex (if (eq? size 'quad) 1 0) src-ea dest-reg size)
(build byte #x0F)
(build byte #xB8)
(ax-ea-modrm-reg src-ea dest-reg)
(ax-ea-sib src-ea)
(ax-ea-addr-disp src-ea))])
(if (not (and (ax-register? src-ea)
(fx= (ax-ea-reg-code src-ea)
(ax-ea-reg-code dest-reg))))
(emit xor dest-reg dest-reg code*) ; avoid false dependency
code*))))
;; Link-editable variant:
;; Both operands must be registers: the destination %rax and the source %rdi.
;; When `inline?` is #f, emits an `x86_64-popcount` call site targeting the
;; `popcount-slow` library entry, which the linker can later replace with a
;; POPCNT instruction. When `inline?` is true, emits the generic popcount
;; sequence itself; that form is what makes up the body of `popcount-slow`.
(define popcount-op
  (let ([target `(x86_64-popcount ,(constant code-data-disp) (library ,(lookup-libspec popcount-slow)))])
    (lambda (op size dest-rax src-rdi inline? code*)
      ;; fixed: this assertion previously referenced the unbound name `src-dir`
      (safe-assert (and (ax-register? dest-rax) (ax-register? src-rdi)))
      (record-case dest-rax
        [(reg) dest-rax
         (record-case src-rdi
           [(reg) src-rdi
            (safe-assert (and (eq? dest-rax %rax) (eq? src-rdi %rdi)))
            (cond
              [(not inline?)
               ;; Set up a call to `popcount-slow`, which the linker
               ;; can replace with a POPCNT instruction:
               (asm-helper-call code* target dest-rax)]
              [else
               ;; Used for the body of `popcount-slow`.
               ;; This is the sequence generated by LLVM's
               ;; __builtin_popcountl() intrinsic, but with pushes and pops
               ;; to save used registers other than the result register %rax.
               (emit-literal-code (op dest-rax src-rdi code*)
                 57 ; push %rdi
                 51 ; push %rcx
                 48 89 f8 ; movq %rdi, %rax
                 48 d1 e8 ; shrq %rax
                 48 b9 55 55 55 55 55 55 55 55 ; movabsq $6148914691236517205, %rcx
                 48 21 c1 ; andq %rax, %rcx
                 48 29 cf ; subq %rcx, %rdi
                 48 b8 33 33 33 33 33 33 33 33 ; movabsq $3689348814741910323, %rax
                 48 89 f9 ; movq %rdi, %rcx
                 48 21 c1 ; andq %rax, %rcx
                 48 c1 ef 02 ; shrq $2, %rdi
                 48 21 c7 ; andq %rax, %rdi
                 48 01 cf ; addq %rcx, %rdi
                 48 89 f8 ; movq %rdi, %rax
                 48 c1 e8 04 ; shrq $4, %rax
                 48 8d 04 38 ; leaq (%rax,%rdi), %rax
                 48 b9 0f 0f 0f 0f 0f 0f 0f 0f ; movabsq $1085102592571150095, %rcx
                 48 21 c1 ; andq %rax, %rcx
                 48 b8 01 01 01 01 01 01 01 01 ; movabsq $72340172838076673, %rax
                 48 0f af c1 ; imulq %rcx, %rax
                 48 c1 e8 38 ; shrq $56, %rax
                 59 ; pop %rcx
                 5f)])])])))) ; pop %rdi
(define-syntax emit-code
(lambda (x)
@ -1801,7 +1876,7 @@
(define asm-size
(lambda (x)
(case (car x)
[(asm x86_64-jump x86_64-call) 0]
[(asm x86_64-jump x86_64-call x86_64-popcount) 0]
[(byte) 1]
[(word) 2]
[(long) 4]
@ -1984,9 +2059,10 @@
(emit mulsi src1 src0 dest code*))))
(define asm-popcount
(lambda (code* dest src)
(Trivit (src)
(emit popcount (cons 'reg dest) src code*))))
(lambda (inline?)
(lambda (code* dest src)
(Trivit (dest src)
(emit popcount dest src inline? code*)))))
(define-who asm-addop
(lambda (op)
@ -2018,6 +2094,15 @@
(safe-assert (eq? dest %rax))
(emit rdtsc code*)))
(define asm-cpuid
  (lambda (code* dest)
    ; the result lands in rax; rbx/rcx/rdx are implied destinations that
    ; the instruction's info already lists as killed
    (safe-assert (eq? dest %rax))
    ; built back to front, each step is prepended to the code that follows it
    (let* ([copy-ecx (emit mov (cons 'reg %rcx) (cons 'reg %rax) code*)]
           [query (emit cpuid copy-ecx)])
      (emit movi '(imm 1) (cons 'reg %rax) query))))
(define asm-inc-profile-counter
(lambda (code* dest src)
(Trivit (dest src)