Chez Scheme: move write-barrier fixnum guard outside write fence

On an Apple M1, an example like

      (let ([v (make-vector 1000000)])
        (for* ([_ (in-range 100)]
               [i (in-range 1000000)])
          (vector-set! v i (+ 1 i))))

would generate tightly interleaved fence and store operations, which
seems to make the processor unhappy so that the code run 40x slower
than it should.

A key part of the example is that `(+ 1 i)` defeats cp0-level
inference that the result will be a fixnum. A dynamic fixnum test
avoids adding the update to a remembered set as part of the write
barrier, but the memory fence needed for ARM must be before the store,
while the fixnum test was after the store. This change bundles the
write fence and remember-set update under a `fixnum?` guard, so they
happen to gether or not at all --- at the small cost of generating the
store instruction(s) in two branches. In the example above, neither
the fence nor remember-set update happen, but changing `(+ 1 i)` to
`(quote x)` triggers both, and performance is still ok.
This commit is contained in:
Matthew Flatt 2021-02-03 10:20:01 -07:00
parent 0a945cd5f7
commit 41623f3027

View File

@ -4097,22 +4097,26 @@
,(%inline sll ,e
(immediate ,(fx- (constant char-data-offset) (constant fixnum-offset))))
,(%constant type-char))))
(define need-store-fence?
(if-feature pthreads
(constant-case architecture
[(arm32 arm64) #t]
[else #f])
#f))
(define add-store-fence
;; A store--store fence should be good enough for safety on a platform that
;; orders load dependencies (which is anything except Alpha)
(lambda (e)
(if-feature pthreads
(constant-case architecture
[(arm32 arm64) `(seq ,(%inline store-store-fence) ,e)]
[else e])
e)))
(if need-store-fence?
`(seq ,(%inline store-store-fence) ,e)
e)))
(define build-dirty-store
(case-lambda
[(base offset e) (build-dirty-store base %zero offset e)]
[(base index offset e) (build-dirty-store base index offset e
(lambda (base index offset e) `(set! ,(%mref ,base ,index ,offset) ,e))
(lambda (s r) (add-store-fence `(seq ,s ,r))))]
[(base index offset e build-assign build-barrier-seq)
(lambda (s r) `(seq ,s ,r)))]
[(base index offset e build-assign build-remember-seq)
(nanopass-case (L7 Expr) e
[(call ,info ,mdcl ,pr ,e)
(guard (eq? (primref-name pr) '$immediate))
@ -4135,17 +4139,30 @@
(bind #f ([e e])
; eval a second so the address is not live across any calls
(bind #t ([a a])
(build-barrier-seq
(build-assign a %zero 0 e)
(%inline remember ,a))))
(add-store-fence
(build-remember-seq
(build-assign a %zero 0 e)
(%inline remember ,a)))))
(bind #t ([e e])
; eval a second so the address is not live across any calls
(bind #t ([a a])
(build-barrier-seq
(build-assign a %zero 0 e)
`(if ,(%type-check mask-fixnum type-fixnum ,e)
,(%constant svoid)
,(%inline remember ,a))))))))])]))
(if need-store-fence?
;; Fence needs to be before store, so duplicate
;; store instruction to lift out fixnum check; this
;; appears to be worthwhile on the Apple M1 to avoid
;; tighly interleaved writes and fences
`(if ,(%type-check mask-fixnum type-fixnum ,e)
,(build-assign a %zero 0 e)
,(add-store-fence
(build-remember-seq
(build-assign a %zero 0 e)
(%inline remember ,a))))
;; Generate one copy of store instruction
(build-remember-seq
(build-assign a %zero 0 e)
`(if ,(%type-check mask-fixnum type-fixnum ,e)
,(%constant svoid)
,(%inline remember ,a)))))))))])]))
(define make-build-cas
(lambda (old-v)
(lambda (base index offset v)
@ -4154,10 +4171,9 @@
(inline ,(make-info-condition-code 'eq? #f #t) ,%condition-code)))))
(define build-cas-seq
(lambda (cas remember)
(add-store-fence
`(if ,cas
(seq ,remember ,(%constant strue))
,(%constant sfalse)))))
`(if ,cas
(seq ,remember ,(%constant strue))
,(%constant sfalse))))
(define build-$record
(lambda (tag args)
(bind #f (tag)