statistical profiler, core and text interface part done

svn: r14340

original commit: 96d7291bfb1e4f4a600ecda14f314ccccfa2ff48
This commit is contained in:
Eli Barzilay 2009-03-30 00:15:38 +00:00
commit e29c411d53
13 changed files with 1220 additions and 0 deletions

View File

@ -0,0 +1,258 @@
#lang scheme/base
;; Analyzer for the sampler results
(provide analyze-samples)
(require scheme/list)
;; An encapsulation of an analyzed profile call graph:
;; - total-time: the total time observed in msec (this is generally different
;; than the time it took to run the profile).
;; - sample-number: the number of samples taken.
;; - thread-times: a list of (<thread-id> . msec) for the time spent in
;; observed threads.
;; - nodes: the list of call-graph nodes sorted by their total time.
;; - *-node: a special node that is connected as a "caller" for all toplevel
;; functions and a "callee" for all leaf functions. It will also be
;; identifiable by having both id and src fields being #f. Can be used to
;; start a graph traversal from the top or the bottom.
(provide (struct-out profile))
(define-struct profile
(total-time cpu-time sample-number thread-times nodes *-node))
;; An entry for a single profiled function:
;; - id, src: the corresponding values from `continuation-mark-set->context'.
;; - thread-ids: the list of thread identifiers this function has been seen in.
;; - total: total msecs it participated in (= time in it, including callees).
;; - self: msecs where it was at the top of the stack (= time in its own code).
;; - callers, callees: a list of `edge' values for the time spent while it was
;; called by the repective <node>, or it called it, sorted in decreasing msec
;; time.
;; Note that the sum of caller/callee edges including the special `*-node'
;; should be equal to the `total' time. So the edge from/to the `*-node' can
;; be used to get the time spent as a leaf or as a root divided by the number
;; of time the function appeared on the stack: so this value can be displayed
;; in the call-graph and the numbers will sum up nicely to a 100%.
(provide (struct-out node))
(define-struct node (id src thread-ids total self callers callees)
#:mutable
#:property prop:custom-write
(lambda (node o w?) (fprintf o "#<node:~s>" (or (node-id node) '???))))
;; An edge representing function calls between two nodes:
;; - time: the total time spent while the call was somewhere on the stack.
;; - caller, callee: the two relevant `node' values.
;; - caller-time, callee-time: the time that the caller/callee spent in this
;; call (different from the above time because each stack sample's time is
;; divided by the number of times the caller/callee appears in that slice).
(provide (struct-out edge))
(define-struct edge (total caller caller-time callee callee-time)
#:mutable
#:property prop:custom-write
(lambda (edge o w?)
(fprintf o "#<edge:~s-~s>"
(or (node-id (edge-caller edge)) '???)
(or (node-id (edge-callee edge)) '???))))
(define with-hash:not-found (gensym))
(define-syntax-rule (with-hash <hash> <key> <expr>)
(let ([t <hash>] [k <key>])
(let ([v (hash-ref t k with-hash:not-found)])
(if (eq? v with-hash:not-found)
(let ([v <expr>]) (hash-set! t k v) v)
v))))
;; This function analyzes the output of the sampler. Returns a `profile'
;; struct holding a list of `node' values, each one representing a node in the
;; call graph, with the relevant information filled in. The results are sorted
;; by the total time.
(define (analyze-samples cpu-time+samples)
(define cpu-time (car cpu-time+samples))
(define samples (cdr cpu-time+samples))
(define samples-by-thread
(let ([by-thread (split-by-thread samples)])
(for ([samples (in-vector by-thread)] [i (in-naturals 0)])
(vector-set! by-thread i (get-times samples)))
by-thread))
(define id+src->node-hash (make-hasheq))
(define (id+src->node id+src)
(with-hash id+src->node-hash id+src
(make-node (car id+src) (cdr id+src) '() 0 0 '() '())))
;; special node that is the caller of toplevels and callee of leaves
(define *-node (id+src->node '(#f . #f)))
(define call->edge
(let ([t (make-hasheq)])
(lambda (ler lee)
(with-hash (with-hash t ler (make-hasheq)) lee
(let ([e (make-edge 0 ler 0 lee 0)])
(set-node-callers! lee (cons e (node-callers lee)))
(set-node-callees! ler (cons e (node-callees ler)))
e)))))
(define total-time 0)
(define thread-times (make-vector (vector-length samples-by-thread) 0))
(for ([thread-samples (in-vector samples-by-thread)]
[thread-id (in-naturals 0)]
#:when #t
[sample (in-list thread-samples)])
(define msecs (car sample))
(define (connect ler lee ler# lee#)
(define edge (call->edge ler lee))
(set-edge-caller-time! edge (+ (edge-caller-time edge) (/ msecs lee#)))
(set-edge-callee-time! edge (+ (edge-callee-time edge) (/ msecs ler#)))
edge)
(define stack ; the stack snapshot, translated to `node' values
(for/list ([id+src (in-list (cdr sample))])
(let* ([node (id+src->node id+src)] [tids (node-thread-ids node)])
(unless (memq thread-id tids)
(set-node-thread-ids! node (cons thread-id tids)))
node)))
(define counts (get-counts stack))
(define stack+counts (map (lambda (x) (assq x counts)) stack))
(define edges
(if (null? stack)
'()
(append (let ([first (car stack+counts)] [last (last stack+counts)])
(list (connect *-node (car last) 1 (cdr last))
(connect (car first) *-node (cdr first) 1)))
(for/list ([callee (in-list stack+counts)]
[caller (in-list (cdr stack+counts))])
(connect (car caller) (car callee)
(cdr caller) (cdr callee))))))
(set! total-time (+ msecs total-time))
(for ([p (in-list counts)])
(set-node-total! (car p) (+ msecs (node-total (car p)))))
(for ([e (remove-duplicates edges eq?)])
(set-edge-total! e (+ msecs (edge-total e))))
(vector-set! thread-times thread-id
(+ msecs (vector-ref thread-times thread-id)))
(when (pair? stack)
(set-node-self! (car stack) (+ (node-self (car stack)) msecs))))
(set-node-total! *-node total-time)
;; convert the nodes from the hash to a list, do a topological sort, and then
;; sort by total time (combining both guarantees(?) sensible order)
(let* ([nodes (hash-map id+src->node-hash (lambda (k v) v))]
[nodes (topological-sort *-node (length nodes))]
[nodes (sort (remq *-node nodes) > #:key node-total)])
;; sort all the edges in the nodes according to total time
(for ([n (in-list nodes)])
(set-node-callees! n (sort (node-callees n) > #:key edge-callee-time))
(set-node-callers! n (sort (node-callers n) > #:key edge-caller-time)))
(make-profile
total-time
cpu-time
(length samples)
(for/list ([time (in-vector thread-times)] [n (in-naturals 0)])
(cons n time))
nodes
*-node)))
;; A simple topological sort of nodes using BFS, starting from node `x' which
;; will be the special *-node. Note that `expected-length' is only given for
;; the sanity check in the end.
(define (topological-sort x expected-length)
(define sorted
(let loop ([todo (list x)] [seen (list x)])
(if (null? todo)
'()
(let* ([next (append-map (lambda (x) (map edge-caller (node-callers x)))
todo)]
[next (remove-duplicates next)]
[next (remq* seen next)])
(append todo (loop next (append next seen)))))))
(if (= expected-length (length sorted))
sorted
(error 'topological "internal error")))
;; Groups raw samples by their thread-id, returns a vector with a field for
;; each thread id holding the sample data for that thread. The samples in
;; these are reversed (so they'll be sorted going forward in time).
(define (split-by-thread samples)
(define threads
(make-vector (add1 (for/fold ([n -1]) ([sample (in-list samples)])
(max (car sample) n)))
'()))
(for ([sample (in-list samples)])
(let ([id (car sample)] [data (cdr sample)])
(vector-set! threads id (cons data (vector-ref threads id)))))
threads)
#|
(equal? (split-by-thread '())
'#())
(equal? (split-by-thread '([0 x]))
'#([(x)]))
(equal? (split-by-thread '([0 x] [0 y] [0 z]))
'#([(z) (y) (x)]))
(equal? (split-by-thread '([0 x] [1 y] [2 z]))
'#([(x)] [(y)] [(z)]))
(equal? (split-by-thread '([0 x1] [1 y1] [0 x2] [2 z1] [0 x3] [2 z2]))
'#([(x3) (x2) (x1)] [(y1)] [(z2) (z1)]))
|#
;; gets a list of thread-id and data for that thread beginning with the
;; millisecond count, and returns a similar list where the samples begin with
;; the time spent for that sample. The time spent is taken as half of the two
;; touching ranges; for example, if there are three samples showing snapshot
;; times of 10, 20, 60, then the middle one is assumed to have a time of 25.
;; For the first and last samples, the time is twice the half of the single
;; touching range -- with this example, this would be 10 for the first and 40
;; for the last. If there is a thread with just one sample, it is dropped.
(define (get-times samples)
(cond
;; nothing to do
[(null? samples) '()]
;; throw out a single sample
[(null? (cdr samples)) '()]
[else (let loop ([samples samples]
[prevs (cons #f (map car samples))]
[r '()])
(if (null? samples)
(reverse r)
(let* ([prev (car prevs)]
[cur (caar samples)]
[data (cdar samples)]
[prevs (cdr prevs)]
[samples (cdr samples)]
[next (and (pair? samples) (caar samples))])
(loop samples prevs
(cons (cons (if next
;; not the last: there must be a next
(if prev (/ (- next prev) 2) (- next cur))
;; last one: there must be a prev
(- cur prev))
data)
r)))))]))
#|
(equal? (get-times '())
'())
(equal? (get-times '([10 a]))
'())
(equal? (get-times '([10 a] [20 b]))
'([10 a] [10 b]))
(equal? (get-times '([10 a] [20 b] [60 c]))
'([10 a] [25 b] [40 c]))
(equal? (get-times '([10 a] [20 b] [30 c] [40 d]))
'([10 a] [10 b] [10 c] [10 d]))
(equal? (get-times '([10 a] [20 b] [60 c] [80 d]))
'([10 a] [25 b] [30 c] [20 d]))
|#
;; returns a list of (cons item occurrences) for the items in l
(define (get-counts l)
(let loop ([l l] [r '()])
(if (null? l)
r
(let ([1st (car l)])
(let loop* ([l1 '()] [c 1] [l (cdr l)])
(cond [(null? l) (loop l1 (cons (cons 1st c) r))]
[(eq? 1st (car l)) (loop* l1 (add1 c) (cdr l))]
[else (loop* (cons (car l) l1) c (cdr l))]))))))
#|
(equal? (get-counts '()) '())
(equal? (get-counts '(1)) '([1 . 1]))
(equal? (get-counts '(1 1 1)) '([1 . 3]))
(define (set=? xs ys) (null? (append (remove* xs ys) (remove* ys xs))))
(set=? (get-counts '(1 2 3)) '([1 . 1] [2 . 1] [3 . 1]))
(set=? (get-counts '(1 2 2 3 3 3)) '([1 . 1] [2 . 2] [3 . 3]))
(set=? (get-counts '(3 1 2 3 2 3)) '([1 . 1] [2 . 2] [3 . 3]))
|#

69
collects/profile/main.ss Normal file
View File

@ -0,0 +1,69 @@
#lang scheme/base
(require "sampler.ss" (except-in "analyzer.ss" profile)
(prefix-in text: "render-text.ss")
(for-syntax scheme/base))
(define (profile-thunk thunk
#:delay [delay 0.05]
#:repeat [rpt 1]
#:threads [threads? #f]
#:render [renderer text:render])
(define cust (and threads? (make-custodian (current-custodian))))
(define sampler (create-sampler (if threads?
(list cust (current-thread))
(current-thread))
delay))
(define (run) (for ([i (in-range rpt)]) (thunk)))
(with-handlers ([void (lambda (e)
(fprintf (current-error-port)
"profiled thunk error: ~a\n"
(if (exn? e)
(exn-message e)
(format "~e" e))))])
(if threads?
(parameterize ([current-custodian cust]) (run))
(run)))
(sampler 'stop)
(renderer (analyze-samples (sampler 'get-snapshots))))
(define-syntax (profile stx)
(syntax-case stx ()
[(_ x ...)
(let loop ([expr #f] [kwds '()] [xs (syntax->list #'(x ...))])
(cond
[(null? xs)
(if expr
(with-syntax ([expr expr] [kwds (reverse kwds)])
#'(profile-thunk (lambda () expr) . kwds))
(raise-syntax-error 'profile "missing expression" stx))]
[(keyword? (syntax-e (car xs)))
(if (pair? (cdr xs))
(loop expr (list* (cadr xs) (car xs) kwds) (cddr xs))
;; let #%app throw the error
(loop expr (cons (car xs) kwds) (cdr xs)))]
[expr (raise-syntax-error 'profile "redundant expresion given"
stx (car xs))]
[else (loop (car xs) kwds (cdr xs))]))]))
(define (fib1 n) (if (<= n 1) n (+ (fib1 (- n 1)) (fib1 (- n 2)))))
(define (fib22 n) (if (<= n 2) 1 (+ (fib22 (- n 1)) (fib22 (- n 2)))))
(define (fib3 n)
(for ([i (in-range 100000000)]) (* i 3))
(if (<= n 2) 1 (+ (fib22 (- n 1)) (fib22 (- n 2)))))
(define (fibs n) (+ (fib1 n) (fib22 n) (fib3 n)))
(define (foo n)
(define ch (make-channel))
(define (bg-fib) (channel-put ch (fib1 n)))
(thread bg-fib)
(list (fib22 n) (channel-get ch)))
(require "render-graphviz.ss")
(profile (fibs 35)
;(dynamic-require '(lib "scribblings/reference/reference.scrbl") #f)
;(foo 35)
;#:render render
#:threads #t)

View File

@ -0,0 +1,41 @@
#lang scheme/base
(provide render)
(require "analyzer.ss" "utils.ss")
(define (render profile
#:hide-self [hide-self% 1/100]
#:hide-subs [hide-subs% 2/100])
(define *-node (profile-*-node profile))
(define hidden (get-hidden profile hide-self% hide-subs%))
(define nodes (remq* hidden (profile-nodes profile)))
(define total-time (profile-total-time profile))
(define node->
(let ([t (make-hasheq)])
(for ([node (in-list nodes)] [idx (in-naturals 1)])
(let ([id (node-id node)] [src (node-src node)])
(hash-set! t node
(list (format "node~a" idx)
(if id
(format "~a" id)
(regexp-replace #rx"^.*/"
(format-source (node-src node))
""))))))
(lambda (mode node)
((case mode [(index) car] [(label) cadr]) (hash-ref t node)))))
(define max-self%
(/ (for/fold ([m 0]) ([node (in-list nodes)]) (max m (node-self node)))
total-time))
(printf "digraph Profile {\n")
(for ([node (in-list nodes)])
(define self% (/ (node-self node) total-time))
(printf "~a [" (node-> 'index node))
(printf "label=~s, " (node-> 'label node))
(printf "fillcolor=\"1,~a,1\", " (exact->inexact (/ self% max-self%)))
(printf "style=filled];\n")
(for ([edge (in-list (node-callees node))])
(define callee (edge-callee edge))
(unless (or (eq? *-node callee) (memq callee hidden))
(printf "~a -> ~a;\n" (node-> 'index node) (node-> 'index callee)))))
(printf "}\n"))

View File

@ -0,0 +1,137 @@
#lang at-exp scheme/base
(provide render)
(require "analyzer.ss" "utils.ss" scheme/list)
(define (f:msec msec)
(number->string (round (inexact->exact msec))))
(define (f:msec* msec)
(string-append (f:msec msec) "ms"))
(define (display-table aligns table)
;; * thunks are used for cells that are ignored when inspecting widths
;; * chars are used for filler cells
(define (display-line strings)
(printf "~a\n" (regexp-replace #rx" +$" (apply string-append strings) "")))
(let ([widths (let loop ([table table])
(let ([table (filter pair? table)])
(if (null? table) '()
(cons (apply max (filter-map
(lambda (x)
(and (string? (car x))
(string-length (car x))))
table))
(loop (map cdr table))))))])
(for ([row (in-list table)])
(display-line
(for/list ([cell (in-list row)]
[width (in-list widths)]
[align (in-list aligns)])
(let* ([cell (cond [(char? cell) (make-string width cell)]
[(procedure? cell) (cell)]
[else cell])]
[pad (make-string (max 0 (- width (string-length cell)))
#\space)])
(case align
[(l) (string-append cell pad)]
[(r) (string-append pad cell)]
[else (error 'internal-error "poof")])))))))
(define (render profile
#:truncate-source [truncate-source 50]
#:hide-self [hide-self% 1/100]
#:hide-subs [hide-subs% 2/100])
(define (show . xs)
(let loop ([x xs])
(cond [(or (not x) (null? x) (void? x)) (void)]
[(pair? x) (loop (car x)) (loop (cdr x))]
[else (display x)]))
(newline))
(define total-time (profile-total-time profile))
(define cpu-time (profile-cpu-time profile))
(define sample-number (profile-sample-number profile))
(define granularity (/ total-time sample-number))
(define threads+times (profile-thread-times profile))
(define *-node (profile-*-node profile))
(define hidden (get-hidden profile hide-self% hide-subs%))
(define nodes (remq* hidden (profile-nodes profile)))
(define node->
(let ([t (make-hasheq)])
(for ([node (in-list nodes)] [idx (in-naturals 1)])
(let ([index (format "[~a]" idx)]
[label (format "~a" (or (node-id node) '???))])
(hash-set! t node (list index label @string-append{@label @index}))))
(lambda (mode node)
((case mode [(index) car] [(label) cadr] [(sub-label) caddr])
(hash-ref t node)))))
(define (sep ch) (list ch ch ch ch ch ch ch ch ch ch))
(define =sep (sep #\=))
(define -sep (sep #\-))
@show{
Profiling results
-----------------
Total cpu time observed: @f:msec*[total-time] (out of @f:msec*[cpu-time])
Number of samples taken: @sample-number (once every @f:msec*[granularity])
}
(when (> (length threads+times) 1)
@show{ Threads observed: @(length threads+times)})
(when (pair? hidden)
(let* ([hidden (length hidden)]
[nodes (length (profile-nodes profile))]
[self% @string-append{self<@(format-percent (or hide-self% 0))}]
[subs% @string-append{local<@(format-percent (or hide-subs% 0))}]
[%s (cond [(not hide-self%) subs%]
[(not hide-subs%) self%]
[else @string-append{@self% and @subs%}])])
@show{ (Hiding functions with @|%s|: @|hidden| of @nodes hidden)}))
(newline)
(display-table
'(r l r l l r l l l r l l)
(append*
`(,=sep
(" " " " " "" " " " " "" " " " " Caller")
("Idx" " " "To""tal " " " "Se""lf " " " "Name+src" "Local%")
(" " " " " ms""(pct)" " " "ms""(pct)" " " " Callee")
,=sep)
(for/list ([node (in-list nodes)])
(define index (node-> 'index node))
(define name (node-> 'label node))
(define total (node-total node))
(define totalS (f:msec total))
(define total% @string-append{(@(format-percent (/ total total-time)))})
(define self (node-self node))
(define selfS (f:msec self))
(define self% @string-append{(@(format-percent (/ self total-time)))})
(define name+src
(let* ([src (format-source (node-src node))]
[src-len (string-length src)]
[name-len (string-length name)])
(string-append
name " "
;; truncate-source only truncates the source
(let* ([n (and truncate-source
((+ src-len name-len 1) . - . truncate-source))]
[n (and n (positive? n) (- src-len n 3))])
(cond [(not n) src]
[(n . <= . 0) "..."]
[else (string-append "..."
(substring src (- src-len n)))])))))
(define (sub get-edges get-node get-node-time)
(for*/list ([edge (in-list (get-edges node))]
[sub (in-list (list (get-node edge)))] ; <-- hack...
#:when (not (or (eq? *-node sub) ; <-- ...for this
(memq sub hidden))))
(define name (node-> 'sub-label sub))
(define local% (format-percent (/ (get-node-time edge) total)))
`("" "" "" "" "" "" "" ""
,(string-append " " name) ,local%
"" "")))
`(,@(reverse (sub node-callers edge-caller edge-caller-time))
(,(node-> 'index node)
" " ,totalS ,total%
" " ,selfS ,self%
" " ,(lambda () name+src))
,@(sub node-callees edge-callee edge-callee-time)
,-sep)))))

134
collects/profile/sampler.ss Normal file
View File

@ -0,0 +1,134 @@
#lang scheme/base
;; The core profiler sample collector
;; (This module is a private tool for collecting profiling data, and should not
;; be used as is.)
(provide create-sampler)
;; create-sampler : creates a sample collector thread, which tracks the given
;; `to-track' value every `delay' seconds.
;; * The input value can be either a thread (track just that thread), a
;; custodian (track all threads managed by the custodian), or a list of
;; threads and/or custodians. If a custodian is given, it must be
;; subordinate to the current custodian.
;; * The collected values are (<thread-id> <thread-time> . <stack>), where
;; - The <thread-id> is an integer number identifying the thread, starting
;; from 0. If the collected data has thread ids in a 0..N range
;; (exclusive) , then there were N threads that were observed. (This can
;; be relevant when tracking a custodian, where threads can change
;; dynamically.)
;; - The <thread-time> is the result of `current-process-milliseconds' for
;; the thread that this sample came from. Note that these numbers will not
;; start at 0, since threads are likely to run before a sample is
;; collected.
;; - Finally, the <stack> part is a snapshot of the thread's stack, as
;; grabbed by `continuation-mark-set->context'. The the values in these
;; snapshots are interned to reduce memory load.
;; The results are collected sequentially, so they're always sorted from the
;; newest to the oldest. Remember that these results should be considered
;; private for the profiler collection, and can change when more information
;; needs to be collected.
;; * Returns a "controller" function that accepts messages to control the
;; sampler thread. The current set of messages that the controller
;; understands are:
;; - 'pause and 'resume: stops or resumes collecting samples. These messages
;; can be nested. Note that the thread will continue running it just won't
;; collect snapshots.
;; - 'stop: kills the collector thread. Should be called when you have your
;; data. (There is no message to start a new sampler thread, although
;; adding one will not be difficult.)
;; - 'set-tracked! <new>: changes the thread/s and/or custodian/s to track.
;; - 'set-delay! <new>: changes the sampling delay. This means that we won't
;; have a direct correlation between the number of samples and the time
;; they represent -- but the samples are statistical snapshots anyway, and
;; the results are not formulated in terms of time spent. (The time spent
;; could be added of course, but it is best to do that in terms of the
;; start/stop times)
;; - 'get-snapshots: returns the currently collected list of snapshots. Note
;; that this can be called multiple times, each will return the data that
;; is collected up to that point in time.
(define (create-sampler to-track delay)
;; the collected data
(define snapshots '())
(define cust (current-custodian))
;; intern the entries (which are (cons id/#f srcloc/#f))
(define entry-table (make-hash))
(define (intern-entry entry)
(let* ([key (or (cdr entry) (car entry))]
[en (hash-ref entry-table key #f)])
(if en
;; ELI: is this sanity check needed?
;; (if (equal? en entry)
;; en
;; (error 'profile "internal error: assumption invalid"))
en
(begin (hash-set! entry-table key entry) entry))))
(define (validate to-track who)
(let loop ([t to-track])
(cond
[(thread? t)]
[(list? t) (for-each loop t)]
[(not (custodian? t))
(raise-type-error
who "thread, custodian, or a list of threads/csutodians" to-track)]
;; test that it's subordinate
[(with-handlers ([exn:fail:contract? (lambda (_) #t)])
(custodian-managed-list t cust) #f)
(error who "got a custodian that is not subordinate to current")])))
(define paused 0)
(define thread-id
(let ([next-id 0] [t (make-weak-hasheq)])
(lambda (thd)
(or (hash-ref t thd #f)
(let ([id next-id])
(set! next-id (add1 next-id))
(hash-set! t thd id)
id)))))
(define (sampler)
(sleep delay)
(when (paused . <= . 0)
(let loop ([t to-track])
(cond [(thread? t)
(unless (eq? t sampler-thread)
(set! snapshots
(cons (list* (thread-id t)
(current-process-milliseconds t)
(map intern-entry
(continuation-mark-set->context
(continuation-marks t))))
snapshots)))]
[(custodian? t) (for-each loop (custodian-managed-list t cust))]
;; cannot assume that it's a list: we might get other values from
;; a custodian managed list
[(list? t) (for-each loop t)])))
(sampler))
(define cpu-time 0)
(define start-time (current-process-milliseconds))
(define (add-time)
(when (paused . <= . 0)
(let ([cur (current-process-milliseconds)])
(set! cpu-time (+ cpu-time (- cur start-time)))
(set! start-time cur))))
(define (ignore-time)
(when (paused . <= . 0)
(set! start-time (current-process-milliseconds))))
(define sampler-thread
(begin (validate to-track 'create-sampler)
(thread sampler)))
;; use a sema to avoid mutations from different threads, the sampler thread
;; is only reading these values so it doesn't use it.
(define sema (make-semaphore 1))
(define (sampler-controller msg [arg #f])
(define-syntax-rule (w/sema body ...)
(call-with-semaphore sema (lambda () body ...)))
(case msg
[(pause) (w/sema (add-time) (set! paused (add1 paused)))]
[(resume) (w/sema (set! paused (sub1 paused)) (ignore-time))]
[(stop) (kill-thread sampler-thread) (add-time) (set! paused +inf.0)]
[(set-tracked!) (validate arg 'sampler-controller)
(w/sema (set! to-track arg))]
[(set-delay!) (w/sema (set! delay arg))]
[(get-snapshots) (add-time) (cons cpu-time snapshots)]
[else (error 'sampler-controller "unknown message: ~e" msg)]))
sampler-controller)

View File

@ -0,0 +1,156 @@
#lang scribble/doc
@(require scribble/manual
(for-label scheme profile/analyzer))
@title[#:tag "analyzer"]{Analyzing Profile Data}
@defmodule[profile/analyzer]
Once a profile run is done, and the results are collected, the next
step is to analyze the data. In this step the sample time are
computed and summed, a call-graph representing the observed function
calls is built, and per-node and per-edge information is created.
This is the job of the main function provided by
@scheme[profile/analyzer].
@defproc[(analyze-samples [raw-sample-data any/c])
profile?]{
This function consumes the raw result of the
@seclink["sampler"]{sampler} (which is given in an undocumented form),
analyzes it, and returns a @scheme[profile] value holding the analyzed
results. Without this function, the results of the sampler are
meaningless.}
@defstruct[profile ([total-time exact-nonnegative-integer?]
[cpu-time exact-nonnegative-integer?]
[sample-number exact-nonnegative-integer?]
[thread-times (listof (cons exact-nonnegative-integer?
exact-nonnegative-integer?))]
[nodes (listof node?)]
[*-node node?])]{
Represents the analyzed profile result.
@itemize[
@item{@scheme[total-time] is the total observed time (in milliseconds)
included in the profile. This is different than the actual time the
profiling took, due to unaccounted-for time spent in untracked
threads. (E.g., the sampler thread itself.)}
@item{@scheme[cpu-time] is the actual cpu time consumed by the process
during the profiler's work.}
@item{@scheme[sample-number] holds the number of samples taken during
the profile. This can be used to compute the average time frame
each of the input samples represented.}
@item{@scheme[thread-times] holds an association list mapping thread
identifiers to cpu time for the corresponding threads. As samples
are collected, each thread that is observed is assigned a small
integer identifier. These identifers are listed for each function
call, and the total time spent in each thread is in this field.}
@item{@scheme[nodes] is a list of nodes representing all observed
functions. These nodes are the components of the call-graph that
the analyzer assembles (see the @scheme[edge] field). The nodes are
sorted by decreasing total amount of time (time spent either in the
function or in its callees), and by a top-to-bottom topological
sort.}
@item{@scheme[*-node] holds a ``special'' node value that is
constructed for every graph. This node is used as the caller for
all top-level function nodes and as the callee for all leaf nodes.
It can therefore be used to start a scan of the call graph. In
addition, the times associated with it's "callers and callees"
actually represent the time these functions spent being the root of
the computation or its leaf. (This can be different from a node's
``self'' time, since it is divided by the number of instances a
function had on the stack for every sample --- so for recursive
functions this value is different from.)}
]}
@defstruct[node ([id (or/c #f symbol?)]
[src (or/c #f srcloc?)]
[thread-ids (listof exact-nonnegative-integer?)]
[total exact-nonnegative-integer?]
[self exact-nonnegative-integer?]
[callers (listof edge?)]
[callees (listof edge?)])]{
Represents a function call node in the call graph of an analyzed
profile result.
@itemize[
@item{The @scheme[id] and @scheme[src] field hold a symbol naming the
function and/or its source location as a @scheme[srcloc] value.
This is the same as the results of
@scheme[continuation-mark-set->context], so at most of of these can
be @scheme[#f], except for the special @scheme[*-node] (see the
@scheme[profile] struct) that can be identified by both of these
being @scheme[#f].}
@item{@scheme[thread-ids] holds a list of thread identifiers that were
observed executing this function.}
@item{@scheme[total] holds the total time (in milliseconds) where this
function was anywhere on the stack. It is common to see a few
toplevel functions that have close to a 100% total time, but
otherwise small @scheme[self] times --- these functions are the ones
that derive the work that was done, but they don't do any hard work
directly.}
@item{@scheme[self] holds the total time (in milliseconds) where this
function was observed as the leaf of the stack. It represents the
actual work done by this function, rather than @scheme[total] that
represents the work done by both the function and its callees.}
@item{@scheme[callers] and @scheme[callees] hold the list of caller
and callee nodes. The nodes are not actually held in these lists,
instead, @scheme[edge] values are used --- and provide information
specific to an edge in the call-graph.}
]}
@defstruct[edge ([total exact-nonnegative-integer?]
[caller node?]
[caller-time exact-nonnegative-integer?]
[callee node?]
[callee-time exact-nonnegative-integer?])]{
Represents an edge between two function call nodes in the call graph
of an analyzed profile result.
@itemize[
@item{@scheme[total] is analogous to the @scheme[total] field of a
@scheme[node] value: the total time that this edge was anywhere on
the stack.}
@item{@scheme[caller] and @scheme[callee] hold the two nodes that are
connected by this edge.}
@item{@scheme[caller-time] and @scheme[callee-time] hold the time
spent on this edge from the caller's or the callee's perspective.
These times are different from each other (as well as from the total
time) because the sums that make them are each divided by the number
of times the caller or the callee was on the stack.
To understand this difference, consider a stack snapshot holding
@tt{A @'rarr B @'rarr B @'rarr B @'rarr A}, and representing a
second of observed cpu time. For this sample, the @tt{A @'rarr B}
edge is charged by a whole second for its total time (the same goes
for the @tt{A @'rarr A} edge, for example). Its caller time is
charged 1/2 second because @tt{A} appears twice in this stack
snapshot (in the other half, @tt{A} is chared for being a leaf ---
the caller of the special @scheme[*-node]), and its callee time is
charged 1/3 respectively.}
]}

View File

@ -0,0 +1,3 @@
#lang setup/infotab
(define scribblings '(("profile.scrbl" () (tool-library))))

View File

@ -0,0 +1,25 @@
#lang scribble/doc
@(require scribble/manual
(for-label scheme))
@title{Statistical Profiler}
The @scheme[profile] collection implements a statistical profiler.
The profiling is done by running a background thread that collects
stack snapshots via @scheme[continuation-mark-set->context], meaning
that the result is an estimate of the execution costs and it is
limited to the kind of information that
@scheme[continuation-mark-set->context] produces (most notably being
limited to functions calls, and subject to compiler optimizations);
but the result is often useful. In practice, since this method does
not require recompilation of your source and has very little runtime
overhead, it can be used for longer runs which compensates for these
limits.
@table-of-contents[]
@include-section["toplevel.scrbl"]
@include-section["sampler.scrbl"]
@include-section["analyzer.scrbl"]
@include-section["renderers.scrbl"]

View File

@ -0,0 +1,129 @@
#lang scribble/doc
@(require scribble/manual
(for-label scheme
profile/analyzer
(prefix-in text: profile/render-text)
(prefix-in graphviz: profile/render-graphviz)))
@title[#:tag "renderers"]{Profile Renderers}
After collecting the profile samples and analyzing the data, the last
aspect of profiling is to render the results. The profile collection
provides several renderers, each providing a rendering function that
consumes a @scheme[profile] instance. See the
@seclink["analyzer"]{analyzer} section for a description of the
@scheme[profile] struct if you want to implement your own renderer.
@;--------------------------------------------------------------------
@section{Textual Rendering}
@defmodule[profile/render-text]
@defproc[(render
[profile-data profile?]
[#:truncate-source truncate-source exact-nonnegative-integer? 50]
[#:hide-self hide-self% (between/c 0 1) 1/100]
[#:hide-subs hide-subs% (between/c 0 1) 2/100])
void?]{
Prints the given @scheme[profile] results as a textual table.
The printout begins with some general facts about the profile, and
then a table that represents the call-graph is printed. Each row in
this table looks like:
@verbatim[#:indent 2]{
B [M1] M2%
[N1] N2(N3%) N4(N5%) A ...path/to/source.ss:12:34
C [M3] M4%}
Where actual numbers appear in the printout. The meaning of the
numbers and labels is as follows:
@itemize{
@item{@tt{A} --- the name of the function that this node represents,
followed by the source location for the function if it is known.
The name can be ``???'' for functions with no identifier, but in
this case the source location will identify them.}
@item{@tt{N1} --- an index number associated with this node. This is
important in references to this function, since the symbolic names
are not unique (and some can be missing). The number itself has no
significance, it simply goes from 1 up.}
@item{@tt{N2} --- the time (in milliseconds) that this function has
been anywhere in a stack snapshot. This is the total time that the
execution was somewhere in this function or in its callees.
(Corresponds to the @scheme[node-total] field.)}
@item{@tt{N3} --- this is the percentage of the node's total time
(@tt{N2}) from the total observed time of the profile. An entry
with a 100% refers to a function that was active throughout the
whole execution.}
@item{@tt{N4} --- the time (in milliseconds) that this function has
been at the top of the stack snapshot. This is the time that this
function consumed doing work itself rather than calling other
functions. (Corresponds to the @scheme[node-self] field.)}
@item{@tt{N5} --- this is the percentage of @tt{N4} out of the total
observed time of the profile. Functions with high values here can
be good candidates for optimization, But, of course, they can
represent doing real work due to one of its callers that need to be
optimized.}
@item{@tt{B} and @tt{C} --- these are labels for the callers and
callees of the function. Any number of callers and callees can
appear here (including 0). The function itself can also appear in
both places if it is (non-tail) recursive.}
@item{@tt{M1} and @tt{M3} --- the index numbers for @tt{B} and
@tt{C}. They can be used to disambiguate functions with the same
name, as well as a quick way to find the corresponding entry in the
table.}
@item{@tt{M2} and @tt{M4} --- the percentages of the time @tt{A} spent
being called by @tt{B} and calling @tt{C}. These percentages
represent the time that this edge was found on a stack snapshot,
divided by the number of occurrences of @tt{A} on the same snapshot.
The number is the percentage of these times out of @tt{N2}, the
total time @tt{A} has been active.
The total percentages for the all caller and for all callees should
be close to 100% minus the time @tt{A} was the leaf or the root.
These values correspond to the @scheme[edge-caller-time] and
@scheme[edge-callee-time] fields; see the documentation for further
details.}
}
The function has a few keyword arguments to customize its output:
@itemize[
@item{The @scheme[truncate-source] argument determines the length that
the source string should take (together with its label).}
@item{@scheme[hide-self%] and @scheme[hide-subs%] control hiding some
of the nodes. A node is hidden if its self time (@tt{N3} in the
above example) is smaller than @scheme[hide-self%] @emph{and} if all
places where it occurs as a caller or a callee have percentages that
are smaller than @scheme[hide-subs%]. The reason for requiring both
conditions is to avoid having ``dangling references'' to hidden
nodes.}
]}
@;--------------------------------------------------------------------
@section{Graph Rendering}
@defmodule[profile/render-graphviz]
@defproc[(render
[profile-data profile?]
[#:hide-self hide-self% (between/c 0 1) 1/100]
[#:hide-subs hide-subs% (between/c 0 1) 2/100])
void?]{
Prints the given @scheme[profile] results as a Graphviz directed
graph.
This is an experimental module, provided mostly as a proof-of-concept.
It renders the profile's call-graph as a graph representation for one
of the Graphviz tools to render. Nodes are colored according to their
`self' percentages, and edges.
The keyword arguments control hiding nodes in the same way as with the
textual renderer.}

View File

@ -0,0 +1,60 @@
#lang scribble/doc
@(require scribble/manual
(for-label scheme profile/sampler profile/analyzer))
@title[#:tag "sampler"]{Collecting Profile Information}
@defmodule[profile/sampler]
@defproc[(create-sampler [to-track (or/c thread? custodian?
(listof (or/c thread? custodian?)))]
[delay nonnegative-number?])
((symbol?) (any/c) . ->* . any/c)]{
Creates a sample collector thread, which tracks the given
@scheme[to-track] value every @scheme[delay] seconds. The
@scheme[to-track] value can be either a thread (track just that
thread), a custodian (track all threads managed by the custodian), or
a list of threads and/or custodians. If a custodian is given, it must
be subordinate to the current custodian.
The resulting value is a controller function, which consumes a message
consisting of a symbol and an optional argument, and can affect the
sampler. The following messages are currently supported:
@itemize[
@item{@scheme['pause] and @scheme['resume] will stop or resume data
collection. These messages can be nested. Note that the thread will
continue running it will just stop collecting snapshots.}
@item{@scheme['stop] kills the controlled thread. It should be called
when no additional data should be collected. (This is currently
irreversible: there is no message to start a new sampler thread.)}
@item{@scheme['set-tracked!] with a value will change the tracked
objects (initially specified as the @scheme[to-track] argument) to
the given value.}
@item{@scheme['set-tracked!] with a value will change the delay that
the sampler us taking between snapshots. Note that although
changing this means that the snapshots are not uniformly
distributed, the results will still be sensible --- this is because
the cpu time between samples is taken into account when the
resulting data is analyzed.}
@item{Finally, a @scheme['get-snapshots] message will make the
controller return the currently collected data. Note that this can
be called multiple times, each call will return the data thatis
collected up to that point in time. In addition, it can be (and
usually is) called after the sampler was stopped.
The value that is returned should be considered as an undocumented
internal detail of the profiler, to be sent to
@scheme[analyze-samples] for analysis. The reason this is not done
automatically, is that a future extension might allow you to combine
several sampler results, making it possible to combine a profile
analysis from several individual runs, possibly from different
machines.}
]}

View File

@ -0,0 +1,67 @@
#lang scribble/doc
@(require scribble/manual
(for-label scheme profile profile/sampler
(only-in profile/analyzer analyze-samples)
(prefix-in text: profile/render-text)))
@title{Toplevel Interface}
@defmodule[profile]
This module provides one procedure and one macroxs that are convenient
high-level entry points for timing expressions. This hides the
details that are available through other parts of the library, and is
intended as a convenient tool for profiling code.
@defproc[(profile-thunk [thunk (-> any/c)]
[#:delay delay nonnegative-number? 0.05]
[#:repeat iterations exact-nonnegative-integer? 1]
[#:threads threads? any/c #f]
[#:render renderer text:render])
void?]{
Executes the given thunk while collecting profiling data, and render
this data when done. Keyword arguments can customize the profiling:
@itemize[
@item{The profiler works by @scheme[create-sampler] starting a
``sampler'' thread whose job is to collect stack samples
periodically (using @scheme[continuation-mark-set->context]).
@scheme[delay] determines the amount of time the sampler
@scheme[sleep]s for between samples. Note that this is will be
close, but not identical to, the frequency in which data is actually
sampled.}
@item{When the profiled computation takes a short amount of time, the
collected data will not be accurate. In this case, you can specify
an @scheme[iterations] argument to repeat the evaluation a number of
times which will improve the accuracy of the resulting report.}
@item{Normally, the sampler collects snapshots of the
@scheme[current-thread]'s stack. If there is some computation that
happens on a different thread, that work will not be reflected in
the results: the only effect will be suspiciously small value for
the observed time, because the collected data is taking into account
the cpu time that the thread actually performed (it uses
@scheme[current-process-milliseconds] with the running thread as an
argument). Specifying a non-@scheme[#f] value for the
@scheme[threads?] argument will arrange for all threads that are
started during the evaluation to be tracked. Note that this means
that the computation will actually run in a new sub-custodian, as
this is the only way to be able to track such threads.}
@item{Once the computation has finished, the sampler is stopped, and
the accumulated data is collected. It is then analyzed by
@scheme[analyze-samples], and the analyzed profile data is fed into
a renderer. Use an identity function (@scheme[values]) to get the
analyzed result, and render it yourself, or use one of the existing
renderers (see @secref["renderers"]).}
]}
@defform[(profile expr keyword-arguments ...)]{
A macro version of @scheme[profile-thunk]. The keyword arguments can
be specified in the same was as for a function call: they can appear
before and/or after the expression to be profiled.}

37
collects/profile/utils.ss Normal file
View File

@ -0,0 +1,37 @@
#lang scheme/base
(provide format-percent format-source get-hidden)
(require "analyzer.ss")
(define (format-percent percent)
(let ([percent (inexact->exact (round (* percent 1000)))])
(format "~a.~a%" (quotient percent 10) (modulo percent 10))))
(define (format-source src)
(if src
(format "~a:~a"
(srcloc-source src)
(if (srcloc-line src)
(format "~a:~a" (srcloc-line src) (srcloc-column src))
(format "#~a" (srcloc-position src))))
"(unknown source)"))
;; hide a node if its self time is smaller than the self threshold *and* all of
;; its edges are below the sub-node threshold too -- this avoids confusing
;; output where a node does not have an entry but appears as a caller/callee.
(define (get-hidden profile hide-self% hide-subs%)
(define self% (or hide-self% 0))
(define subs% (or hide-subs% 0))
(define total-time (profile-total-time profile))
(define (hide? node)
(define (hide-sub? get-subs edge-sub edge-sub-time)
(define %s
(map (lambda (edge)
(let ([total (node-total (edge-sub edge))])
(if (zero? total) 0 (/ (edge-sub-time edge) total))))
(get-subs node)))
(subs% . >= . (apply max %s)))
(and (self% . >= . (/ (node-self node) total-time))
(hide-sub? node-callees edge-callee edge-caller-time)
(hide-sub? node-callers edge-caller edge-callee-time)))
(if (or (> self% 0) (> subs% 0)) (filter hide? (profile-nodes profile)) '()))

View File

@ -0,0 +1,104 @@
#lang scheme/base
(require tests/eli-tester profile/analyzer scheme/match scheme/list)
(define A '(A . #f))
(define B '(B . #f))
(define C '(C . #f))
(define (analyze cpu+lists)
(profile->sexpr
(analyze-samples
(cons (car cpu+lists)
(map (lambda (x) (append (take x 2) (reverse (drop x 2))))
(reverse (cdr cpu+lists)))))))
(define (profile->sexpr prof)
(define (node-id* node)
(or (node-id node) (if (node-src node) '??? '*)))
(define (edges->sexprs node get get-time)
(for/list ([edge (get node)])
`(,(node-id* (edge-caller edge)) -> ,(node-id* (edge-callee edge))
time= ,(get-time edge)
total= ,(edge-time edge))))
(define (node->sexpr node)
`(,(node-id* node)
total= ,(node-total node)
self= ,(node-self node)
callers: ,@(edges->sexprs node node-callers edge-caller-time)
callees: ,@(edges->sexprs node node-callees edge-callee-time)
threads= ,(node-thread-ids node)))
`(total= ,(profile-total-time prof)
samples= ,(profile-sample-number prof)
cpu= ,(profile-cpu-time prof)
thread-times= ,(profile-thread-times prof)
,@(map node->sexpr (cons (profile-*-node prof) (profile-nodes prof)))))
(test
(match (analyze `(10
[0 0 ,A]
[0 1 ,A]))
[`(total= 2 samples= 2 cpu= 10 thread-times= ([0 . 2])
[* total= 2 self= 0
callers: [A -> * time= 2 total= 2]
callees: [* -> A time= 2 total= 2]
threads= ()]
[A total= 2 self= 2
callers: [* -> A time= 2 total= 2]
callees: [A -> * time= 2 total= 2]
threads= (0)])
'ok]
[bad (error 'test ">>> ~s" bad)])
(match (analyze `(10
[0 0 ,A ,B ,A]
[0 1 ,A ,B ,A]))
[`(total= 2 samples= 2 cpu= 10 thread-times= ([0 . 2])
[* total= 2 self= 0
callers: [A -> * time= 2 total= 2]
callees: [* -> A time= 2 total= 2]
threads= ()]
[A total= 2 self= 2
callers: [B -> A time= 2/2 total= 2]
[* -> A time= 2/2 total= 2]
callees: [A -> B time= 2/2 total= 2]
[A -> * time= 2/2 total= 2]
threads= (0)]
[B total= 2 self= 0
callers: [A -> B time= 2 total= 2]
callees: [B -> A time= 2 total= 2]
threads= (0)])
'ok]
[bad (error 'test ">>> ~s" bad)])
(match (analyze `(10
[0 0 ,A ,B ,A]
[0 1 ,A ,C ,A]
[0 2 ,A ,C ,A]
[0 3 ,A ,C ,A]))
[`(total= 4 samples= 4 cpu= 10 thread-times= ([0 . 4])
[* total= 4 self= 0
callers: [A -> * time= 4 total= 4]
callees: [* -> A time= 4 total= 4]
threads= ()]
[A total= 4 self= 4
callers: [* -> A time= 4/2 total= 4]
[C -> A time= 3/2 total= 3]
[B -> A time= 1/2 total= 1]
callees: [A -> * time= 4/2 total= 4]
[A -> C time= 3/2 total= 3]
[A -> B time= 1/2 total= 1]
threads= (0)]
[C total= 3 self= 0
callers: [A -> C time= 3 total= 3]
callees: [C -> A time= 3 total= 3]
threads= (0)]
[B total= 1 self= 0
callers: [A -> B time= 1 total= 1]
callees: [B -> A time= 1 total= 1]
threads= (0)])
'ok]
[bad (error 'test ">>> ~s" bad)])
)