From e29c411d539ccac30d7ba97d6683524a5588c0e9 Mon Sep 17 00:00:00 2001
From: Eli Barzilay <eli@racket-lang.org>
Date: Mon, 30 Mar 2009 00:15:38 +0000
Subject: [PATCH] statistical profiler, core and text interface part done

svn: r14340

original commit: 96d7291bfb1e4f4a600ecda14f314ccccfa2ff48
---
 collects/profile/analyzer.ss                 | 258 +++++++++++++++++++
 collects/profile/main.ss                     |  69 +++++
 collects/profile/render-graphviz.ss          |  41 +++
 collects/profile/render-text.ss              | 137 ++++++++++
 collects/profile/sampler.ss                  | 134 ++++++++++
 collects/profile/scribblings/analyzer.scrbl  | 156 +++++++++++
 collects/profile/scribblings/info.ss         |   3 +
 collects/profile/scribblings/profile.scrbl   |  25 ++
 collects/profile/scribblings/renderers.scrbl | 129 ++++++++++
 collects/profile/scribblings/sampler.scrbl   |  60 +++++
 collects/profile/scribblings/toplevel.scrbl  |  67 +++++
 collects/profile/utils.ss                    |  37 +++
 collects/tests/profile/main.ss               | 104 ++++++++
 13 files changed, 1220 insertions(+)
 create mode 100644 collects/profile/analyzer.ss
 create mode 100644 collects/profile/main.ss
 create mode 100644 collects/profile/render-graphviz.ss
 create mode 100644 collects/profile/render-text.ss
 create mode 100644 collects/profile/sampler.ss
 create mode 100644 collects/profile/scribblings/analyzer.scrbl
 create mode 100644 collects/profile/scribblings/info.ss
 create mode 100644 collects/profile/scribblings/profile.scrbl
 create mode 100644 collects/profile/scribblings/renderers.scrbl
 create mode 100644 collects/profile/scribblings/sampler.scrbl
 create mode 100644 collects/profile/scribblings/toplevel.scrbl
 create mode 100644 collects/profile/utils.ss
 create mode 100644 collects/tests/profile/main.ss
diff --git a/collects/profile/analyzer.ss b/collects/profile/analyzer.ss
new file mode 100644
index 0000000..4da0faa
--- /dev/null
+++ b/collects/profile/analyzer.ss
@@ -0,0 +1,258 @@
+#lang scheme/base
+
+;; Analyzer for the sampler results
+
+(provide analyze-samples)
+
+(require scheme/list)
+
+;; An encapsulation of an analyzed profile call graph:
+;; - total-time: the total time observed in msec (this is generally different
+;;   than the time it took to run the profile).
+;; - sample-number: the number of samples taken.
+;; - thread-times: a list of (<thread-id> . msec) for the time spent in
+;;   observed threads.
+;; - nodes: the list of call-graph nodes sorted by their total time.
+;; - *-node: a special node that is connected as a "caller" for all toplevel
+;;   functions and a "callee" for all leaf functions.  It will also be
+;;   identifiable by having both id and src fields being #f.  Can be used to
+;;   start a graph traversal from the top or the bottom.
+(provide (struct-out profile))
+(define-struct profile
+  (total-time cpu-time sample-number thread-times nodes *-node))
+
+;; An entry for a single profiled function:
+;; - id, src: the corresponding values from `continuation-mark-set->context'.
+;; - thread-ids: the list of thread identifiers this function has been seen in.
+;; - total: total msecs it participated in (= time in it, including callees).
+;; - self: msecs where it was at the top of the stack (= time in its own code).
+;; - callers, callees: a list of `edge' values for the time spent while it was
+;;   called by the repective <node>, or it called it, sorted in decreasing msec
+;;   time.
+;; Note that the sum of caller/callee edges including the special `*-node'
+;; should be equal to the `total' time.  So the edge from/to the `*-node' can
+;; be used to get the time spent as a leaf or as a root divided by the number
+;; of time the function appeared on the stack: so this value can be displayed
+;; in the call-graph and the numbers will sum up nicely to a 100%.
+(provide (struct-out node))
+(define-struct node (id src thread-ids total self callers callees)
+  #:mutable
+  #:property prop:custom-write
+  (lambda (node o w?) (fprintf o "#<node:~s>" (or (node-id node) '???))))
+
+;; An edge representing function calls between two nodes:
+;; - time: the total time spent while the call was somewhere on the stack.
+;; - caller, callee: the two relevant `node' values.
+;; - caller-time, callee-time: the time that the caller/callee spent in this
+;;   call (different from the above time because each stack sample's time is
+;;   divided by the number of times the caller/callee appears in that slice).
+(provide (struct-out edge))
+(define-struct edge (total caller caller-time callee callee-time)
+  #:mutable
+  #:property prop:custom-write
+  (lambda (edge o w?)
+    (fprintf o "#<edge:~s-~s>"
+             (or (node-id (edge-caller edge)) '???)
+             (or (node-id (edge-callee edge)) '???))))
+
+(define with-hash:not-found (gensym))
+(define-syntax-rule (with-hash <hash> <key> <expr>)
+  (let ([t <hash>] [k <key>])
+    (let ([v (hash-ref t k with-hash:not-found)])
+      (if (eq? v with-hash:not-found)
+        (let ([v <expr>]) (hash-set! t k v) v)
+        v))))
+
+;; This function analyzes the output of the sampler.  Returns a `profile'
+;; struct holding a list of `node' values, each one representing a node in the
+;; call graph, with the relevant information filled in.  The results are sorted
+;; by the total time.
+(define (analyze-samples cpu-time+samples)
+  (define cpu-time (car cpu-time+samples))
+  (define samples  (cdr cpu-time+samples))
+  (define samples-by-thread
+    (let ([by-thread (split-by-thread samples)])
+      (for ([samples (in-vector by-thread)] [i (in-naturals 0)])
+        (vector-set! by-thread i (get-times samples)))
+      by-thread))
+  (define id+src->node-hash (make-hasheq))
+  (define (id+src->node id+src)
+    (with-hash id+src->node-hash id+src
+               (make-node (car id+src) (cdr id+src) '() 0 0 '() '())))
+  ;; special node that is the caller of toplevels and callee of leaves
+  (define *-node (id+src->node '(#f . #f)))
+  (define call->edge
+    (let ([t (make-hasheq)])
+      (lambda (ler lee)
+        (with-hash (with-hash t ler (make-hasheq)) lee
+                   (let ([e (make-edge 0 ler 0 lee 0)])
+                     (set-node-callers! lee (cons e (node-callers lee)))
+                     (set-node-callees! ler (cons e (node-callees ler)))
+                     e)))))
+  (define total-time 0)
+  (define thread-times (make-vector (vector-length samples-by-thread) 0))
+  (for ([thread-samples (in-vector samples-by-thread)]
+        [thread-id (in-naturals 0)]
+        #:when #t
+        [sample (in-list thread-samples)])
+    (define msecs (car sample))
+    (define (connect ler lee ler# lee#)
+      (define edge (call->edge ler lee))
+      (set-edge-caller-time! edge (+ (edge-caller-time edge) (/ msecs lee#)))
+      (set-edge-callee-time! edge (+ (edge-callee-time edge) (/ msecs ler#)))
+      edge)
+    (define stack ; the stack snapshot, translated to `node' values
+      (for/list ([id+src (in-list (cdr sample))])
+        (let* ([node (id+src->node id+src)] [tids (node-thread-ids node)])
+          (unless (memq thread-id tids)
+            (set-node-thread-ids! node (cons thread-id tids)))
+          node)))
+    (define counts (get-counts stack))
+    (define stack+counts (map (lambda (x) (assq x counts)) stack))
+    (define edges
+      (if (null? stack)
+        '()
+        (append (let ([first (car stack+counts)] [last (last stack+counts)])
+                  (list (connect *-node (car last) 1 (cdr last))
+                        (connect (car first) *-node (cdr first) 1)))
+                (for/list ([callee (in-list stack+counts)]
+                           [caller (in-list (cdr stack+counts))])
+                  (connect (car caller) (car callee)
+                           (cdr caller) (cdr callee))))))
+    (set! total-time (+ msecs total-time))
+    (for ([p (in-list counts)])
+      (set-node-total! (car p) (+ msecs (node-total (car p)))))
+    (for ([e (remove-duplicates edges eq?)])
+      (set-edge-total! e (+ msecs (edge-total e))))
+    (vector-set! thread-times thread-id
+                 (+ msecs (vector-ref thread-times thread-id)))
+    (when (pair? stack)
+      (set-node-self! (car stack) (+ (node-self (car stack)) msecs))))
+  (set-node-total! *-node total-time)
+  ;; convert the nodes from the hash to a list, do a topological sort, and then
+  ;; sort by total time (combining both guarantees(?) sensible order)
+  (let* ([nodes (hash-map id+src->node-hash (lambda (k v) v))]
+         [nodes (topological-sort *-node (length nodes))]
+         [nodes (sort (remq *-node nodes) > #:key node-total)])
+    ;; sort all the edges in the nodes according to total time
+    (for ([n (in-list nodes)])
+      (set-node-callees! n (sort (node-callees n) > #:key edge-callee-time))
+      (set-node-callers! n (sort (node-callers n) > #:key edge-caller-time)))
+    (make-profile
+     total-time
+     cpu-time
+     (length samples)
+     (for/list ([time (in-vector thread-times)] [n (in-naturals 0)])
+       (cons n time))
+     nodes
+     *-node)))
+
+;; A simple topological sort of nodes using BFS, starting from node `x' which
+;; will be the special *-node.  Note that `expected-length' is only given for
+;; the sanity check in the end.
+(define (topological-sort x expected-length)
+  (define sorted
+    (let loop ([todo (list x)] [seen (list x)])
+      (if (null? todo)
+        '()
+        (let* ([next (append-map (lambda (x) (map edge-caller (node-callers x)))
+                                 todo)]
+               [next (remove-duplicates next)]
+               [next (remq* seen next)])
+          (append todo (loop next (append next seen)))))))
+  (if (= expected-length (length sorted))
+    sorted
+    (error 'topological "internal error")))
+
+;; Groups raw samples by their thread-id, returns a vector with a field for
+;; each thread id holding the sample data for that thread.  The samples in
+;; these are reversed (so they'll be sorted going forward in time).
+(define (split-by-thread samples)
+  (define threads
+    (make-vector (add1 (for/fold ([n -1]) ([sample (in-list samples)])
+                         (max (car sample) n)))
+                 '()))
+  (for ([sample (in-list samples)])
+    (let ([id (car sample)] [data (cdr sample)])
+      (vector-set! threads id (cons data (vector-ref threads id)))))
+  threads)
+#|
+(equal? (split-by-thread '())
+        '#())
+(equal? (split-by-thread '([0 x]))
+        '#([(x)]))
+(equal? (split-by-thread '([0 x] [0 y] [0 z]))
+        '#([(z) (y) (x)]))
+(equal? (split-by-thread '([0 x] [1 y] [2 z]))
+        '#([(x)] [(y)] [(z)]))
+(equal? (split-by-thread '([0 x1] [1 y1] [0 x2] [2 z1] [0 x3] [2 z2]))
+        '#([(x3) (x2) (x1)] [(y1)] [(z2) (z1)]))
+|#
+
+;; gets a list of thread-id and data for that thread beginning with the
+;; millisecond count, and returns a similar list where the samples begin with
+;; the time spent for that sample.  The time spent is taken as half of the two
+;; touching ranges; for example, if there are three samples showing snapshot
+;; times of 10, 20, 60, then the middle one is assumed to have a time of 25.
+;; For the first and last samples, the time is twice the half of the single
+;; touching range -- with this example, this would be 10 for the first and 40
+;; for the last.  If there is a thread with just one sample, it is dropped.
+(define (get-times samples)
+  (cond
+    ;; nothing to do
+    [(null? samples) '()]
+    ;; throw out a single sample
+    [(null? (cdr samples)) '()]
+    [else (let loop ([samples samples]
+                     [prevs   (cons #f (map car samples))]
+                     [r       '()])
+            (if (null? samples)
+              (reverse r)
+              (let* ([prev (car prevs)]
+                     [cur  (caar samples)]
+                     [data (cdar samples)]
+                     [prevs (cdr prevs)]
+                     [samples (cdr samples)]
+                     [next (and (pair? samples) (caar samples))])
+                (loop samples prevs
+                      (cons (cons (if next
+                                    ;; not the last: there must be a next
+                                    (if prev (/ (- next prev) 2) (- next cur))
+                                    ;; last one: there must be a prev
+                                    (- cur prev))
+                                  data)
+                            r)))))]))
+#|
+(equal? (get-times '())
+        '())
+(equal? (get-times '([10 a]))
+        '())
+(equal? (get-times '([10 a] [20 b]))
+        '([10 a] [10 b]))
+(equal? (get-times '([10 a] [20 b] [60 c]))
+        '([10 a] [25 b] [40 c]))
+(equal? (get-times '([10 a] [20 b] [30 c] [40 d]))
+        '([10 a] [10 b] [10 c] [10 d]))
+(equal? (get-times '([10 a] [20 b] [60 c] [80 d]))
+        '([10 a] [25 b] [30 c] [20 d]))
+|#
+
+;; returns a list of (cons item occurrences) for the items in l
+(define (get-counts l)
+  (let loop ([l l] [r '()])
+    (if (null? l)
+      r
+      (let ([1st (car l)])
+        (let loop* ([l1 '()] [c 1] [l (cdr l)])
+          (cond [(null? l) (loop l1 (cons (cons 1st c) r))]
+                [(eq? 1st (car l)) (loop* l1 (add1 c) (cdr l))]
+                [else (loop* (cons (car l) l1) c (cdr l))]))))))
+#|
+(equal? (get-counts '()) '())
+(equal? (get-counts '(1)) '([1 . 1]))
+(equal? (get-counts '(1 1 1)) '([1 . 3]))
+(define (set=? xs ys) (null? (append (remove* xs ys) (remove* ys xs))))
+(set=? (get-counts '(1 2 3)) '([1 . 1] [2 . 1] [3 . 1]))
+(set=? (get-counts '(1 2 2 3 3 3)) '([1 . 1] [2 . 2] [3 . 3]))
+(set=? (get-counts '(3 1 2 3 2 3)) '([1 . 1] [2 . 2] [3 . 3]))
+|#
diff --git a/collects/profile/main.ss b/collects/profile/main.ss
new file mode 100644
index 0000000..d6767d0
--- /dev/null
+++ b/collects/profile/main.ss
@@ -0,0 +1,69 @@
+#lang scheme/base
+
+(require "sampler.ss" (except-in "analyzer.ss" profile)
+         (prefix-in text: "render-text.ss")
+         (for-syntax scheme/base))
+
+(define (profile-thunk thunk
+                       #:delay   [delay 0.05]
+                       #:repeat  [rpt 1]
+                       #:threads [threads? #f]
+                       #:render  [renderer text:render])
+  (define cust (and threads? (make-custodian (current-custodian))))
+  (define sampler (create-sampler (if threads?
+                                    (list cust (current-thread))
+                                    (current-thread))
+                                  delay))
+  (define (run) (for ([i (in-range rpt)]) (thunk)))
+  (with-handlers ([void (lambda (e)
+                          (fprintf (current-error-port)
+                                   "profiled thunk error: ~a\n"
+                                   (if (exn? e)
+                                     (exn-message e)
+                                     (format "~e" e))))])
+    (if threads?
+      (parameterize ([current-custodian cust]) (run))
+      (run)))
+  (sampler 'stop)
+  (renderer (analyze-samples (sampler 'get-snapshots))))
+
+(define-syntax (profile stx)
+  (syntax-case stx ()
+    [(_ x ...)
+     (let loop ([expr #f] [kwds '()] [xs (syntax->list #'(x ...))])
+       (cond
+         [(null? xs)
+          (if expr
+            (with-syntax ([expr expr] [kwds (reverse kwds)])
+              #'(profile-thunk (lambda () expr) . kwds))
+            (raise-syntax-error 'profile "missing expression" stx))]
+         [(keyword? (syntax-e (car xs)))
+          (if (pair? (cdr xs))
+            (loop expr (list* (cadr xs) (car xs) kwds) (cddr xs))
+            ;; let #%app throw the error
+            (loop expr (cons (car xs) kwds) (cdr xs)))]
+         [expr (raise-syntax-error 'profile "redundant expresion given"
+                                   stx (car xs))]
+         [else (loop (car xs) kwds (cdr xs))]))]))
+
+
+(define (fib1 n) (if (<= n 1) n (+ (fib1 (- n 1)) (fib1 (- n 2)))))
+(define (fib22 n) (if (<= n 2) 1 (+ (fib22 (- n 1)) (fib22 (- n 2)))))
+(define (fib3 n)
+  (for ([i (in-range 100000000)]) (* i 3))
+  (if (<= n 2) 1 (+ (fib22 (- n 1)) (fib22 (- n 2)))))
+(define (fibs n) (+ (fib1 n) (fib22 n) (fib3 n)))
+
+(define (foo n)
+  (define ch (make-channel))
+  (define (bg-fib) (channel-put ch (fib1 n)))
+  (thread bg-fib)
+  (list (fib22 n) (channel-get ch)))
+
+(require "render-graphviz.ss")
+
+(profile (fibs 35)
+         ;(dynamic-require '(lib "scribblings/reference/reference.scrbl") #f)
+         ;(foo 35)
+         ;#:render render
+         #:threads #t)
diff --git a/collects/profile/render-graphviz.ss b/collects/profile/render-graphviz.ss
new file mode 100644
index 0000000..2c6e5c8
--- /dev/null
+++ b/collects/profile/render-graphviz.ss
@@ -0,0 +1,41 @@
+#lang scheme/base
+
+(provide render)
+
+(require "analyzer.ss" "utils.ss")
+
+(define (render profile
+                #:hide-self [hide-self% 1/100]
+                #:hide-subs [hide-subs% 2/100])
+  (define *-node (profile-*-node profile))
+  (define hidden (get-hidden profile hide-self% hide-subs%))
+  (define nodes  (remq* hidden (profile-nodes profile)))
+  (define total-time (profile-total-time profile))
+  (define node->
+    (let ([t (make-hasheq)])
+      (for ([node (in-list nodes)] [idx (in-naturals 1)])
+        (let ([id (node-id node)] [src (node-src node)])
+        (hash-set! t node
+                   (list (format "node~a" idx)
+                         (if id
+                           (format "~a" id)
+                           (regexp-replace #rx"^.*/"
+                                           (format-source (node-src node))
+                                           ""))))))
+      (lambda (mode node)
+        ((case mode [(index) car] [(label) cadr]) (hash-ref t node)))))
+  (define max-self%
+    (/ (for/fold ([m 0]) ([node (in-list nodes)]) (max m (node-self node)))
+       total-time))
+  (printf "digraph Profile {\n")
+  (for ([node (in-list nodes)])
+    (define self% (/ (node-self node) total-time))
+    (printf "~a [" (node-> 'index node))
+    (printf "label=~s, " (node-> 'label node))
+    (printf "fillcolor=\"1,~a,1\", " (exact->inexact (/ self% max-self%)))
+    (printf "style=filled];\n")
+    (for ([edge (in-list (node-callees node))])
+      (define callee (edge-callee edge))
+      (unless (or (eq? *-node callee) (memq callee hidden))
+        (printf "~a -> ~a;\n" (node-> 'index node) (node-> 'index callee)))))
+  (printf "}\n"))
diff --git a/collects/profile/render-text.ss b/collects/profile/render-text.ss
new file mode 100644
index 0000000..0688f8f
--- /dev/null
+++ b/collects/profile/render-text.ss
@@ -0,0 +1,137 @@
+#lang at-exp scheme/base
+
+(provide render)
+
+(require "analyzer.ss" "utils.ss" scheme/list)
+
+(define (f:msec msec)
+  (number->string (round (inexact->exact msec))))
+
+(define (f:msec* msec)
+  (string-append (f:msec msec) "ms"))
+
+(define (display-table aligns table)
+  ;; * thunks are used for cells that are ignored when inspecting widths
+  ;; * chars are used for filler cells
+  (define (display-line strings)
+    (printf "~a\n" (regexp-replace #rx" +$" (apply string-append strings) "")))
+  (let ([widths (let loop ([table table])
+                  (let ([table (filter pair? table)])
+                    (if (null? table) '()
+                        (cons (apply max (filter-map
+                                          (lambda (x)
+                                            (and (string? (car x))
+                                                 (string-length (car x))))
+                                          table))
+                              (loop (map cdr table))))))])
+    (for ([row (in-list table)])
+      (display-line
+       (for/list ([cell (in-list row)]
+                  [width (in-list widths)]
+                  [align (in-list aligns)])
+         (let* ([cell (cond [(char? cell) (make-string width cell)]
+                            [(procedure? cell) (cell)]
+                            [else cell])]
+                [pad  (make-string (max 0 (- width (string-length cell)))
+                                   #\space)])
+           (case align
+             [(l) (string-append cell pad)]
+             [(r) (string-append pad cell)]
+             [else (error 'internal-error "poof")])))))))
+
+(define (render profile
+                #:truncate-source [truncate-source 50]
+                #:hide-self       [hide-self% 1/100]
+                #:hide-subs       [hide-subs% 2/100])
+  (define (show . xs)
+    (let loop ([x xs])
+      (cond [(or (not x) (null? x) (void? x)) (void)]
+            [(pair? x) (loop (car x)) (loop (cdr x))]
+            [else (display x)]))
+    (newline))
+  (define total-time    (profile-total-time    profile))
+  (define cpu-time      (profile-cpu-time      profile))
+  (define sample-number (profile-sample-number profile))
+  (define granularity   (/ total-time sample-number))
+  (define threads+times (profile-thread-times  profile))
+  (define *-node        (profile-*-node profile))
+  (define hidden        (get-hidden profile hide-self% hide-subs%))
+  (define nodes         (remq* hidden (profile-nodes profile)))
+  (define node->
+    (let ([t (make-hasheq)])
+      (for ([node (in-list nodes)] [idx (in-naturals 1)])
+        (let ([index (format "[~a]" idx)]
+              [label (format "~a" (or (node-id node) '???))])
+          (hash-set! t node (list index label @string-append{@label @index}))))
+      (lambda (mode node)
+        ((case mode [(index) car] [(label) cadr] [(sub-label) caddr])
+         (hash-ref t node)))))
+  (define (sep ch) (list ch ch ch ch ch ch ch ch ch ch))
+  (define =sep (sep #\=))
+  (define -sep (sep #\-))
+  @show{
+    Profiling results
+    -----------------
+      Total cpu time observed: @f:msec*[total-time] (out of @f:msec*[cpu-time])
+      Number of samples taken: @sample-number (once every @f:msec*[granularity])
+    }
+  (when (> (length threads+times) 1)
+    @show{  Threads observed:        @(length threads+times)})
+  (when (pair? hidden)
+    (let* ([hidden (length hidden)]
+           [nodes (length (profile-nodes profile))]
+           [self% @string-append{self<@(format-percent (or hide-self% 0))}]
+           [subs% @string-append{local<@(format-percent (or hide-subs% 0))}]
+           [%s (cond [(not hide-self%) subs%]
+                     [(not hide-subs%) self%]
+                     [else @string-append{@self% and @subs%}])])
+      @show{  (Hiding functions with @|%s|: @|hidden| of @nodes hidden)}))
+  (newline)
+  (display-table
+   '(r l r l l r l l l r l l)
+   (append*
+    `(,=sep
+      ("   " " " "   ""     " " " "  ""     " " " "  Caller")
+      ("Idx" " " "To""tal   " " " "Se""lf   " " " "Name+src" "Local%")
+      ("   " " " " ms""(pct)" " " "ms""(pct)" " " "  Callee")
+      ,=sep)
+    (for/list ([node (in-list nodes)])
+      (define index  (node-> 'index node))
+      (define name   (node-> 'label node))
+      (define total  (node-total node))
+      (define totalS (f:msec total))
+      (define total% @string-append{(@(format-percent (/ total total-time)))})
+      (define self   (node-self node))
+      (define selfS  (f:msec self))
+      (define self%  @string-append{(@(format-percent (/ self total-time)))})
+      (define name+src
+        (let* ([src      (format-source (node-src node))]
+               [src-len  (string-length src)]
+               [name-len (string-length name)])
+          (string-append
+           name " "
+           ;; truncate-source only truncates the source
+           (let* ([n (and truncate-source
+                          ((+ src-len name-len 1) . - . truncate-source))]
+                  [n (and n (positive? n) (- src-len n 3))])
+             (cond [(not n) src]
+                   [(n . <= . 0) "..."]
+                   [else (string-append "..."
+                                        (substring src (- src-len n)))])))))
+      (define (sub get-edges get-node get-node-time)
+        (for*/list ([edge (in-list (get-edges node))]
+                    [sub  (in-list (list (get-node edge)))] ; <-- hack...
+                    #:when (not (or (eq? *-node sub)        ; <-- ...for this
+                                    (memq sub hidden))))
+          (define name   (node-> 'sub-label sub))
+          (define local% (format-percent (/ (get-node-time edge) total)))
+          `("" "" "" "" "" "" "" ""
+            ,(string-append "  " name) ,local%
+            "" "")))
+      `(,@(reverse (sub node-callers edge-caller edge-caller-time))
+        (,(node-> 'index node)
+         " " ,totalS ,total%
+         " " ,selfS  ,self%
+         " " ,(lambda () name+src))
+        ,@(sub node-callees edge-callee edge-callee-time)
+        ,-sep)))))
diff --git a/collects/profile/sampler.ss b/collects/profile/sampler.ss
new file mode 100644
index 0000000..2f0ef36
--- /dev/null
+++ b/collects/profile/sampler.ss
@@ -0,0 +1,134 @@
+#lang scheme/base
+
+;; The core profiler sample collector
+;; (This module is a private tool for collecting profiling data, and should not
+;; be used as is.)
+
+(provide create-sampler)
+
+;; create-sampler : creates a sample collector thread, which tracks the given
+;; `to-track' value every `delay' seconds.
+;; * The input value can be either a thread (track just that thread), a
+;;   custodian (track all threads managed by the custodian), or a list of
+;;   threads and/or custodians.  If a custodian is given, it must be
+;;   subordinate to the current custodian.
+;; * The collected values are (<thread-id> <thread-time> . <stack>), where
+;;   - The <thread-id> is an integer number identifying the thread, starting
+;;     from 0.  If the collected data has thread ids in a 0..N range
+;;     (exclusive) , then there were N threads that were observed.  (This can
+;;     be relevant when tracking a custodian, where threads can change
+;;     dynamically.)
+;;   - The <thread-time> is the result of `current-process-milliseconds' for
+;;     the thread that this sample came from.  Note that these numbers will not
+;;     start at 0, since threads are likely to run before a sample is
+;;     collected.
+;;   - Finally, the <stack> part is a snapshot of the thread's stack, as
+;;     grabbed by `continuation-mark-set->context'.  The the values in these
+;;     snapshots are interned to reduce memory load.
+;;   The results are collected sequentially, so they're always sorted from the
+;;   newest to the oldest.  Remember that these results should be considered
+;;   private for the profiler collection, and can change when more information
+;;   needs to be collected.
+;; * Returns a "controller" function that accepts messages to control the
+;;   sampler thread.  The current set of messages that the controller
+;;   understands are:
+;;   - 'pause and 'resume: stops or resumes collecting samples.  These messages
+;;     can be nested.  Note that the thread will continue running it just won't
+;;     collect snapshots.
+;;   - 'stop: kills the collector thread.  Should be called when you have your
+;;     data.  (There is no message to start a new sampler thread, although
+;;     adding one will not be difficult.)
+;;   - 'set-tracked! <new>: changes the thread/s and/or custodian/s to track.
+;;   - 'set-delay! <new>: changes the sampling delay.  This means that we won't
+;;     have a direct correlation between the number of samples and the time
+;;     they represent -- but the samples are statistical snapshots anyway, and
+;;     the results are not formulated in terms of time spent.  (The time spent
+;;     could be added of course, but it is best to do that in terms of the
+;;     start/stop times)
+;;   - 'get-snapshots: returns the currently collected list of snapshots.  Note
+;;     that this can be called multiple times, each will return the data that
+;;     is collected up to that point in time.
+(define (create-sampler to-track delay)
+  ;; the collected data
+  (define snapshots '())
+  (define cust (current-custodian))
+  ;; intern the entries (which are (cons id/#f srcloc/#f))
+  (define entry-table (make-hash))
+  (define (intern-entry entry)
+    (let* ([key (or (cdr entry) (car entry))]
+           [en  (hash-ref entry-table key #f)])
+      (if en
+        ;; ELI: is this sanity check needed?
+        ;; (if (equal? en entry)
+        ;;   en
+        ;;   (error 'profile "internal error: assumption invalid"))
+        en
+        (begin (hash-set! entry-table key entry) entry))))
+  (define (validate to-track who)
+    (let loop ([t to-track])
+      (cond
+        [(thread? t)]
+        [(list? t) (for-each loop t)]
+        [(not (custodian? t))
+         (raise-type-error
+          who "thread, custodian, or a list of threads/csutodians" to-track)]
+        ;; test that it's subordinate
+        [(with-handlers ([exn:fail:contract? (lambda (_) #t)])
+           (custodian-managed-list t cust) #f)
+         (error who "got a custodian that is not subordinate to current")])))
+  (define paused 0)
+  (define thread-id
+    (let ([next-id 0] [t (make-weak-hasheq)])
+      (lambda (thd)
+        (or (hash-ref t thd #f)
+            (let ([id next-id])
+              (set! next-id (add1 next-id))
+              (hash-set! t thd id)
+              id)))))
+  (define (sampler)
+    (sleep delay)
+    (when (paused . <= . 0)
+      (let loop ([t to-track])
+        (cond [(thread? t)
+               (unless (eq? t sampler-thread)
+                 (set! snapshots
+                       (cons (list* (thread-id t)
+                                    (current-process-milliseconds t)
+                                    (map intern-entry
+                                         (continuation-mark-set->context
+                                          (continuation-marks t))))
+                             snapshots)))]
+              [(custodian? t) (for-each loop (custodian-managed-list t cust))]
+              ;; cannot assume that it's a list: we might get other values from
+              ;; a custodian managed list
+              [(list? t) (for-each loop t)])))
+    (sampler))
+  (define cpu-time   0)
+  (define start-time (current-process-milliseconds))
+  (define (add-time)
+    (when (paused . <= . 0)
+      (let ([cur (current-process-milliseconds)])
+        (set! cpu-time (+ cpu-time (- cur start-time)))
+        (set! start-time cur))))
+  (define (ignore-time)
+    (when (paused . <= . 0)
+      (set! start-time (current-process-milliseconds))))
+  (define sampler-thread
+    (begin (validate to-track 'create-sampler)
+           (thread sampler)))
+  ;; use a sema to avoid mutations from different threads, the sampler thread
+  ;; is only reading these values so it doesn't use it.
+  (define sema (make-semaphore 1))
+  (define (sampler-controller msg [arg #f])
+    (define-syntax-rule (w/sema body ...)
+      (call-with-semaphore sema (lambda () body ...)))
+    (case msg
+      [(pause)  (w/sema (add-time) (set! paused (add1 paused)))]
+      [(resume) (w/sema (set! paused (sub1 paused)) (ignore-time))]
+      [(stop)   (kill-thread sampler-thread) (add-time) (set! paused +inf.0)]
+      [(set-tracked!) (validate arg 'sampler-controller)
+                      (w/sema (set! to-track arg))]
+      [(set-delay!)   (w/sema (set! delay arg))]
+      [(get-snapshots) (add-time) (cons cpu-time snapshots)]
+      [else (error 'sampler-controller "unknown message: ~e" msg)]))
+  sampler-controller)
diff --git a/collects/profile/scribblings/analyzer.scrbl b/collects/profile/scribblings/analyzer.scrbl
new file mode 100644
index 0000000..375c395
--- /dev/null
+++ b/collects/profile/scribblings/analyzer.scrbl
@@ -0,0 +1,156 @@
+#lang scribble/doc
+
+@(require scribble/manual
+          (for-label scheme profile/analyzer))
+
+@title[#:tag "analyzer"]{Analyzing Profile Data}
+
+@defmodule[profile/analyzer]
+
+Once a profile run is done, and the results are collected, the next
+step is to analyze the data.  In this step the sample time are
+computed and summed, a call-graph representing the observed function
+calls is built, and per-node and per-edge information is created.
+This is the job of the main function provided by
+@scheme[profile/analyzer].
+
+@defproc[(analyze-samples [raw-sample-data any/c])
+         profile?]{
+
+This function consumes the raw result of the
+@seclink["sampler"]{sampler} (which is given in an undocumented form),
+analyzes it, and returns a @scheme[profile] value holding the analyzed
+results.  Without this function, the results of the sampler are
+meaningless.}
+
+
+@defstruct[profile ([total-time exact-nonnegative-integer?]
+                    [cpu-time   exact-nonnegative-integer?]
+                    [sample-number exact-nonnegative-integer?]
+                    [thread-times  (listof (cons exact-nonnegative-integer?
+                                                 exact-nonnegative-integer?))]
+                    [nodes  (listof node?)]
+                    [*-node node?])]{
+
+Represents the analyzed profile result.
+
+@itemize[
+
+@item{@scheme[total-time] is the total observed time (in milliseconds)
+  included in the profile.  This is different than the actual time the
+  profiling took, due to unaccounted-for time spent in untracked
+  threads.  (E.g., the sampler thread itself.)}
+
+@item{@scheme[cpu-time] is the actual cpu time consumed by the process
+  during the profiler's work.}
+
+@item{@scheme[sample-number] holds the number of samples taken during
+  the profile.  This can be used to compute the average time frame
+  each of the input samples represented.}
+
+@item{@scheme[thread-times] holds an association list mapping thread
+  identifiers to cpu time for the corresponding threads.  As samples
+  are collected, each thread that is observed is assigned a small
+  integer identifier.  These identifers are listed for each function
+  call, and the total time spent in each thread is in this field.}
+
+@item{@scheme[nodes] is a list of nodes representing all observed
+  functions.  These nodes are the components of the call-graph that
+  the analyzer assembles (see the @scheme[edge] field).  The nodes are
+  sorted by decreasing total amount of time (time spent either in the
+  function or in its callees), and by a top-to-bottom topological
+  sort.}
+
+@item{@scheme[*-node] holds a ``special'' node value that is
+  constructed for every graph.  This node is used as the caller for
+  all top-level function nodes and as the callee for all leaf nodes.
+  It can therefore be used to start a scan of the call graph.  In
+  addition, the times associated with it's "callers and callees"
+  actually represent the time these functions spent being the root of
+  the computation or its leaf.  (This can be different from a node's
+  ``self'' time, since it is divided by the number of instances a
+  function had on the stack for every sample --- so for recursive
+  functions this value is different from.)}
+
+]}
+
+
+@defstruct[node ([id      (or/c #f symbol?)]
+                 [src     (or/c #f srcloc?)]
+                 [thread-ids (listof exact-nonnegative-integer?)]
+                 [total   exact-nonnegative-integer?]
+                 [self    exact-nonnegative-integer?]
+                 [callers (listof edge?)]
+                 [callees (listof edge?)])]{
+
+Represents a function call node in the call graph of an analyzed
+profile result.
+
+@itemize[
+
+@item{The @scheme[id] and @scheme[src] field hold a symbol naming the
+  function and/or its source location as a @scheme[srcloc] value.
+  This is the same as the results of
+  @scheme[continuation-mark-set->context], so at most of of these can
+  be @scheme[#f], except for the special @scheme[*-node] (see the
+  @scheme[profile] struct) that can be identified by both of these
+  being @scheme[#f].}
+
+@item{@scheme[thread-ids] holds a list of thread identifiers that were
+  observed executing this function.}
+
+@item{@scheme[total] holds the total time (in milliseconds) where this
+  function was anywhere on the stack.  It is common to see a few
+  toplevel functions that have close to a 100% total time, but
+  otherwise small @scheme[self] times --- these functions are the ones
+  that derive the work that was done, but they don't do any hard work
+  directly.}
+
+@item{@scheme[self] holds the total time (in milliseconds) where this
+  function was observed as the leaf of the stack.  It represents the
+  actual work done by this function, rather than @scheme[total] that
+  represents the work done by both the function and its callees.}
+
+@item{@scheme[callers] and @scheme[callees] hold the list of caller
+  and callee nodes.  The nodes are not actually held in these lists,
+  instead, @scheme[edge] values are used --- and provide information
+  specific to an edge in the call-graph.}
+
+]}
+
+
+@defstruct[edge ([total       exact-nonnegative-integer?]
+                 [caller      node?]
+                 [caller-time exact-nonnegative-integer?]
+                 [callee      node?]
+                 [callee-time exact-nonnegative-integer?])]{
+
+Represents an edge between two function call nodes in the call graph
+of an analyzed profile result.
+
+@itemize[
+
+@item{@scheme[total] is analogous to the @scheme[total] field of a
+  @scheme[node] value: the total time that this edge was anywhere on
+  the stack.}
+
+@item{@scheme[caller] and @scheme[callee] hold the two nodes that are
+  connected by this edge.}
+
+@item{@scheme[caller-time] and @scheme[callee-time] hold the time
+  spent on this edge from the caller's or the callee's perspective.
+  These times are different from each other (as well as from the total
+  time) because the sums that make them are each divided by the number
+  of times the caller or the callee was on the stack.
+
+  To understand this difference, consider a stack snapshot holding
+  @tt{A @'rarr B @'rarr B @'rarr B @'rarr A}, and representing a
+  second of observed cpu time.  For this sample, the @tt{A @'rarr B}
+  edge is charged by a whole second for its total time (the same goes
+  for the @tt{A @'rarr A} edge, for example).  Its caller time is
+  charged 1/2 second because @tt{A} appears twice in this stack
+  snapshot (in the other half, @tt{A} is chared for being a leaf ---
+  the caller of the special @scheme[*-node]), and its callee time is
+  charged 1/3 respectively.}
+
+]}
diff --git a/collects/profile/scribblings/info.ss b/collects/profile/scribblings/info.ss
new file mode 100644
index 0000000..860b926
--- /dev/null
+++ b/collects/profile/scribblings/info.ss
@@ -0,0 +1,3 @@
+#lang setup/infotab
+
+(define scribblings '(("profile.scrbl" () (tool-library))))
diff --git a/collects/profile/scribblings/profile.scrbl b/collects/profile/scribblings/profile.scrbl
new file mode 100644
index 0000000..ef93db5
--- /dev/null
+++ b/collects/profile/scribblings/profile.scrbl
@@ -0,0 +1,25 @@
+#lang scribble/doc
+
+@(require scribble/manual
+          (for-label scheme))
+
+@title{Statistical Profiler}
+
+The @scheme[profile] collection implements a statistical profiler.
+The profiling is done by running a background thread that collects
+stack snapshots via @scheme[continuation-mark-set->context], meaning
+that the result is an estimate of the execution costs and it is
+limited to the kind of information that
+@scheme[continuation-mark-set->context] produces (most notably being
+limited to functions calls, and subject to compiler optimizations);
+but the result is often useful.  In practice, since this method does
+not require recompilation of your source and has very little runtime
+overhead, it can be used for longer runs which compensates for these
+limits.
+
+@table-of-contents[]
+
+@include-section["toplevel.scrbl"]
+@include-section["sampler.scrbl"]
+@include-section["analyzer.scrbl"]
+@include-section["renderers.scrbl"]
diff --git a/collects/profile/scribblings/renderers.scrbl b/collects/profile/scribblings/renderers.scrbl
new file mode 100644
index 0000000..b00bd96
--- /dev/null
+++ b/collects/profile/scribblings/renderers.scrbl
@@ -0,0 +1,129 @@
+#lang scribble/doc
+
+@(require scribble/manual
+          (for-label scheme
+                     profile/analyzer
+                     (prefix-in text: profile/render-text)
+                     (prefix-in graphviz: profile/render-graphviz)))
+
+@title[#:tag "renderers"]{Profile Renderers}
+
+After collecting the profile samples and analyzing the data, the last
+aspect of profiling is to render the results.  The profile collection
+provides several renderers, each providing a rendering function that
+consumes a @scheme[profile] instance.  See the
+@seclink["analyzer"]{analyzer} section for a description of the
+@scheme[profile] struct if you want to implement your own renderer.
+
+@;--------------------------------------------------------------------
+@section{Textual Rendering}
+
+@defmodule[profile/render-text]
+
+@defproc[(render
+          [profile-data profile?]
+          [#:truncate-source truncate-source exact-nonnegative-integer? 50]
+          [#:hide-self hide-self% (between/c 0 1) 1/100]
+          [#:hide-subs hide-subs% (between/c 0 1) 2/100])
+         void?]{
+
+Prints the given @scheme[profile] results as a textual table.
+
+The printout begins with some general facts about the profile, and
+then a table that represents the call-graph is printed.  Each row in
+this table looks like:
+
+@verbatim[#:indent 2]{
+                          B [M1] M2%
+  [N1] N2(N3%) N4(N5%)  A ...path/to/source.ss:12:34
+                          C [M3] M4%}
+
+Where actual numbers appear in the printout.  The meaning of the
+numbers and labels is as follows:
+@itemize{
+@item{@tt{A} --- the name of the function that this node represents,
+  followed by the source location for the function if it is known.
+  The name can be ``???'' for functions with no identifier, but in
+  this case the source location will identify them.}
+@item{@tt{N1} --- an index number associated with this node.  This is
+  important in references to this function, since the symbolic names
+  are not unique (and some can be missing).  The number itself has no
+  significance, it simply goes from 1 up.}
+@item{@tt{N2} --- the time (in milliseconds) that this function has
+  been anywhere in a stack snapshot.  This is the total time that the
+  execution was somewhere in this function or in its callees.
+  (Corresponds to the @scheme[node-total] field.)}
+@item{@tt{N3} --- this is the percentage of the node's total time
+  (@tt{N2}) from the total observed time of the profile.  An entry
+  with a 100% refers to a function that was active throughout the
+  whole execution.}
+@item{@tt{N4} --- the time (in milliseconds) that this function has
+  been at the top of the stack snapshot.  This is the time that this
+  function consumed doing work itself rather than calling other
+  functions.  (Corresponds to the @scheme[node-self] field.)}
+@item{@tt{N5} --- this is the percentage of @tt{N4} out of the total
+  observed time of the profile.  Functions with high values here can
+  be good candidates for optimization, But, of course, they can
+  represent doing real work due to one of its callers that need to be
+  optimized.}
+@item{@tt{B} and @tt{C} --- these are labels for the callers and
+  callees of the function.  Any number of callers and callees can
+  appear here (including 0).  The function itself can also appear in
+  both places if it is (non-tail) recursive.}
+@item{@tt{M1} and @tt{M3} --- the index numbers for @tt{B} and
+  @tt{C}.  They can be used to disambiguate functions with the same
+  name, as well as a quick way to find the corresponding entry in the
+  table.}
+@item{@tt{M2} and @tt{M4} --- the percentages of the time @tt{A} spent
+  being called by @tt{B} and calling @tt{C}.  These percentages
+  represent the time that this edge was found on a stack snapshot,
+  divided by the number of occurrences of @tt{A} on the same snapshot.
+  The number is the percentage of these times out of @tt{N2}, the
+  total time @tt{A} has been active.
+
+  The total percentages for the all caller and for all callees should
+  be close to 100% minus the time @tt{A} was the leaf or the root.
+
+  These values correspond to the @scheme[edge-caller-time] and
+  @scheme[edge-callee-time] fields; see the documentation for further
+  details.}
+}
+
+The function has a few keyword arguments to customize its output:
+@itemize[
+
+@item{The @scheme[truncate-source] argument determines the length that
+  the source string should take (together with its label).}
+
+@item{@scheme[hide-self%] and @scheme[hide-subs%] control hiding some
+  of the nodes.  A node is hidden if its self time (@tt{N3} in the
+  above example) is smaller than @scheme[hide-self%] @emph{and} if all
+  places where it occurs as a caller or a callee have percentages that
+  are smaller than @scheme[hide-subs%].  The reason for requiring both
+  conditions is to avoid having ``dangling references'' to hidden
+  nodes.}
+
+]}
+
+
+@;--------------------------------------------------------------------
+@section{Graph Rendering}
+
+@defmodule[profile/render-graphviz]
+
+@defproc[(render
+          [profile-data profile?]
+          [#:hide-self hide-self% (between/c 0 1) 1/100]
+          [#:hide-subs hide-subs% (between/c 0 1) 2/100])
+         void?]{
+
+Prints the given @scheme[profile] results as a Graphviz directed
+graph.
+
+This is an experimental module, provided mostly as a proof-of-concept.
+It renders the profile's call-graph as a graph representation for one
+of the Graphviz tools to render.  Nodes are colored according to their
+`self' percentages, and edges.
+
+The keyword arguments control hiding nodes in the same way as with the
+textual renderer.}
diff --git a/collects/profile/scribblings/sampler.scrbl b/collects/profile/scribblings/sampler.scrbl
new file mode 100644
index 0000000..1140e9c
--- /dev/null
+++ b/collects/profile/scribblings/sampler.scrbl
@@ -0,0 +1,60 @@
+#lang scribble/doc
+
+@(require scribble/manual
+          (for-label scheme profile/sampler profile/analyzer))
+
+@title[#:tag "sampler"]{Collecting Profile Information}
+
+@defmodule[profile/sampler]
+
+@defproc[(create-sampler [to-track (or/c thread? custodian?
+                                         (listof (or/c thread? custodian?)))]
+                         [delay nonnegative-number?])
+         ((symbol?) (any/c) . ->* . any/c)]{
+
+Creates a sample collector thread, which tracks the given
+@scheme[to-track] value every @scheme[delay] seconds.  The
+@scheme[to-track] value can be either a thread (track just that
+thread), a custodian (track all threads managed by the custodian), or
+a list of threads and/or custodians.  If a custodian is given, it must
+be subordinate to the current custodian.
+
+The resulting value is a controller function, which consumes a message
+consisting of a symbol and an optional argument, and can affect the
+sampler.  The following messages are currently supported:
+@itemize[
+
+@item{@scheme['pause] and @scheme['resume] will stop or resume data
+collection.  These messages can be nested.  Note that the thread will
+continue running it will just stop collecting snapshots.}
+
+@item{@scheme['stop] kills the controlled thread.  It should be called
+  when no additional data should be collected.  (This is currently
+  irreversible: there is no message to start a new sampler thread.)}
+
+@item{@scheme['set-tracked!] with a value will change the tracked
+  objects (initially specified as the @scheme[to-track] argument) to
+  the given value.}
+
+@item{@scheme['set-tracked!] with a value will change the delay that
+  the sampler us taking between snapshots.  Note that although
+  changing this means that the snapshots are not uniformly
+  distributed, the results will still be sensible --- this is because
+  the cpu time between samples is taken into account when the
+  resulting data is analyzed.}
+
+@item{Finally, a @scheme['get-snapshots] message will make the
+  controller return the currently collected data.  Note that this can
+  be called multiple times, each call will return the data thatis
+  collected up to that point in time.  In addition, it can be (and
+  usually is) called after the sampler was stopped.
+
+  The value that is returned should be considered as an undocumented
+  internal detail of the profiler, to be sent to
+  @scheme[analyze-samples] for analysis.  The reason this is not done
+  automatically, is that a future extension might allow you to combine
+  several sampler results, making it possible to combine a profile
+  analysis from several individual runs, possibly from different
+  machines.}
+
+]}
diff --git a/collects/profile/scribblings/toplevel.scrbl b/collects/profile/scribblings/toplevel.scrbl
new file mode 100644
index 0000000..5f1f20d
--- /dev/null
+++ b/collects/profile/scribblings/toplevel.scrbl
@@ -0,0 +1,67 @@
+#lang scribble/doc
+
+@(require scribble/manual
+          (for-label scheme profile profile/sampler
+                     (only-in profile/analyzer analyze-samples)
+                     (prefix-in text: profile/render-text)))
+
+@title{Toplevel Interface}
+
+@defmodule[profile]
+
+This module provides one procedure and one macroxs that are convenient
+high-level entry points for timing expressions.  This hides the
+details that are available through other parts of the library, and is
+intended as a convenient tool for profiling code.
+
+@defproc[(profile-thunk [thunk (-> any/c)]
+                        [#:delay delay nonnegative-number? 0.05]
+                        [#:repeat  iterations exact-nonnegative-integer? 1]
+                        [#:threads threads?   any/c #f]
+                        [#:render  renderer   text:render])
+         void?]{
+
+Executes the given thunk while collecting profiling data, and render
+this data when done.  Keyword arguments can customize the profiling:
+@itemize[
+
+@item{The profiler works by @scheme[create-sampler] starting a
+  ``sampler'' thread whose job is to collect stack samples
+  periodically (using @scheme[continuation-mark-set->context]).
+  @scheme[delay] determines the amount of time the sampler
+  @scheme[sleep]s for between samples.  Note that this is will be
+  close, but not identical to, the frequency in which data is actually
+  sampled.}
+
+@item{When the profiled computation takes a short amount of time, the
+  collected data will not be accurate.  In this case, you can specify
+  an @scheme[iterations] argument to repeat the evaluation a number of
+  times which will improve the accuracy of the resulting report.}
+
+@item{Normally, the sampler collects snapshots of the
+  @scheme[current-thread]'s stack.  If there is some computation that
+  happens on a different thread, that work will not be reflected in
+  the results: the only effect will be suspiciously small value for
+  the observed time, because the collected data is taking into account
+  the cpu time that the thread actually performed (it uses
+  @scheme[current-process-milliseconds] with the running thread as an
+  argument).  Specifying a non-@scheme[#f] value for the
+  @scheme[threads?] argument will arrange for all threads that are
+  started during the evaluation to be tracked.  Note that this means
+  that the computation will actually run in a new sub-custodian, as
+  this is the only way to be able to track such threads.}
+
+@item{Once the computation has finished, the sampler is stopped, and
+  the accumulated data is collected.  It is then analyzed by
+  @scheme[analyze-samples], and the analyzed profile data is fed into
+  a renderer.  Use an identity function (@scheme[values]) to get the
+  analyzed result, and render it yourself, or use one of the existing
+  renderers (see @secref["renderers"]).}
+
+]}
+
+@defform[(profile expr keyword-arguments ...)]{
+
+A macro version of @scheme[profile-thunk].  The keyword arguments can
+be specified in the same was as for a function call: they can appear
+before and/or after the expression to be profiled.}
diff --git a/collects/profile/utils.ss b/collects/profile/utils.ss
new file mode 100644
index 0000000..0228ba4
--- /dev/null
+++ b/collects/profile/utils.ss
@@ -0,0 +1,37 @@
+#lang scheme/base
+
+(provide format-percent format-source get-hidden)
+(require "analyzer.ss")
+
+(define (format-percent percent)
+  (let ([percent (inexact->exact (round (* percent 1000)))])
+    (format "~a.~a%" (quotient percent 10) (modulo percent 10))))
+
+(define (format-source src)
+  (if src
+    (format "~a:~a"
+            (srcloc-source src)
+            (if (srcloc-line src)
+              (format "~a:~a" (srcloc-line src) (srcloc-column src))
+              (format "#~a" (srcloc-position src))))
+    "(unknown source)"))
+
+;; hide a node if its self time is smaller than the self threshold *and* all of
+;; its edges are below the sub-node threshold too -- this avoids confusing
+;; output where a node does not have an entry but appears as a caller/callee.
+(define (get-hidden profile hide-self% hide-subs%)
+  (define self% (or hide-self% 0))
+  (define subs% (or hide-subs% 0))
+  (define total-time (profile-total-time profile))
+  (define (hide? node)
+    (define (hide-sub? get-subs edge-sub edge-sub-time)
+      (define %s
+        (map (lambda (edge)
+               (let ([total (node-total (edge-sub edge))])
+                 (if (zero? total) 0 (/ (edge-sub-time edge) total))))
+             (get-subs node)))
+      (subs% . >= . (apply max %s)))
+    (and (self% . >= . (/ (node-self node) total-time))
+         (hide-sub? node-callees edge-callee edge-caller-time)
+         (hide-sub? node-callers edge-caller edge-callee-time)))
+  (if (or (> self% 0) (> subs% 0)) (filter hide? (profile-nodes profile)) '()))
diff --git a/collects/tests/profile/main.ss b/collects/tests/profile/main.ss
new file mode 100644
index 0000000..4d577fc
--- /dev/null
+++ b/collects/tests/profile/main.ss
@@ -0,0 +1,104 @@
+#lang scheme/base
+
+(require tests/eli-tester profile/analyzer scheme/match scheme/list)
+
+(define A '(A . #f))
+(define B '(B . #f))
+(define C '(C . #f))
+
+(define (analyze cpu+lists)
+  (profile->sexpr
+   (analyze-samples
+    (cons (car cpu+lists)
+          (map (lambda (x) (append (take x 2) (reverse (drop x 2))))
+               (reverse (cdr cpu+lists)))))))
+
+(define (profile->sexpr prof)
+  (define (node-id* node)
+    (or (node-id node) (if (node-src node) '??? '*)))
+  (define (edges->sexprs node get get-time)
+    (for/list ([edge (get node)])
+      `(,(node-id* (edge-caller edge)) -> ,(node-id* (edge-callee edge))
+        time=  ,(get-time edge)
+        total= ,(edge-time edge))))
+  (define (node->sexpr node)
+    `(,(node-id* node)
+          total=   ,(node-total node)
+          self=    ,(node-self node)
+          callers: ,@(edges->sexprs node node-callers edge-caller-time)
+          callees: ,@(edges->sexprs node node-callees edge-callee-time)
+          threads= ,(node-thread-ids node)))
+  `(total=   ,(profile-total-time    prof)
+    samples= ,(profile-sample-number prof)
+    cpu=     ,(profile-cpu-time      prof)
+    thread-times= ,(profile-thread-times prof)
+    ,@(map node->sexpr (cons (profile-*-node prof) (profile-nodes prof)))))
+
+(test
+
+ (match (analyze `(10
+                   [0 0 ,A]
+                   [0 1 ,A]))
+   [`(total= 2 samples= 2 cpu= 10 thread-times= ([0 . 2])
+      [* total= 2 self= 0
+         callers: [A -> * time= 2 total= 2]
+         callees: [* -> A time= 2 total= 2]
+         threads= ()]
+      [A total= 2 self= 2
+         callers: [* -> A time= 2 total= 2]
+         callees: [A -> * time= 2 total= 2]
+         threads= (0)])
+    'ok]
+   [bad (error 'test ">>> ~s" bad)])
+
+ (match (analyze `(10
+                   [0 0 ,A ,B ,A]
+                   [0 1 ,A ,B ,A]))
+   [`(total= 2 samples= 2 cpu= 10 thread-times= ([0 . 2])
+      [* total= 2 self= 0
+         callers: [A -> * time= 2 total= 2]
+         callees: [* -> A time= 2 total= 2]
+         threads= ()]
+      [A total= 2 self= 2
+         callers: [B -> A time= 2/2 total= 2]
+                  [* -> A time= 2/2 total= 2]
+         callees: [A -> B time= 2/2 total= 2]
+                  [A -> * time= 2/2 total= 2]
+         threads= (0)]
+      [B total= 2 self= 0
+         callers: [A -> B time= 2 total= 2]
+         callees: [B -> A time= 2 total= 2]
+         threads= (0)])
+    'ok]
+   [bad (error 'test ">>> ~s" bad)])
+
+ (match (analyze `(10
+                   [0 0 ,A ,B ,A]
+                   [0 1 ,A ,C ,A]
+                   [0 2 ,A ,C ,A]
+                   [0 3 ,A ,C ,A]))
+   [`(total= 4 samples= 4 cpu= 10 thread-times= ([0 . 4])
+      [* total= 4 self= 0
+         callers: [A -> * time= 4 total= 4]
+         callees: [* -> A time= 4 total= 4]
+         threads= ()]
+      [A total= 4 self= 4
+         callers: [* -> A time= 4/2 total= 4]
+                  [C -> A time= 3/2 total= 3]
+                  [B -> A time= 1/2 total= 1]
+         callees: [A -> * time= 4/2 total= 4]
+                  [A -> C time= 3/2 total= 3]
+                  [A -> B time= 1/2 total= 1]
+         threads= (0)]
+      [C total= 3 self= 0
+         callers: [A -> C time= 3 total= 3]
+         callees: [C -> A time= 3 total= 3]
+         threads= (0)]
+      [B total= 1 self= 0
+         callers: [A -> B time= 1 total= 1]
+         callees: [B -> A time= 1 total= 1]
+         threads= (0)])
+    'ok]
+   [bad (error 'test ">>> ~s" bad)])
+
+ )