From 33418e9969d8ab13994759d0957dfc73386c1fa3 Mon Sep 17 00:00:00 2001
From: Ryan Culpepper <ryanc@racket-lang.org>
Date: Mon, 6 Sep 2010 23:00:47 -0600
Subject: [PATCH] added tool for detecting useless requires

original commit: 221d07626ef1f90b1be620f141c318884d141871
---
 .../analysis/check-requires.rkt               | 518 ++++++++++++++++++
 1 file changed, 518 insertions(+)
 create mode 100644 collects/macro-debugger/analysis/check-requires.rkt
diff --git a/collects/macro-debugger/analysis/check-requires.rkt b/collects/macro-debugger/analysis/check-requires.rkt
new file mode 100644
index 0000000..5f28934
--- /dev/null
+++ b/collects/macro-debugger/analysis/check-requires.rkt
@@ -0,0 +1,518 @@
+#lang racket/base
+(require racket/contract
+         racket/match
+         racket/dict
+         unstable/struct
+         syntax/modresolve
+         syntax/stx
+         syntax/id-table
+         macro-debugger/model/deriv
+         "util.rkt")
+(provide/contract
+ [check-requires (-> module-path? list?)]
+ [add-disappeared-uses? (parameter/c boolean?)]
+ [mpi->key (-> module-path-index? any/c)])
+
+#|
+
+The purpose of this script is to estimate a module's useless requires.
+
+Usage:
+
+  (check-requires <module-name>)
+
+Examples:
+
+  (check-requires 'typed-scheme)
+  (check-requires 'unstable/markparam)
+  (check-requires 'macro-debugger/syntax-browser/widget)
+
+The procedure prints one line per (non-label) require in the following
+format:
+
+  KEEP <module> at <phase> <optional-comment>
+    - The require must be kept because bindings defined within it are used.
+    - The optional comment indicates if the require must be kept
+      - only because its bindings are re-exported
+      - only because the whitelist DB says so
+
+  BYPASS <module> at <phase>
+    - The require is used, but only for bindings that could be more directly
+      obtained via another module. For example, 'racket' can be bypassed in favor
+      of some subset of 'racket/base', 'racket/contract', etc.
+
+  DROP <module> at <phase>
+    - The require appears to be unused. Unless it must be kept for side
+      effects or for bindings of a very unusual macro, it can be dropped
+      entirely.
+
+Notes:
+
+  BYPASS recommendations should often be disregarded, because the
+  required module is expressly intended as an aggregation module and the
+  only way to bypass it would be to require private modules
+  directly. See TODO for plans to improve BYPASS recommendations.
+
+  Ignore recommendations to DROP or BYPASS modules with side
+  effects. Read the section below (How it works) and also see the docs
+  for 'module-db' for whitelisting side-effectful modules.
+
+  The script is not intelligent about the language, which causes
+  certain spurious recommendations to appear frequently. For example,
+
+    DROP scheme/mzscheme at 1
+
+  means that the module's language is mzscheme, which automatically
+  inserts (require-for-syntax scheme/mzscheme). It's infeasible to
+  remove it except by rewriting the module in scheme/base or
+  racket/base.
+
+========
+
+How it works
+
+Determining whether a require is actually useless is impossible: a
+module may be required for compile-time side effect only, and there's
+no way to monitor that.
+
+Here are some approximations that are feasible to calculate:
+
+NOM-USES = A require R is "used" by a module M if, during the
+compilation of M, a reference is resolved to a binding exported by R.
+
+DEF-USES = A require R is "used" by a module M if, during the
+compilation of M, a reference is resolved to a binding defined in R.
+
+The limitations:
+ - misses side-effects
+ - misses identifiers recognized via 'free-identifier=?'
+   (But those should be recorded as 'disappeared-use anyway.)
+
+|#
+
+;; ========
+
+(define add-disappeared-uses? (make-parameter #t))
+
+;; ========
+
+;; A RefTable = hash[Phase => free-id-table[bool]]
+;; Phase = nat
+
+#|
+
+For the calculations at the end, we only want to consider identifiers
+from the expanded module (ie, syntax-source-module = here.)
+
+That means that instead of a free-id-table, we really want a dict/set
+that distinguishes between identifiers imported different ways.  eg,
+hash keyed on (nom-name, nom-mod). The reason is that a not-from-here
+identifier can block/clobber a from-here identifier if they happen to
+refer to the same binding. That messes up the analysis.
+
+Temporary solution: only add from-here identifiers to the reftable.
+
+|#
+
+;; new-reftable : -> RefTable
+(define (new-reftable)
+  (make-hash))
+
+;; reftable-get-phase : RefTable Phase -> free-id-table[bool]
+(define (reftable-get-phase refs phase)
+  (hash-ref! refs phase (lambda () (make-free-id-table #:phase phase))))
+
+;; reftable-add-all! : RefTable Phase (listof identifier) -> void
+(define (reftable-add-all! refs phase ids)
+  (let ([id-table (reftable-get-phase refs phase)])
+    (for ([id (in-list ids)]
+          #:when (here-mpi? (syntax-source-module id)))
+      (free-id-table-set! id-table id #t))))
+
+;; reftable-add! : RefTable Phase identifier -> void
+(define (reftable-add! refs phase id)
+  (reftable-add-all! refs phase (list id)))
+
+;; ========
+
+;; phase : (parameterof nat)
+(define phase (make-parameter 0))
+
+;; ========
+
+;; analyze : *Deriv* RefTable -> void
+;; *Deriv* = Deriv | LDeriv | BRule | ModRule | ... (anything from deriv.rkt)
+(define (analyze deriv refs)
+  (define (recur . args)
+    (let check ([arg args])
+      (cond [(syntax? arg) (error 'whoops "arg = ~s" arg)]
+            [(list? arg) (for-each check arg)]
+            [else (void)]))
+    (for ([arg (in-list args)])
+      (if (list? arg)
+          (apply recur arg)
+          (analyze arg refs))))
+  (define (add! ids)
+    (reftable-add-all! refs (phase) ids))
+
+  ;; (printf "analyze ~.s\n" deriv)
+
+  ;; Handle common base (ie, resolves) part of derivs, if applicable
+  (match deriv
+    [(base z1 z2 resolves ?1)
+     (add! resolves)
+     (when (and (syntax? z2) (add-disappeared-uses?))
+       (let ([uses (syntax-property z2 'disappeared-use)])
+         (add! (let loop ([x uses] [onto null])
+                 (cond [(identifier? x) (cons x onto)]
+                       [(pair? x) (loop (car x) (loop (cdr x) onto))]
+                       [else onto])))))]
+    [_
+     (void)])
+  ;; Handle individual variants
+  (match deriv
+    [(lift-deriv z1 z2 first lift-stx second)
+     (recur first second)]
+    [(tagrule z1 z2 tagged-stx next)
+     (recur next)]
+    [(lift/let-deriv z1 z2 first lift-stx second)
+     (recur first second)]
+
+    [(mrule z1 z2 rs ?1 me1 locals me2 ?2 etx next)
+     (recur locals next)]
+    [(local-exn exn)
+     (void)]
+    [(local-expansion z1 z2 for-stx? me1 inner lifted me2 opaque)
+     (parameterize ((phase (+ (phase) (if for-stx? 1 0))))
+       (recur inner))]
+    [(local-lift expr ids)
+     (void)]
+    [(local-lift-end decl)
+     (void)]
+    [(local-lift-require req expr mexpr)
+     (void)]
+    [(local-lift-provide prov)
+     (void)]
+    [(local-bind names ?1 renames bindrhs)
+     (recur bindrhs)]
+    [(local-value name ?1 resolves bound?)
+     (when (and bound? resolves)
+       (add! (cons name resolves)))]
+    [(track-origin before after)
+     (void)]
+    [(local-remark contents)
+     (void)]
+
+    [(p:variable z1 z2 rs ?1)
+     (void)]
+    [(p:module z1 z2 rs ?1 locals tag rename check tag2 ?3 body shift)
+     (recur locals check body)]
+    [(p:#%module-begin z1 z2 rs ?1 me pass1 pass2 ?2)
+     (recur pass1 pass2)]
+    [(p:define-syntaxes z1 z2 rs ?1 rhs locals)
+     (parameterize ((phase (+ (phase) 1)))
+       (recur rhs locals))]
+    [(p:define-values z1 z2 rs ?1 rhs)
+     (recur rhs)]
+
+    [(p:#%expression z1 z2 rs ?1 inner untag)
+     (recur inner)]
+    [(p:if z1 z2 rs ?1 test then else)
+     (recur test then else)]
+    [(p:wcm z1 z2 rs ?1 key mark body)
+     (recur key mark body)]
+    [(p:set! _ _ _ _ id-resolves ?2 rhs)
+     (add! id-resolves)
+     (recur rhs)]
+    [(p:set!-macro _ _ _ _ deriv)
+     (recur deriv)]
+    [(p:#%app _ _ _ _ lderiv)
+     (recur lderiv)]
+    [(p:begin _ _ _ _ lderiv)
+     (recur lderiv)]
+    [(p:begin0 _ _ _ _ first lderiv)
+     (recur first lderiv)]
+
+    [(p:lambda _ _ _ _ renames body)
+     (recur body)]
+    [(p:case-lambda _ _ _ _ renames+bodies)
+     (recur renames+bodies)]
+    [(p:let-values _ _ _ _ renames rhss body)
+     (recur rhss body)]
+    [(p:letrec-values _ _ _ _ renames rhss body)
+     (recur rhss body)]
+    [(p:letrec-syntaxes+values _ _ _ _ srenames sbindrhss vrenames vrhss body tag)
+     (recur sbindrhss vrhss body)]
+
+    [(p:provide _ _ _ _ inners ?2)
+     (recur inners)]
+
+    [(p:require _ _ _ _ locals)
+     (recur locals)]
+
+    [(p:#%stratified-body _ _ _ _ bderiv)
+     (recur bderiv)]
+
+    [(p:stop _ _ _ _) (void)]
+    [(p:unknown _ _ _ _) (void)]
+    [(p:#%top _ _ _ _)
+     (void)]
+    [(p:#%datum _ _ _ _) (void)]
+    [(p:quote _ _ _ _) (void)]
+    [(p:quote-syntax z1 z2 _ _)
+     (when z2 (analyze/quote-syntax z2 refs))]
+    [(p:#%variable-reference _ _ _ _)
+     ;; FIXME
+     (void)]
+
+    [(lderiv _ _ ?1 derivs)
+     (recur derivs)]
+
+    [(bderiv _ _ pass1 trans pass2)
+     (recur pass1 pass2)]
+
+    [(b:error ?1)
+     (void)]
+    [(b:expr _ head)
+     (recur head)]
+    [(b:splice _ head ?1 tail ?2)
+     (recur head)]
+    [(b:defvals _ head ?1 rename ?2)
+     (recur head)]
+    [(b:defstx _ head ?1 rename ?2 bindrhs)
+     (recur head bindrhs)]
+
+    [(bind-syntaxes rhs locals)
+     (parameterize ((phase (+ 1 (phase))))
+       (recur rhs locals))]
+
+    [(clc ?1 renames body)
+     (recur body)]
+
+    [(mod:prim head rename prim)
+     (recur head prim)]
+    [(mod:splice head rename ?1 tail)
+     (recur head)]
+    [(mod:lift head renames tail)
+     (recur head)]
+    [(mod:lift-end tail)
+     (void)]
+    [(mod:cons head)
+     (recur head)]
+    [(mod:skip)
+     (void)]
+
+    [(ecte _ _ locals first second locals2)
+     (recur locals first second locals2)]
+
+    [#f
+     (void)]))
+
+;; analyze/quote-syntax : stx RefTable -> void
+;; Current approach: estimate that an identifier in a syntax template
+;; may be used at (sub1 (phase)) or (phase).
+;; FIXME: Allow for more conservative choices, too.
+;; FIXME: #%top, #%app, #%datum, etc?
+;; FIXME: Track tentative (in quote-syntax) references separately?
+(define (analyze/quote-syntax qs-stx refs)
+  (let ([phases (list (phase) (sub1 (phase)))]
+        [stx (syntax-case qs-stx ()
+               [(_quote-syntax x) #'x])])
+    (define (add! id)
+      (for ([phase (in-list phases)])
+        (reftable-add! refs phase id)))
+    (let loop ([stx stx])
+      (let ([d (if (syntax? stx) (syntax-e stx) stx)])
+        (cond [(identifier? stx) (add! stx)]
+              [(pair? d)
+               (loop (car d))
+               (loop (cdr d))]
+              [(vector? d)
+               (map loop (vector->list d))]
+              [(prefab-struct-key d)
+               (map loop (struct->list d))]
+              [(box? d)
+               (loop (unbox d))]
+              [else
+               (void)])))))
+
+;; ========
+
+;; sMPI = S-expr form of mpi (see mpi->key)
+;; Using MPIs doesn't work. I conjecture that the final module shift means that
+;; all during-expansion MPIs are different from all compiled-expr MPIs.
+
+;; A UsedTable = hash[(list int sMPI) => list]
+
+;; calculate-used-approximations : RefTable -> (values UsedTable UsedTable)
+(define (calculate-used-approximations refs)
+  (let ([NOM-USES (make-hash)]
+        [DEF-USES (make-hash)])
+    (for* ([(use-phase id-table) (in-hash refs)]
+           [id (in-dict-keys id-table)])
+      ;; Only look at identifiers written in module being examined.
+      ;; (Otherwise, nom-mod & nom-phase aren't enough info (???)
+      (when (here-mpi? (syntax-source-module id)) ;; REDUNDANT
+        (let ([b (identifier-binding id use-phase)])
+          (match b
+            [(list def-mod def-sym nom-mod nom-sym
+                   def-phase nom-imp-phase nom-exp-phase)
+             ;; use-phase = def-phase + required-phase
+             ;; thus required-phase = use-phase - def-phase
+             (let* ([required-phase (- use-phase def-phase)]
+                    [key (list required-phase (mpi->key def-mod))])
+               (hash-set! DEF-USES key
+                          (cons id (hash-ref DEF-USES key null))))
+             ;; use-phase = nom-imp-phase + nom-exp-phase ?????
+             ;; We just care about nom-imp-phase, since importing into *here*
+             #|
+             ;; FIXME: This check goes wrong on defined-for-syntax ids
+             (unless (equal? use-phase (+ nom-imp-phase nom-exp-phase))
+               (error 'calculate
+                      "internal error: phases wrong in ~s @ ~s, binding = ~s"
+                      id use-phase b))
+             |#
+             (let ([key (list nom-imp-phase (mpi->key nom-mod))])
+               (hash-set! NOM-USES key
+                          (cons id (hash-ref NOM-USES key null))))]
+            [_
+             (void)]))))
+    (values NOM-USES DEF-USES)))
+
+;; ========
+
+;; get-requires : compiled-module-expr -> (listof (list int MPI))
+(define (get-requires compiled)
+  (let ([phase+mods-list (module-compiled-imports compiled)])
+    (for*/list ([phase+mods (in-list phase+mods-list)]
+                #:when (car phase+mods) ;; Skip for-label requires
+                [mod (cdr phase+mods)])
+      (list (car phase+mods) mod))))
+
+;; add-provides! : compiled-module-expr UsedTable UsedTable -> void
+(define (add-provides! compiled NOM-USES DEF-USES)
+  (define (add! mpi phase)
+    (let ([key (list phase (mpi->key mpi))])
+      (hash-set! NOM-USES key (cons 'export (hash-ref NOM-USES key null)))
+      (hash-set! DEF-USES key (cons 'export (hash-ref DEF-USES key null)))))
+  (let-values ([(vprov sprov) (module-compiled-exports compiled)])
+    (for* ([phase+exps (in-list (append vprov sprov))]
+           #:when (car phase+exps) ;; Skip for-label provides
+           [name+srcs (in-list (cdr phase+exps))]
+           [src (in-list (cadr name+srcs))])
+      (let ([name (car name+srcs)])
+        (match src
+          [(? module-path-index?)
+           (add! src 0)]
+          [(list imp-mod imp-phase-shift imp-name imp-phase-???)
+           (add! imp-mod imp-phase-shift)])))))
+
+;; ========
+
+;; A ModuleDB = hash[path/symbol => (U 'no-drop 'no-bypass)]
+;;   'no-drop = must not be dropped or bypassed because of, eg, side effects
+;;   'no-bypass = don't bypass in favor of private component modules
+;;                but if the module is unused, can drop it
+;;                (FIXME: replace with component module calculation and checking)
+
+(define (make-module-db mod+config-list)
+  (for/hash ([mod+config (in-list mod+config-list)])
+    (values (resolve-module-path (car mod+config) #f) (cadr mod+config))))
+
+;; module-db : ModuleDB
+(define module-db
+  (make-module-db
+   '([racket/base                            no-bypass]
+     [racket/contract/base                   no-bypass]
+     [racket/gui                             no-bypass]
+     [racket/match                           no-bypass]
+     ['#%builtin                             no-drop]
+
+     [typed-scheme/private/base-env          no-drop]
+     [typed-scheme/private/base-special-env  no-drop]
+     [typed-scheme/private/base-env-numeric  no-drop]
+     [typed-scheme/private/base-env-indexing no-drop])))
+
+;; ========
+
+#|
+A recommendation is one of
+  (list 'keep   module-path-index phase string/#f)
+  (list 'bypass module-path-index phase)
+  (list 'drop   module-path-index phase)
+|#
+
+;; check-requires : module-path -> (listof recommendation)
+(define (check-requires mod-path)
+  (let-values ([(compiled deriv) (get-module-code/trace mod-path)])
+    (let ([refs (new-reftable)])
+      (analyze deriv refs)
+      (let-values ([(NOM-USES DEF-USES) (calculate-used-approximations refs)])
+        (add-provides! compiled NOM-USES DEF-USES)
+        (report NOM-USES DEF-USES (get-requires compiled))))))
+
+;; report : UseTable UseTable (listof (list int mpi)) -> (listof recommendation)
+(define (report NOM-USES DEF-USES phase+mod-list)
+  (for/list ([phase+mod (in-list phase+mod-list)])
+    (let* ([key (list (car phase+mod) (mpi->key (cadr phase+mod)))]
+           [db-config
+            (hash-ref module-db
+                      (resolved-module-path-name
+                       (module-path-index-resolve (cadr phase+mod)))
+                      #f)]
+           [nom-ids (hash-ref NOM-USES key null)]
+           [def-ids (hash-ref DEF-USES key null)]
+           [phase (car phase+mod)]
+           [mod (cadr phase+mod)]
+           [name (format "~s at ~s" (mpi->key mod) phase)])
+      (cond [(and (pair? nom-ids) (pair? def-ids))
+             (list 'keep mod phase (if (ormap identifier? nom-ids) #f "for exports"))]
+            [(pair? nom-ids)
+             (if (memq db-config '(no-bypass no-drop))
+                 (list 'keep mod phase "db says no-bypass")
+                 (list 'bypass mod phase))]
+            [else
+             (if (memq db-config '(no-drop))
+                 (list 'keep mod phase "db says no-drop")
+                 (list 'drop mod phase))]))))
+
+(define (mpi->key x)
+  (let ([l (mpi->list x)])
+    (if (and (pair? l) (null? (cdr l)))
+        (car l)
+        l)))
+
+(define (mpi->list x)
+  (cond [(module-path-index? x)
+         (let-values ([(rel base) (module-path-index-split x)])
+           (if rel
+               (cons rel (mpi->list base))
+               null))]
+        [(eq? x #f)
+         null]
+        [else
+         (list x)]))
+
+#|
+TODO
+====
+
+Elaborate on BYPASS recommendations by finding the necessary modules
+further up the require chain to require directly.
+  - don't recommend private modules, though... heuristic
+
+Let user provide database of modules that should never be dropped, eg
+because they have side effects.
+  - wouldn't it be awesome if this db could be a datalog program?
+  - start simpler, though
+
+Verbose mode should show identifiers used by a module (for KEEP).
+For example, if only one used, good candidate to split out, if possible.
+
+Ambitious mode could analyze module and recommend ways to split module
+into independent submodules.
+
+More options for quote-syntax handling & explain current heuristic better.
+
+Handle for-label.
+|#