racket/collects/meta/tree/tree.rkt

#lang scheme/base

(provide (struct-out tree) leaf? tree-foldl tree-foldr tree-for-each tree->list
         and: or: not: tree-filter)

(require scheme/list)

;; ----------------------------------------------------------------------------
;; Type definitions

;; This is a generic tree representation, subs is a list of subtrees, or #f for
;; a leaf.
;; - `name' is a name for this tree as a byte string, with a "/" suffix for
;;   non-leaf nodes (the filtering code relies on this assumption)
;; - `subs' is a list of subtrees, or #f to mark a leaf
;; - `path' is the full path for to this tree (eg, FS path or a subvesion url),
;;   this code has no assumptions on what's in there
;; - `data' is a placeholder for additional data

(define-struct tree (name subs path [data #:auto #:mutable]))
(define-syntax-rule (leaf? tree) (not (tree-subs tree)))

;; ----------------------------------------------------------------------------
;; Tree utilities

(define (tree-foldl f init tree)
  (let loop ([tree tree] [acc init])
    (let ([subs (tree-subs tree)])
      (if subs
        (let dloop ([subs subs] [acc (f tree acc)])
          (if (null? subs)
            acc
            (dloop (cdr subs) (loop (car subs) acc))))
        (f tree acc)))))

(define (tree-foldr f init tree)
  (let loop ([tree tree] [acc init])
    (let ([subs (tree-subs tree)])
      (f tree (if subs
                (let dloop ([subs subs])
                  (if (null? subs)
                    acc
                    (loop (car subs) (dloop (cdr subs)))))
                acc)))))

(define (tree-for-each f tree)
  (let loop ([tree tree])
    (f tree)
    (let ([subs (tree-subs tree)])
      (when subs (for-each loop subs)))))

(define (tree->list tree) (tree-foldr cons '() tree))

;; ----------------------------------------------------------------------------
;; Tree filtering

;; A tree-filtering predicate is a function that receives a tree, and returns
;; either #t/#f to include or exclude it, or it can return a function to be
;; applied on its sub-trees.  This setup makes it possible to minimize the
;; filtering work that is needed (compared to the old code that would compare
;; full paths).  `tree-filter' takes such a predicate and returns a tree with
;; filtered subtrees, so the smallest result is the empty root.

;; Turns a byte string with globbing into a regexp string.  "*" turns to ".*",
;; "?" turns to ".", "[...]" ranges are used as is, "{...|...}" turns to
;; "(?:...|...)", backslash escapes as usual.  If the glob is "*", "*/", "**",
;; or "**/", a corresponding symbol is returned; and if the glob is all
;; literal, a byte string is returned.  No special treatment of "/"s, since
;; these are used against path elements.  Note that this is applied on each
;; part of a glob string, so "{...|...}" should not have "/"s in them.
(define glob->regexp-or-literal
  (let ([glob-item
         ((compose byte-regexp bytes-append)
          #"(?:"
          #"\\\\."   ; escaped item
          #"|"
          #"[*?{|}]" ; wildcards and options -- the only 1-character matches
          #"|\\[(?:\\^?\\]|\\^?[^]^])[^]]*\\]" ; [...] ranges
          #")"
          )]
        [substs (map cons
                     (bytes->list #"*?{|}")
                     (regexp-split #rx" " #".* . (?: | )"))])
    (define (subq bstr . xs) (regexp-quote (apply subbytes bstr xs)))
    (lambda (glob)
      (define (loop i ps r)
        (if (null? ps)
          (let ([r (apply bytes-append (reverse (cons (subq glob i) r)))])
            (byte-regexp (bytes-append #"^" r #"$")))
          (loop
           (cdar ps) (cdr ps)
           ;; length=1 is only for `*' or `?'
           (cons (if (= 1 (- (cdar ps) (caar ps)))
                   (cdr (or (assq (bytes-ref glob (caar ps)) substs)
                            (error "internal error")))
                   ;; everything else passes through as is, including all
                   ;; backslashed escapes (not always needed, but harmless)
                   (subbytes glob (caar ps) (cdar ps)))
                 ;; and stuff between these things is getting quoted
                 (if (= i (caar ps))
                   r (cons (subq glob i (caar ps)) r))))))
      (cond [(equal? #"*"   glob) '*]
            [(equal? #"*/"  glob) '*/]
            [(equal? #"**"  glob) '**]
            [(equal? #"**/" glob) '**/]
            [(regexp-match #rx#"^[*]+/?$" glob)
             (error 'glob->regexp-or-literal "bad glob: ~e" glob)]
            [else (let ([ps (regexp-match-positions* glob-item glob)])
                    (if (null? ps) glob (loop 0 ps '())))]))))

(define (glob->pred glob)
  (let loop ([xs (let* (;; ignore "//"s (otherwise they'd match nothing)
                        [glob (regexp-replace* #rx#"//+" glob #"/")]
                        ;; ignore "/" prefix (filter never uses the root path)
                        [glob (regexp-replace #rx#"^/" glob #"")])
                   (regexp-split #rx#"(?<=/)(?=.)" glob))])
    ;; - xs is never null (`regexp-split' never returns null)
    ;; - an element without a trailing slash must be the last one
    ;; - an element with a trailing slash matches non-leaf nodes only, so need
    ;;   to test subs for `*/' and `**/'
    ;; - things usually work out fine, but if it's the last element, then we
    ;;   better return #t or #f rather than a continuation predicate, since a
    ;;   predicate result will never be used and it will mess up (eg, a
    ;;   predicate result for a leaf is considered true, but (not: (lambda (t)
    ;;   #t)) is also a predicate) => use #t for `r' in this case
    (let* ([x  (car xs)]
           [x* (glob->regexp-or-literal x)]
           [xs (cdr xs)]
           [r  (or (null? xs) (loop xs))])
      (cond
        [(eq? '*   x*) (lambda (t) #t)] ; it's the last one
        [(eq? '*/  x*) (lambda (t) (and (tree-subs t) r))]
        [(eq? '**  x*) (lambda (t) #t)]
        [(eq? '**/ x*) (letrec ([R (or: r (lambda (t) (and (tree-subs t) R)))])
                         R)]
        ;; if it's the last one and it has no "/" suffix then it will match
        ;; only leaves => in this case, allow matches on non-leaf nodes by
        ;; adding the "/" (if this is not done then it's very easy to make
        ;; mistakes)
        [else
         (let ([x*/ (cond [(or (pair? xs) (regexp-match? #rx#"/$" x)) #f]
                          [(bytes? x*) (bytes-append x* #"/")]
                          [(byte-regexp? x*)
                           (glob->regexp-or-literal (bytes-append x #"/"))]
                          [else (error 'glob->pred "bad glob part: ~e" x)])])
           (cond
             [(bytes? x*/)
              (lambda (t)
                (let ([x (if (tree-subs t) x*/ x*)])
                  (and (equal? x (tree-name t)) r)))]
             [(byte-regexp? x*/)
              (lambda (t)
                (let ([x (if (tree-subs t) x*/ x*)])
                  (and (regexp-match? x (tree-name t)) r)))]
             [(bytes? x*)
              (lambda (t) (and (tree-subs t) (equal? x* (tree-name t)) r))]
             [(byte-regexp? x*)
              (lambda (t) (and (regexp-match? x* (tree-name t)) r))]))]))))

(define (pred/glob->pred pred/glob)
  (cond [(string? pred/glob) (glob->pred (string->bytes/utf-8 pred/glob))]
        [(bytes? pred/glob) (glob->pred pred/glob)]
        [(procedure? pred/glob) pred/glob]
        [else (error 'pred/glob->pred "bad predicate or glob: ~e" pred/glob)]))

;; Combine tree-filter predicates efficiently: stop when the result is #f or #t
;; for `and:' or `or:' resp., drop predicates that returned #t or #f for them.
(define-syntax-rule (define-combiner name: raw-name: pos neg)
  (begin
    (define raw-name:
      (case-lambda [() (lambda (tree) pos)]
                   [(p1) p1]
                   [(p1 p2) (lambda (tree)
                              (let ([r1 (p1 tree)] [r2 (p2 tree)])
                                (cond [(eq? neg r1) neg]
                                      [(eq? neg r2) neg]
                                      [(eq? pos r1) r2]
                                      [(eq? pos r2) r1]
                                      [else (raw-name: r1 r2)])))]
                   [ps (lambda (tree)
                         (let loop ([ps ps] [rs '()])
                           (if (null? ps)
                             (apply raw-name: (reverse rs))
                             (let ([r ((car ps) tree)] [ps (cdr ps)])
                               (cond [(eq? neg r) neg]
                                     [(eq? pos r) (loop ps rs)]
                                     [else (loop ps (cons r rs))])))))]))
    (define (name: . preds/globs)
      (apply raw-name: (map pred/glob->pred preds/globs)))))
(define-combiner and: raw-and: #t #f)
(define-combiner or:  raw-or:  #f #t)

;; Negating predicates is a little tricky, for example (not: "*/*") would
;; filter out everything in all subtrees, and since empty non-leaf nodes are
;; usually dropped by `tree-filter', this means that the containing trees will
;; be dropped too, leaving only immediate leaves.  The way to make this behave
;; more intuitively is to mark negated predicates, and when filtering with a
;; negated predicate the default is to keep empty non-leaf nodes rather than
;; drop them.  (As an aside, this can also be used to make (not: (not: f))
;; return `f'.)
(define-struct negated (pred orig) #:property prop:procedure 0)
(define (raw-not: p)
  (if (negated? p)
    (negated-orig p)
    (make-negated (lambda (tree)
                    (let ([r (p tree)])
                      (cond [(eq? #t r) #f]
                            [(eq? #f r) #t]
                            [else (raw-not: r)])))
                  p)))
(define (not: pred/glob)
  (raw-not: (pred/glob->pred pred/glob)))

;; filter a whole tree
(define (tree-filter pred/glob tree)
  (define pred (pred/glob->pred pred/glob))
  (define (subs-filter pred tree)
    (let* ([same? #t]
           [subs (tree-subs tree)]
           [new-subs (filter-map (lambda (sub)
                                   (let ([r (loop sub pred)])
                                     (unless (eq? r sub) (set! same? #f))
                                     r))
                                 subs)])
      (cond [(and (null? new-subs) (not (negated? pred))) #f]
            [same? tree]
            [else (make-tree (tree-name tree) new-subs (tree-path tree))])))
  (define (loop tree pred)
    (let ([r (pred tree)])
      (cond [(eq? #t r) tree]
            [(eq? #f r) #f]
            [(procedure? r) (and (tree-subs tree) (subs-filter r tree))]
            [else (error 'tree-filter "bad result from predicate: ~e" r)])))
  (if (leaf? tree)
    (error 'tree-filter "expecting a non-leaf, got ~e" tree)
    (or (subs-filter pred tree)
        (make-tree (tree-name tree) '() (tree-path tree)))))