trivial/icfp-2016/benchmark/suffixtree/post/lcs.rkt

153 lines
5.5 KiB
Racket

#lang typed/racket/base
(require trivial/no-colon)
;; Some utilities.
(require
(except-in "typed-data.rkt" make-label)
racket/list)
(require "label.rkt"
)
(require (only-in
"structs.rkt"
make-tree
tree-root
))
(require "ukkonen.rkt")
(: false-thunk (-> #f))
(define false-thunk (lambda () #f))
;; longest-common-substring: string string -> string
;; Returns the longest common substring between the two strings.
(provide longest-common-substring)
(: longest-common-substring (-> String String String))
(define (longest-common-substring s1 s2)
(label->string (longest-common-sublabel (string->label/with-sentinel s1)
(string->label/with-sentinel s2))))
;; longest-common-sublabel: label label -> label
;;
;; Naive use of suffix trees to find longest common sublabel between
;; two labels. Note that there's a better way to do this with
;; matching statistics: I'll try using matching statistics as soon
;; as I get this version running.
;;
;; This approach simply adds both labels to a common suffix tree,
;; does a postorder traversal to mark up the inner nodes, and then
;; finds the inner node with the deepest string depth.
(provide longest-common-sublabel)
(: longest-common-sublabel (-> Label Label Label))
(define (longest-common-sublabel label-1 label-2)
(: label-1-marks (HashTable Node Boolean))
(define label-1-marks (make-hasheq))
(: label-2-marks (HashTable Node Boolean))
(define label-2-marks (make-hasheq))
(: deepest-node (Boxof Node))
(define deepest-node (box (node (make-label "no lcs") #f '() #f)))
(: deepest-depth (Boxof Index))
(define deepest-depth (box 0))
(: main (-> Label))
(define (main)
(define tree (make-tree))
(tree-add! tree label-1)
(tree-add! tree label-2)
(mark-up-inner-nodes! (tree-root tree) 0)
(path-label (unbox deepest-node)))
(: mark-up-inner-nodes! (-> Node Index Void))
(define (mark-up-inner-nodes! node depth)
(cond [(null? (node-children node))
(when (label-source-eq? (node-up-label node) label-1)
(mark-with-label-1! node))
(when (label-source-eq? (node-up-label node) label-2)
(mark-with-label-2! node))]
[else
(for ([child (node-children node)])
(let ([i (+ depth (label-length (node-up-label child)))])
(unless (index? i) (error "NOOOOO"))
(mark-up-inner-nodes! child i)))
(absorb-children-marks! node depth)]))
(: mark-with-label-1! (-> Node Void))
(define (mark-with-label-1! node)
(hash-set! label-1-marks node #t))
(: mark-with-label-2! (-> Node Void))
(define (mark-with-label-2! node)
(hash-set! label-2-marks node #t))
(: marked-by-label-1? (-> Node Boolean))
(define (marked-by-label-1? node)
(hash-ref label-1-marks node false-thunk))
(: marked-by-label-2? (-> Node Boolean))
(define (marked-by-label-2? node)
(hash-ref label-2-marks node false-thunk))
(: marked-by-both? (-> Node Boolean))
(define (marked-by-both? node)
(and (marked-by-label-1? node)
(marked-by-label-2? node)))
(: absorb-children-marks! (-> Node Index Void))
(define (absorb-children-marks! node depth)
;(let/ec escape
(for ([child (node-children node)])
(when (marked-by-label-1? child)
(mark-with-label-1! node))
(when (marked-by-label-2? child)
(mark-with-label-2! node)))
;(when (marked-by-both? node)
; (escape))))
(when (and (marked-by-both? node)
(> depth (unbox deepest-depth)))
(set-box! deepest-depth depth)
(set-box! deepest-node node)))
(if (or (= 0 (label-length label-1))
(= 0 (label-length label-2)))
(string->label "")
(main)))
;; path-label: node -> label
;;
;; Returns a new label that represents the path from the tree root
;; to this node.
;;
;; Fixme: optimize the representation of label to be able to do this
;; without much reallocation. Maybe another label class that uses a
;; rope data structure might be better... I need to read Hans
;; Boehm's paper on "Ropes, an alternative to strings" to see how
;; much work this would be.
(provide path-label)
(: path-label (-> Node Label))
(define (path-label node)
(: collect-loop (-> (U Node #f) (Listof Label) Integer Label))
(define (collect-loop current-node collected-labels total-length)
(if current-node
(collect-loop (node-parent current-node)
(cons (node-up-label current-node) collected-labels)
(+ total-length
(label-length (node-up-label current-node))))
(build-new-label collected-labels total-length)))
(: vector-blit! (-> Label (Vectorof (U Char Symbol)) Index Void))
(define (vector-blit! src-label dest-vector dest-offset)
(let loop ((i 0))
(let ([index (+ i dest-offset)])
(when (and (< i (label-length src-label)) (index? i) (index? index))
(vector-set! dest-vector
index
(label-ref src-label i))
(loop (add1 i))))))
(: build-new-label (-> (Listof Label) Integer Label))
(define (build-new-label labels total-length)
(: vector (Vectorof (U Char Symbol)))
(define vector (make-vector total-length 'X))
(let loop ((labels labels) (i 0))
(cond [(null? labels)
(vector->label vector)]
[(index? i)
(vector-blit! (car labels) vector i)
(loop (cdr labels)
(+ i (label-length (car labels))))]
[else (error "not an index")])))
(collect-loop node '() 0))