;;; This lexer is a little tricky.
;;; HtmlPrag discards source location information, so
;;; we use HtmlPrag to get the content. Then we break
;;; the content up into terms. Regular expression
;;; matching is then used to recover the positions of
;;; the terms (line and column numbers are lost, but
;;; they aren't needed).
(module html-to-txt mzscheme
(require scheme/mpair
scheme/system
(only scheme regexp-quote)
(lib "match.ss")
"planet/file.scm"
"planet/htmlprag.ss" ; version 1.3
"planet/intersperse.scm"
(prefix txt: "lexer.scm"))
(define (html-file->txt-file src-file dest-file)
(system (format "w3m -dump 'a' >'a'" src-file dest-file)))
(define (file->shtml file)
; read file and return parsed file
(with-input-from-file file
(lambda ()
(html->shtml
(port->string
(current-input-port))))))
(define (shtml->tokens s)
; extract the terms
(match s
[(t ...) (apply append (map shtml->tokens t))]
[t (if (string? t) (list t) '())]))
; freeze : mutable-tree -> immutabale-tree
; convert mpairs to pairs
(define (freeze o)
(if (mlist? o)
(map freeze (mlist->list o))
o))
(define (html-file->string file)
(apply string-append
(intersperse " "
(shtml->tokens
(freeze
(file->shtml file))))))
#;(define (html-file->terms file)
; return a list of terms occuring in file
(map car
(txt:port->tokens
(open-input-string
(apply string-append
(intersperse " "
(shtml->tokens
(freeze
(file->shtml file)))))))))
#;(define (terms->tokens terms file)
; use regexp-match-peek-positions to recover the
; positions of the terms
(let ([in (open-input-file file)])
(let loop ([start-pos 0]
[terms terms]
[tokens '()])
(cond [(null? terms)
(reverse tokens)]
[else
(let ([term (car terms)])
(display (list start-pos term)) (newline)
(let ([pos (regexp-match-peek-positions (regexp-quote term) in start-pos)])
(if pos
(loop (cdr (car pos))
(cdr terms)
(cons (list term (list 'line? 'col? (caar pos))) tokens))
(begin
(display (format "Skipped '~a'\n" term))
(loop start-pos (cdr terms) tokens)))))]))))
; (define file "/Applications/PLT Scheme v3.99.0.10/doc/reference/begin.html")
#;(txt:port->tokens
(open-input-string
(apply string-append
(shtml->tokens
(freeze
(file->shtml
"../test/html-example.html"))))))
)