#!/usr/bin/env python

#***************************************************************************
#*                                                                         *
#*   Copyright (c) 2009 Yorik van Havre <yorik@gmx.fr>                     *  
#*                                                                         *
#*   This program is free software; you can redistribute it and/or modify  *
#*   it under the terms of the GNU Lesser General Public License (LGPL)    *
#*   as published by the Free Software Foundation; either version 2 of     *
#*   the License, or (at your option) any later version.                   *
#*   for detail see the LICENCE text file.                                 *
#*                                                                         *
#*   This program is distributed in the hope that it will be useful,       *
#*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
#*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
#*   GNU Library General Public License for more details.                  *
#*                                                                         *
#*   You should have received a copy of the GNU Library General Public     *
#*   License along with this program; if not, write to the Free Software   *
#*   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  *
#*   USA                                                                   *
#*                                                                         *
#***************************************************************************

"""
This script builds a pdf file from a local copy of the wiki.

Every .html page found in the local wiki folder is converted to its own pdf
file, then the pdf files are joined into a single freecad.pdf, following the
order of the links listed in the index page.
"""

# The docstring above must stay the first statement of the module, otherwise
# Python does not expose it as the module's __doc__.
__title__ = "buildpdf"
__author__ = "Yorik van Havre <yorik@uncreated.net>"
__url__ = "http://free-cad.sf.net"

import sys, os, re, tempfile, getopt
from urllib2 import urlopen, HTTPError

#    CONFIGURATION       #################################################

# Name (without the .html extension) of the index page; the links listed in
# that page decide which pages go into the final pdf and in which order.
INDEX = "Online_Help_Toc"

# If true, the temp html files are removed after successful operation.
# NOTE(review): this flag is not read anywhere in this file.
REMOVE = True

# Converter used to turn each page into a pdf:
# can be 'pisa', 'htmldoc' or 'wkhtmltopdf'.
PDFCONVERTOR = 'wkhtmltopdf'

# When true, progress messages are printed while the script runs.
VERBOSE = True

#    END CONFIGURATION      ##############################################


# Folder holding the local copy of the wiki; the generated per-page pdf
# files are written next to the html files in this same folder.
FOLDER = "./localwiki"

# Counters initialised to zero.
# NOTE(review): neither counter is referenced anywhere else in this file.
fcount = 0
dcount = 0

def crawl():
    """Check the required tools, then build the pdf from the local wiki copy.

    Despite its name, this function downloads nothing. It verifies that the
    converter selected by PDFCONVERTOR ('pisa' or 'htmldoc') and the pyPdf
    module are usable, converts every saved page with buildpdffiles() and
    merges the resulting files with joinpdf().

    Returns 0 on success, or 1 when a required dependency is missing.
    """

    # tests ###############################################

    if PDFCONVERTOR == 'pisa':
        try:
            # imported only to verify that the module is available
            import ho.pisa
        except ImportError:
            print("Error: Python-pisa not installed, exiting.")
            return 1
    elif PDFCONVERTOR == 'htmldoc':
        # os.system gives a non-zero status when the command cannot be run
        if os.system('htmldoc --version'):
            print("Error: Htmldoc not found, exiting.")
            return 1
    # NOTE: the availability of wkhtmltopdf is not checked here.
    try:
        # joinpdf() needs pyPdf, so stop now instead of failing after all
        # the pages have already been converted
        from pyPdf import PdfFileReader, PdfFileWriter
    except ImportError:
        print("Error: Python-pypdf not installed, exiting.")
        return 1

    # run ########################################################

    buildpdffiles()
    joinpdf()

    if VERBOSE:
        print("All done!")
    return 0

def buildpdffiles():
    """Convert every html file found in FOLDER to a pdf file.

    The converter is chosen by PDFCONVERTOR: 'pisa', 'wkhtmltopdf', and
    htmldoc for any other value. Each converter receives the file name
    without its '.html' extension.
    """
    # strip the 5 characters of '.html' to obtain the page names
    pagenames = [entry[:-5] for entry in os.listdir(FOLDER)
                 if entry.endswith('.html')]
    for pagename in pagenames:
        if PDFCONVERTOR == 'pisa':
            createpdf_pisa(pagename)
        elif PDFCONVERTOR == 'wkhtmltopdf':
            createpdf_wkhtmltopdf(pagename)
        else:
            createpdf_htmldoc(pagename)

def fetch_resources(uri, rel):
    """
    Link callback letting pisa/reportlab locate images, stylesheets, etc.

    'uri' is the href attribute taken from the html link element; every
    "./" occurrence is removed from it and the result is joined to FOLDER.
    'rel' gives a relative path, but it is not used here.

    Returns the resulting path inside FOLDER.

    Note from the original author (Yorik): Not working!!
    """
    relative = uri.replace("./", "")
    return os.path.join(FOLDER, relative)

def createpdf_pisa(pagename):
    """Create a pdf file from a saved page using pisa (python module).

    'pagename' is the page file name without extension; FOLDER/pagename.html
    is read and FOLDER/pagename.pdf is written. Nothing is done when the pdf
    file already exists.

    Returns None when the page is skipped, pisa's error value when the
    conversion reported errors, and 0 otherwise.
    """
    import ho.pisa as pisa
    if exists(pagename + ".pdf", image=True):
        return None
    # 'r' instead of the former 'ro', which is not a valid open() mode
    with open(local(pagename), 'r') as infile:
        with open(local(pagename + '.pdf', image=True), 'wb') as outfile:
            if VERBOSE:
                print("Converting " + pagename + " to pdf...")
            # both files are closed by the with blocks, even if pisa raises
            pdf = pisa.CreatePDF(infile, outfile, FOLDER,
                                 link_callback=fetch_resources)
    if pdf.err:
        return pdf.err
    return 0

def createpdf_htmldoc(pagename):
    """Create a pdf file from a saved page using htmldoc (external app, but supports images).

    'pagename' is the page file name without extension; FOLDER/pagename.html
    is converted to FOLDER/pagename.pdf. Nothing is done when the pdf file
    already exists.

    Returns None when the page is skipped, otherwise the exit code of
    htmldoc (0 on success), or 1 when htmldoc could not be started.
    """
    import subprocess
    if exists(pagename + ".pdf", image=True):
        return None
    infile = local(pagename)
    outfile = local(pagename + '.pdf', image=True)
    # An argument list (no shell) keeps page names containing spaces, quotes
    # or other shell metacharacters from breaking or altering the command.
    command = ['htmldoc', '--webpage', '--textfont', 'sans',
               '--browserwidth', '840', '-f', outfile, infile]
    try:
        return subprocess.call(command)
    except OSError:
        print("Error: unable to run htmldoc")
        return 1

def createpdf_wkhtmltopdf(pagename):
    """Create a pdf file from a saved page using wkhtmltopdf (external app).

    'pagename' is the page file name without extension; FOLDER/pagename.html
    is converted to FOLDER/pagename.pdf. Nothing is done when the pdf file
    already exists.

    Returns None when the page is skipped, otherwise the exit code of
    wkhtmltopdf (0 on success), or 1 when wkhtmltopdf could not be started.
    """
    import subprocess
    if exists(pagename + ".pdf", image=True):
        return None
    infile = local(pagename)
    outfile = local(pagename + '.pdf', image=True)
    # An argument list (no shell) keeps page names containing spaces, quotes
    # or other shell metacharacters from breaking or altering the command.
    try:
        return subprocess.call(['wkhtmltopdf', infile, outfile])
    except OSError:
        print("Error: unable to run wkhtmltopdf")
        return 1

def joinpdf():
    """Create one pdf file from several others, following order from startpage.

    The first <ul>...</ul> span of the INDEX page is scanned for href
    targets; the index page itself is inserted as second entry of that list.
    For every target whose html file exists in FOLDER, the pages of the
    matching pdf file are appended to the result, which is written to
    "freecad.pdf" in the current directory. A pdf file that cannot be opened
    or parsed is reported and skipped.

    Raises IndexError when the index page contains no <ul> list.
    """
    from pyPdf import PdfFileReader, PdfFileWriter
    if VERBOSE:
        print("Building table of contents...")
    with open(FOLDER + os.sep + INDEX + '.html') as f:
        html = f.read()
    # flatten the page to a single line so the regex can span the whole list
    html = html.replace("\n", " ")
    html = html.replace("> <", "><")
    html = re.findall("<ul.*/ul>", html)[0]
    pages = re.findall('href="(.*?)"', html)
    pages.insert(1, INDEX + ".html")
    result = PdfFileWriter()
    # The input files are kept open until the result is written, on the
    # assumption that pyPdf may still read from them at that point; they
    # are all closed afterwards.
    opened = []
    try:
        for p in pages:
            # p[:-5] drops the '.html' extension of the link target
            if not exists(p[:-5]):
                continue
            if VERBOSE:
                print('Appending ' + p)
            try:
                handle = open(FOLDER + os.sep + p[:-5] + '.pdf', 'rb')
                opened.append(handle)
                inputfile = PdfFileReader(handle)
            except Exception:
                # best effort: a missing or unreadable pdf is skipped
                print('Unable to append ' + p)
            else:
                for i in range(inputfile.getNumPages()):
                    result.addPage(inputfile.getPage(i))
        with open("freecad.pdf", 'wb') as outputfile:
            result.write(outputfile)
    finally:
        for handle in opened:
            handle.close()
    if VERBOSE:
        print('Successfully created freecad.pdf')

def local(page, image=False):
    """Return the path inside FOLDER for a given page/image.

    With image=True 'page' is used as the file name as is; otherwise the
    '.html' extension is appended to it.
    """
    extension = '' if image else '.html'
    return FOLDER + os.sep + page + extension

def exists(page, image=False):
    """Tell whether the given page/image is already present in FOLDER.

    The path is built by local() from the same arguments; returns True when
    that path exists, False otherwise.
    """
    return os.path.exists(local(page, image))
    
if __name__ == "__main__":
    # Use the value returned by crawl() as the exit status, so that a
    # failed run is visible to the calling shell or script.
    sys.exit(crawl())