170 lines
6.4 KiB
Python
Executable File
170 lines
6.4 KiB
Python
Executable File
#!/usr/bin/env python
|
|
|
|
#***************************************************************************
|
|
#* *
|
|
#* Copyright (c) 2009 Yorik van Havre <yorik@gmx.fr> *
|
|
#* *
|
|
#* This program is free software; you can redistribute it and/or modify *
|
|
#* it under the terms of the GNU Lesser General Public License (LGPL) *
|
|
#* as published by the Free Software Foundation; either version 2 of *
|
|
#* the License, or (at your option) any later version. *
|
|
#* for detail see the LICENCE text file. *
|
|
#* *
|
|
#* This program is distributed in the hope that it will be useful, *
|
|
#* but WITHOUT ANY WARRANTY; without even the implied warranty of *
|
|
#* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
|
|
#* GNU Library General Public License for more details. *
|
|
#* *
|
|
#* You should have received a copy of the GNU Library General Public *
|
|
#* License along with this program; if not, write to the Free Software *
|
|
#* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 *
|
|
#* USA *
|
|
#* *
|
|
#***************************************************************************
|
|
|
|
"""
This script builds a pdf file from a local copy of the wiki
"""

# The description above has to be the first statement of the module to be
# bound as its docstring (__doc__); placed after the assignments below it
# would only be a discarded string expression.
__title__="buildpdf"
__author__ = "Yorik van Havre <yorik@uncreated.net>"
__url__ = "http://free-cad.sf.net"
|
|
|
|
import sys, os, re, tempfile, getopt
|
|
from urllib2 import urlopen, HTTPError
|
|
|
|
# CONFIGURATION #################################################

# the start page from where to crawl the wiki (it holds the table of contents)
INDEX = "Online_Help_Toc"
# if true, the temp html files are removed after successful operation
REMOVE = True
# which converter turns html into pdf: 'pisa', 'htmldoc' or 'wkhtmltopdf'
PDFCONVERTOR = "wkhtmltopdf"
# if true, progress messages are printed while working
VERBOSE = True

# END CONFIGURATION ##############################################

# folder scanned for the saved html pages; the generated pdf files go there too
FOLDER = "./localwiki"

# counters, both starting at zero (not used by the code visible in this file)
fcount = 0
dcount = 0
|
|
|
|
def crawl():
    """Check the prerequisites, then build the complete pdf.

    Despite its name this function does not download anything: it verifies
    that the converter selected by PDFCONVERTOR and the pyPdf module are
    available, converts every saved html page with buildpdffiles() and
    merges the resulting files with joinpdf().

    Returns 0 on success, or 1 when a required tool or module is missing.
    """

    # tests ###############################################

    if PDFCONVERTOR == 'pisa':
        try:
            import ho.pisa
        except ImportError:
            # This message used to be a bare string expression, so it was
            # never actually shown to the user.
            print("Error: Python-pisa not installed, exiting.")
            return 1
    elif PDFCONVERTOR == 'htmldoc':
        # os.system() returns a non-zero status when the command fails
        if os.system('htmldoc --version'):
            print("Error: Htmldoc not found, exiting.")
            return 1
    elif PDFCONVERTOR == 'wkhtmltopdf':
        # Same availability check as for htmldoc
        if os.system('wkhtmltopdf --version'):
            print("Error: Wkhtmltopdf not found, exiting.")
            return 1
    try:
        from pyPdf import PdfFileReader,PdfFileWriter
    except ImportError:
        print("Error: Python-pypdf not installed, exiting.")
        # joinpdf() needs pyPdf, so stop here instead of converting every
        # page first and failing at the very end.
        return 1

    # run ########################################################

    buildpdffiles()
    joinpdf()

    if VERBOSE: print("All done!")
    return 0
|
|
|
|
def buildpdffiles():
    """Convert every html file found in FOLDER to a pdf file.

    The converter function is chosen from PDFCONVERTOR: 'pisa' and
    'wkhtmltopdf' have their own function, any other value uses htmldoc.
    Each converter receives the file name without its '.html' extension.
    """
    # Page names are the file names stripped of the 5-character extension
    pagenames = [entry[:-5] for entry in os.listdir(FOLDER)
                 if entry.endswith('.html')]
    if PDFCONVERTOR == 'pisa':
        convert = createpdf_pisa
    elif PDFCONVERTOR == 'wkhtmltopdf':
        convert = createpdf_wkhtmltopdf
    else:
        convert = createpdf_htmldoc
    for pagename in pagenames:
        convert(pagename)
|
|
|
|
def fetch_resources(uri, rel):
    """
    Link callback handed to pisa so that it can locate images,
    stylesheets and other resources referenced by a page.

    'uri' is the href attribute taken from the html link element; every
    "./" it contains is dropped and the remainder is joined to FOLDER.
    'rel' gives a relative path and is ignored here.

    Note from Yorik: Not working!!
    """
    stripped = uri.replace("./", "")
    return os.path.join(FOLDER, stripped)
|
|
|
|
def createpdf_pisa(pagename):
    """Create a pdf file from a saved page using pisa (python module).

    'pagename' is the page file name without extension; FOLDER/pagename.html
    is read and FOLDER/pagename.pdf is written.  Nothing is done (and None
    is returned) when the pdf file already exists.

    Returns pisa's error value when the conversion reported one, else 0.
    """
    import ho.pisa as pisa
    if exists(pagename+".pdf",image=True):
        return None
    # 'with' closes both files even when CreatePDF raises; the input file
    # used to be left open and was opened with the invalid mode 'ro'.
    with open(FOLDER + os.sep + pagename+'.html','r') as infile:
        with open(FOLDER + os.sep + pagename+'.pdf','wb') as outfile:
            if VERBOSE: print("Converting " + pagename + " to pdf...")
            pdf = pisa.CreatePDF(infile,outfile,FOLDER,link_callback=fetch_resources)
    if pdf.err: return pdf.err
    return 0
|
|
|
|
def createpdf_htmldoc(pagename):
    """Create a pdf file from a saved page using htmldoc (external app, but supports images).

    'pagename' is the page file name without extension; FOLDER/pagename.html
    is converted to FOLDER/pagename.pdf.  Nothing is done (and None is
    returned) when the pdf file already exists.

    Returns the exit code of htmldoc (0 on success), or 127 when the
    program could not be started at all.
    """
    import subprocess
    if exists(pagename+".pdf",image=True):
        return None
    infile = FOLDER + os.sep + pagename+'.html'
    outfile = FOLDER + os.sep + pagename+'.pdf'
    # An argument list bypasses the shell, so page names containing spaces,
    # quotes, '&' or parentheses no longer break (or alter) the command.
    command = ['htmldoc', '--webpage', '--textfont', 'sans',
               '--browserwidth', '840', '-f', outfile, infile]
    try:
        return subprocess.call(command)
    except OSError:
        # Raised when the executable cannot be launched
        print("Error: unable to run htmldoc")
        return 127
|
|
|
|
def createpdf_wkhtmltopdf(pagename):
    """Create a pdf file from a saved page using wkhtmltopdf (external app).

    'pagename' is the page file name without extension; FOLDER/pagename.html
    is converted to FOLDER/pagename.pdf.  Nothing is done (and None is
    returned) when the pdf file already exists.

    Returns the exit code of wkhtmltopdf (0 on success), or 127 when the
    program could not be started at all.
    """
    import subprocess
    if exists(pagename+".pdf",image=True):
        return None
    infile = FOLDER + os.sep + pagename+'.html'
    outfile = FOLDER + os.sep + pagename+'.pdf'
    # An argument list bypasses the shell, so page names containing spaces,
    # quotes, '&' or parentheses no longer break (or alter) the command.
    try:
        return subprocess.call(['wkhtmltopdf', infile, outfile])
    except OSError:
        # Raised when the executable cannot be launched
        print("Error: unable to run wkhtmltopdf")
        return 127
|
|
|
|
def joinpdf():
    """Create one pdf file from several others, following order from startpage.

    The page order is taken from the href attributes found inside the
    <ul>...</ul> part of FOLDER/INDEX.html; the index page itself is
    inserted in second position.  For every link whose html page exists
    locally, the matching pdf file is appended to 'freecad.pdf', which is
    written to the current directory.  Pdf files that cannot be opened are
    reported and skipped.

    Raises IndexError when the index page contains no <ul> list.
    """
    from pyPdf import PdfFileReader,PdfFileWriter
    if VERBOSE: print("Building table of contents...")
    with open(FOLDER+os.sep+INDEX+'.html') as f:
        html = f.read()
    # Flatten the page to a single line so the greedy pattern below can span
    # from the first <ul to the last /ul>
    html = html.replace("\n"," ")
    html = html.replace("> <","><")
    lists = re.findall("<ul.*/ul>",html)
    if not lists:
        # Same exception type as the former bare [0] lookup, but with a
        # message that tells what is actually wrong.
        raise IndexError("no <ul> table of contents found in " + INDEX + ".html")
    pages = re.findall('href="(.*?)"',lists[0])
    pages.insert(1,INDEX+".html")
    result = PdfFileWriter()
    # The source files are kept open until the merged document is written
    # (hence closed only afterwards); they used to be never closed at all.
    opened = []
    try:
        for p in pages:
            # p[:-5] drops the '.html' extension; only locally saved pages count
            if not exists(p[:-5]):
                continue
            if VERBOSE: print('Appending ' + p)
            handle = None
            try:
                handle = open(FOLDER+os.sep+p[:-5]+'.pdf','rb')
                inputfile = PdfFileReader(handle)
            except Exception:
                if handle is not None:
                    handle.close()
                print('Unable to append ' + p)
            else:
                opened.append(handle)
                for i in range(inputfile.getNumPages()):
                    result.addPage(inputfile.getPage(i))
        with open("freecad.pdf",'wb') as outputfile:
            result.write(outputfile)
    finally:
        for handle in opened:
            handle.close()
    if VERBOSE: print('Successfully created freecad.pdf')
|
|
|
|
def local(page,image=False):
    """Return the local path of a page or image inside FOLDER.

    With image=True 'page' is used as a complete file name; otherwise the
    '.html' extension is appended to it.
    """
    extension = '' if image else '.html'
    return FOLDER + os.sep + page + extension
|
|
|
|
def exists(page,image=False):
    """Tell whether the given page/image is already present locally.

    The path is obtained from local(page, image); returns True when
    something exists at that path and False otherwise.
    """
    return os.path.exists(local(page,image))
|
|
|
|
if __name__ == "__main__":
    # crawl() returns 0 on success and 1 when a prerequisite check fails;
    # hand that value to the shell instead of discarding it.
    sys.exit(crawl())
|
|
|