#!/usr/bin/env python

#***************************************************************************
#*                                                                         *
#*   Copyright (c) 2009 Yorik van Havre                                    *
#*                                                                         *
#*   This program is free software; you can redistribute it and/or modify  *
#*   it under the terms of the GNU Library General Public License (LGPL)   *
#*   as published by the Free Software Foundation; either version 2 of     *
#*   the License, or (at your option) any later version.                   *
#*   for detail see the LICENCE text file.                                 *
#*                                                                         *
#*   This program is distributed in the hope that it will be useful,       *
#*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
#*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          *
#*   GNU Library General Public License for more details.                  *
#*                                                                         *
#*   You should have received a copy of the GNU Library General Public     *
#*   License along with this program; if not, write to the Free Software   *
#*   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307   *
#*   USA                                                                   *
#*                                                                         *
#***************************************************************************

"""
This script retrieves the contents of a wiki site and saves it locally,
then calls qt help compiler to produce a qhelp-assistant help file.
The script can be called without arguments, it will then use the default
url below, or by passing it an url and optionally a TOC name.
"""

__title__ = "wiki2qhelp"
__author__ = "Yorik van Havre "
__url__ = "http://www.freecadweb.org"

import sys
import os
import re
import tempfile
import getopt

# The rest of the file already uses print(...) calls, so keep the download
# helpers importable on both interpreter generations.
try:
    from urllib2 import urlopen, HTTPError          # Python 2
except ImportError:
    from urllib.request import urlopen              # Python 3
    from urllib.error import HTTPError

#    CONFIGURATION       #################################################

DEFAULTURL = "www.freecadweb.org/wiki" #default URL if no URL is passed
INDEX = "Online_Help_Toc" # the start page from where to crawl the wiki
NORETRIEVE = ['Manual','Developer_hub','Power_users_hub','Users_hub','Source_documentation',
              'User_hub','Main_Page','About_this_site'] # pages that won't be fetched (kept online)
GETTRANSLATIONS = True # Set true if you want to get the translations too.
MAXFAIL = 3 # max number of retries if download fails VERBOSE = True # to display what's going on. Otherwise, runs totally silent. COMPILE = True # Wether qt assistant will be used to compile the final help file OUTPUTPATH = os.path.expanduser("~")+os.sep+'.FreeCAD' # Where to store the qch file QHELPCOMPILER = 'qhelpgenerator' QCOLLECTIOMGENERATOR = 'qcollectiongenerator' PDFOUTPUT = False # if true, a pdf file will be generated instead of qhelp. REMOVE = True # if true, the temp html files are removed after successful operation PDFCONVERTOR = 'pisa' # can be 'pisa' or 'htmldoc' # END CONFIGURATION ############################################## URL = DEFAULTURL TMPFOLDER = tempfile.mkdtemp() wikiindex = "/index.php?title=" processed = [] pisa = None usage=''' wiki2qhelp [options] [url] [index page] fetches wiki pages from the specified url, starting from specified index page, and outputs a .qch file in the specified output path. You must have qassistant installed. If no url, index page or output path is specified, the following default values will be used: url: '''+DEFAULTURL+''' index page: '''+INDEX+''' output path: '''+OUTPUTPATH+''' Options: -v: Verbose mode -c filename or --helpcompiler-exe filename: Uses filename as qt help compiler -g filename or --helpgenerator-exe filename: Uses filename as qt collection generator -o path or --out-path path: Specifies an output path -h or --help: Displays this help message -p [convertor] or --pdf [convertor]: Outputs a pdf file instead of qhelp. 
Convertor can be pisa (default) or htmldoc -t path or --tempfolder path: Uses path as temp folder for storing html files ''' css = """/* Basic CSS for offline wiki rendering */ body { font-family: Arial,Helvetica,sans-serif; font-size: 13px; text-align: justify; } h1 { font-size: 2.2em; font-weight: bold; background: #46A4D0; color: white; padding: 5px; -moz-border-radius: 5px; -webkit-border-radius: 5px; } pre { border: 1px dashed #333333; text-align: left; background: #EEEEEE; padding: 5px; } a:link, a:visited { font-weight: bold; text-decoration: none; color: #0084FF; } a:hover { text-decoration: underline; } .printfooter { font-size: 0.8em; color: #333333; border-top: 1px solid #333333; } .wikitable #toc { font-size: 0.8em; } #toc,.docnav { display: none; } """ fcount = dcount = 0 def rmall(dirPath): # delete dirPath and below global fcount, dcount namesHere = os.listdir(dirPath) for name in namesHere: # remove all contents first path = os.path.join(dirPath, name) if not os.path.isdir(path): # remove simple files os.remove(path) fcount = fcount + 1 else: # recur to remove subdirs rmall(path) os.rmdir(dirPath) # remove now-empty dirPath dcount = dcount + 1 def crawl(site=DEFAULTURL): "downloads an entire wiki site" # tests ############################################### if COMPILE and os.system(QHELPCOMPILER +' -v'): print ("Error: QAssistant not fully installed, exiting.") print (QHELPCOMPILER) return 1 if COMPILE and os.system(QCOLLECTIOMGENERATOR +' -v'): print ("Error: QAssistant not fully installed, exiting.") return 1 if PDFOUTPUT: if PDFCONVERTOR == 'pisa': try: import ho.pisa as pisa except: ("Error: Python-pisa not installed, exiting.") return 1 else: if os.system('htmldoc --version'): print ("Error: Htmldoc not found, exiting.") return 1 try: from pyPdf import PdfFileReader,PdfFileWriter except: print ("Error: Python-pypdf not installed, exiting.") # run ######################################################## URL = site if VERBOSE: print ("crawling 
"), URL, ", saving in ", TMPFOLDER if not os.path.isdir(TMPFOLDER): os.mkdir(TMPFOLDER) file = open(TMPFOLDER + os.sep + "wiki.css",'wb') file.write(css) file.close() todolist = [] count = 1 indexpages = get(INDEX) todolist.extend(indexpages) while todolist: targetpage = todolist.pop() if not targetpage in NORETRIEVE: if VERBOSE: print (count, ": Fetching ", targetpage) pages = get(targetpage) count += 1 processed.append(targetpage) for p in pages: if (not (p in todolist)) and (not (p in processed)): todolist.append(p) if VERBOSE: print ("Fetched ", count, " pages") if PDFOUTPUT: buildpdffiles() joinpdf() if REMOVE: if VERBOSE: print ("Deleting temp files...") rmall(TMPFOLDER) if COMPILE: qhp = buildtoc() qhcp = createCollProjectFile() if generate(qhcp) or compile(qhp): print ("Temp Folder ",TMPFOLDER," has not been deleted.") return 1 else: if REMOVE: if VERBOSE: print ("Deleting temp files...") rmall(TMPFOLDER) if VERBOSE: print ("All done!") return 0 def buildpdffiles(folder=TMPFOLDER,convertor=PDFCONVERTOR): "scans a folder for html files and converts them all to pdf" templist = os.listdir(folder) fileslist = [] for i in templist: if i[-5:] == '.html': fileslist.append(i) for f in fileslist: if convertor == 'pisa': createpdf_pisa(f[:-5],folder) else: createpdf_htmldoc(f[:-5],folder) def fetch_resources(uri, rel): """ Callback to allow pisa/reportlab to retrieve Images,Stylesheets, etc. 'uri' is the href attribute from the html link element. 'rel' gives a relative path, but it's not used here. Note from Yorik: Not working!! 
""" path = os.path.join(TMPFOLDER,uri.replace("./", "")) return path def createpdf_pisa(pagename,folder=TMPFOLDER): "creates a pdf file from a saved page using pisa (python module)" infile = file(folder + os.sep + pagename+'.html','ro') outfile = file(folder + os.sep + pagename+'.pdf','wb') if VERBOSE: print ("Converting " + pagename + " to pdf...") pdf = pisa.CreatePDF(infile,outfile,folder,link_callback=fetch_resources) outfile.close() if pdf.err: return pdf.err return 0 def createpdf_htmldoc(pagename,folder=TMPFOLDER): "creates a pdf file from a saved page using htmldoc (external app, but supports images)" infile = folder + os.sep + pagename+'.html' outfile = folder + os.sep + pagename+'.pdf' return os.system('htmldoc --webpage -f '+outfile+' '+infile) def joinpdf(folder=TMPFOLDER,startpage=INDEX,outputname='freecad.pdf'): "creates one pdf file from several others, following order from startpage" if VERBOSE: print ("Building table of contents...") f = open(folder+os.sep+startpage+'.html') html = '' for line in f: html += line f.close() html = html.replace("\n"," ") html = html.replace("> <","><") html = re.findall("",html)[0] pages = re.findall('href="(.*?)"',html) pages.insert(1,startpage+".html") result = PdfFileWriter() for p in pages: if exists(p[:-5]): if VERBOSE: print ('Appending',p) try: inputfile = PdfFileReader(file(folder+os.sep+p[:-5]+'.pdf','rb')) except: print ('Unable to append',p) else: for i in range(inputfile.getNumPages()): result.addPage(inputfile.getPage(i)) outputfile = file(OUTPUTPATH + os.sep + outputname,'wb') result.write(outputfile) outputfile.close() if VERBOSE: print ('Successfully created',OUTPUTPATH,os.sep,outputname) def compile(qhpfile,outputname='freecad.qch'): "compiles the whole html doc with qassistant" qchfile = OUTPUTPATH + os.sep + outputname if not os.system(QHELPCOMPILER + ' '+qhpfile+' -o '+qchfile): if VERBOSE: print ("Successfully created",qchfile) return 0 def generate(qhcpfile): "generates qassistant-specific 
settings like icon, title, ..." txt=""" The help files for FreeCAD. """ about=open(TMPFOLDER + os.sep + "about.txt","w") about.write(txt) about.close() qhcfile = OUTPUTPATH + os.sep + "freecad.qhc" if not os.system(QCOLLECTIOMGENERATOR+' '+qhcpfile+' -o '+qhcfile): if VERBOSE: print ("Successfully created ",qhcfile) return 0 def createCollProjectFile(folder=TMPFOLDER): qprojectfile = ''' FreeCAD User Manual Crystal_Clear_app_tutorials.png freecad/freecad qthelp://org.freecad.usermanual_0.9/doc/Online_Help_Startpage.html About FreeCAD about.txt Crystal_Clear_app_tutorials.png true true true freecad.qhp freecad.qch freecad.qch ''' if VERBOSE: print ("Building project file...") qfilename = folder + os.sep + "freecad.qhcp" f = open(qfilename,'w') f.write(qprojectfile) f.close() if VERBOSE: print ("Done writing qhcp file.") return qfilename def buildtoc(folder=TMPFOLDER,page=INDEX): "gets the table of contents page and parses its contents into a clean lists structure" qhelpfile = ''' org.freecad.usermanual_0.9 doc ''' def getname(line): line = re.compile('
  • ').sub('',line) line = re.compile('
  • ').sub('',line) title = line.strip() link = '' if "]*>(.*?)',line)[0].strip() link = re.findall('href="(.*?)"',line)[0].strip() return title,link if VERBOSE: print ("Building table of contents...") f = open(folder+os.sep+page+'.html') html = '' for line in f: html += line f.close() html = html.replace("\n"," ") html = html.replace("> <","><") html = re.findall("",html)[0] items = re.findall(']*>.*?|',html) inserttoc = '
    \n' insertkeywords = '' for item in items: if not ("
      " in item): if ("
    " in item): inserttoc += '
    \n' else: link = '' title,link=getname(item) if link: link='" ref="'+link insertkeywords += ('\n') inserttoc += ('
    \n') else: subitems = item.split("
      ") for i in range(len(subitems)): link = '' title,link=getname(subitems[i]) if link: link='" ref="'+link insertkeywords += ('\n') trail = '' if i == len(subitems)-1: trail = '' inserttoc += ('
      '+trail+'\n') inserttoc += '
      \n' insertfiles = "\n" for fil in os.listdir(folder): insertfiles += (""+fil+"\n") insertfiles += "\n" qhelpfile = re.compile('').sub(insertkeywords,qhelpfile) qhelpfile = re.compile('').sub(inserttoc,qhelpfile) qhelpfile = re.compile('').sub(insertfiles,qhelpfile) qfilename = folder + os.sep + "freecad.qhp" f = open(qfilename,'wb') f.write(qhelpfile) f.close() if VERBOSE: print ("Done writing qhp file.") return qfilename def get(page): "downloads a single page, returns the other pages it links to" html = fetchpage(page) html = cleanhtml(html) pages = getlinks(html) html = cleanlinks(html,pages) html = cleanimagelinks(html) output(html,page) return pages def cleanhtml(html): "cleans given html code from dirty script stuff" html = html.replace('\n','Wlinebreak') # removing linebreaks for regex processing html = re.compile('(.*)]+column-content+[^>]+>').sub('',html) # stripping before content html = re.compile(']+column-one+[^>]+>.*').sub('',html) # stripping after content html = re.compile('').sub('',html) # removing comment tags html = re.compile(']*>.*?').sub('',html) # removing script tags html = re.compile('').sub('',html) # removing IE tags html = re.compile('
      ]*>.*?
      ').sub('',html) # removing nav div html = re.compile('

      ]*>.*?

      ').sub('',html) # removing print subtitle html = re.compile('Retrieved from').sub('Online version:',html) # changing online title html = re.compile('