#!/usr/bin/env python
#***************************************************************************
#*                                                                         *
#*   Copyright (c) 2009 Yorik van Havre                                    *
#*                                                                         *
#*   This program is free software; you can redistribute it and/or modify *
#*   it under the terms of the GNU Lesser General Public License (LGPL)   *
#*   as published by the Free Software Foundation; either version 2 of    *
#*   the License, or (at your option) any later version.                  *
#*   for detail see the LICENCE text file.                                *
#*                                                                         *
#*   This program is distributed in the hope that it will be useful,      *
#*   but WITHOUT ANY WARRANTY; without even the implied warranty of       *
#*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the         *
#*   GNU Library General Public License for more details.                 *
#*                                                                         *
#*   You should have received a copy of the GNU Library General Public    *
#*   License along with this program; if not, write to the Free Software  *
#*   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  *
#*   USA                                                                   *
#*                                                                         *
#***************************************************************************

from __future__ import print_function

__title__ = "downloadwiki"
__author__ = "Yorik van Havre"
__url__ = "http://www.freecadweb.org"

"""
This script retrieves the contents of a wiki site from a pages list
"""

import sys, os, re, tempfile, getopt
from urllib2 import urlopen, HTTPError

#    CONFIGURATION    #################################################

DEFAULTURL = "https://www.freecadweb.org" # default URL if no URL is passed
INDEX = "Online_Help_Toc" # the start page from where to crawl the wiki
NORETRIEVE = ['Manual','Developer_hub','Power_users_hub','Users_hub',
              'Source_documentation','User_hub','Main_Page','About_this_site',
              'FreeCAD:General_disclaimer','FreeCAD:About',
              'FreeCAD:Privacy_policy','Introduction_to_python'] # pages that won't be fetched (kept online)
GETTRANSLATIONS = False # Set true if you want to get the translations too.
MAXFAIL = 3 # max number of retries if download fails
VERBOSE = True # to display what's going on. Otherwise, runs totally silent.
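
# Usage sketch (not part of the original header; the exact invocation and the
# second page name below are illustrative assumptions -- option handling with
# getopt happens further down in the script):
#
#     python downloadwiki.py
#
# The script reads the pages list from LISTFILE ("wikifiles.txt" by default),
# one wiki page name per line, for example:
#
#     Online_Help_Toc
#     Getting_started
#
# and writes offline HTML copies of those pages into FOLDER ("./localwiki").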
#    END CONFIGURATION    ##############################################

FOLDER = "./localwiki"
LISTFILE = "wikifiles.txt"
URL = DEFAULTURL
wikiindex = "/wiki/index.php?title="
# contents written to default.html; the original markup was lost, this is an
# assumed minimal page pointing to the wiki.css stylesheet written by crawl()
defaultfile = "<html><head><link type='text/css' href='wiki.css' rel='stylesheet'></head><body>&nbsp;</body></html>"
css = """/* Basic CSS for offline wiki rendering */

body {
  font-family: Fira Sans,Arial,Helvetica,sans-serif;
  font-size: 14px;
  text-align: justify;
  /*background: #fff;
  color: #000;*/
  max-width: 800px;
}

h1 {
  font-size: 2.4em;
  font-weight: bold;
  padding: 5px;
  border-radius: 5px;
}

h2 {
  font-weight: normal;
  font-size: 1.6em;
  border-bottom: 1px solid #ddd;
}

h3 {
  padding-left: 20px;
}

img {
  max-width: 100%;
}

li {
  margin-top: 10px;
}

pre, .mw-code {
  text-align: left;
  /*background: #eee;*/
  padding: 5px 5px 5px 20px;
  font-family: mono;
  border-radius: 2px;
}

a:link, a:visited {
  font-weight: bold;
  text-decoration: none;
  color: #2969C4;
}

a:hover {
  text-decoration: underline;
}

.printfooter {
  font-size: 0.8em;
  color: #333333;
  border-top: 1px solid #333;
  margin-top: 20px;
}

.wikitable #toc {
  font-size: 0.8em;
}

.ct, .ctTitle, .ctOdd, .ctEven th {
  font-size: 1em;
  text-align: left;
  width: 190px;
  float: right;
  /*background: #eee;*/
  margin-top: 10px;
  border-radius: 2px;
}

.ct {
  margin-left: 15px;
  padding: 10px;
}

#mw-navigation {
  display:none; /*TODO remove on next build (included below)*/
}
"""

def crawl():
    "downloads an entire wiki site"
    global processed
    processed = []
    if VERBOSE: print("crawling ", URL, ", saving in ", FOLDER)
    if not os.path.isdir(FOLDER): os.mkdir(FOLDER)
    file = open(FOLDER + os.sep + "wiki.css",'wb')
    file.write(css)
    file.close()
    dfile = open(FOLDER + os.sep + "default.html",'wb')
    dfile.write(defaultfile)
    dfile.close()
    lfile = open(LISTFILE)
    global locallist
    locallist = []
    for l in lfile:
        locallist.append(l.replace("\n",""))
    lfile.close()
    todolist = locallist[:]
    print("getting",len(todolist),"files...")
    count = 1
    indexpages = get(INDEX)
    while todolist:
        targetpage = todolist.pop()
        if VERBOSE: print(count, ": Fetching ", targetpage)
        get(targetpage)
        count += 1
    if VERBOSE: print("Fetched ", count, " pages")
    if VERBOSE: print("All done!")
    return 0

def get(page):
    "downloads a single page, returns the other pages it links to"
    localpage = page
    if "Command_Reference" in localpage:
        localpage = localpage.replace("Category:","")
        localpage = localpage.replace("&pagefrom=","+")
        localpage = localpage.replace("#mw-pages","")
    if page[-4:] in [".png",".jpg",".svg",".gif","jpeg",".PNG",".JPG"]:
        fetchimage(page)
    elif not exists(localpage):
        html = fetchpage(page)
        html = cleanhtml(html)
        pages = getlinks(html)
        html = cleanlinks(html,pages)
        html = cleanimagelinks(html)
        output(html,page)
    else:
        if VERBOSE: print("    skipping",page)

def getlinks(html):
    "returns a list of wikipage links in html file"
    links = re.findall('<a[^>]*>.*?</a>',html)
    pages = []
    for l in links:
        # rg = re.findall('php\?title=(.*)\" title',l)
        rg = re.findall('href=.*?php\?title=(.*?)"',l)
        if not rg:
            rg = re.findall('href="\/wiki\/(.*?)"',l)
        if rg:
            rg = rg[0]
            if not "Command_Reference" in rg:
                if "#" in rg:
                    rg = rg.split('#')[0]
                if ":" in rg:
                    NORETRIEVE.append(rg)
                if ";" in rg:
                    NORETRIEVE.append(rg)
                if "&" in rg:
                    NORETRIEVE.append(rg)
                if "/" in rg:
                    if not GETTRANSLATIONS:
                        NORETRIEVE.append(rg)
            pages.append(rg)
    return pages

def getimagelinks(html):
    "returns a list of image links found in an html file"
    return re.findall('<img.*?src="(.*?)"',html)

def cleanhtml(html):
    "cleans given html code from dirty script stuff"
    html = html.replace('\n','Wlinebreak') # removing linebreaks for regex processing
    html = re.compile('(.*)<div[^>]+column-content+[^>]+>').sub('',html) # stripping before content
    html = re.compile('<div id="jump-to-nav"[^>]*>.*?</div>').sub('',html) # removing nav div
    html = re.compile('<h3 id="siteSub"[^>]*>.*?</h3>').sub('',html) # removing print subtitle
    html = re.compile('Retrieved from').sub('Online version:',html) # changing online title
    html = re.compile('