#!/usr/bin/env python #*************************************************************************** #* * #* Copyright (c) 2009 Yorik van Havre * #* * #* This program is free software; you can redistribute it and/or modify * #* it under the terms of the GNU Lesser General Public License (LGPL) * #* as published by the Free Software Foundation; either version 2 of * #* the License, or (at your option) any later version. * #* for detail see the LICENCE text file. * #* * #* This program is distributed in the hope that it will be useful, * #* but WITHOUT ANY WARRANTY; without even the implied warranty of * #* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * #* GNU Library General Public License for more details. * #* * #* You should have received a copy of the GNU Library General Public * #* License along with this program; if not, write to the Free Software * #* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 * #* USA * #* * #*************************************************************************** from __future__ import print_function __title__="buildwikiindex.py" __author__ = "Yorik van Havre " __url__ = "http://www.freecadweb.org" """ This script parses the contents of a wiki site and saves a file containing names of pages and images to be downloaded. """ import sys, os, re, tempfile, getopt from urllib2 import urlopen, HTTPError # CONFIGURATION ################################################# URL = "https://www.freecadweb.org/wiki" #default URL if no URL is passed INDEX = "Online_Help_Toc" # the start page from where to crawl the wiki NORETRIEVE = ['Manual','Developer_hub','Power_users_hub','Users_hub','Source_documentation', 'User_hub','Main_Page','About_this_site','Interesting_links','Syndication_feeds','FreeCAD:General_disclaimer','FreeCAD:About','FreeCAD:Privacy_policy','WikiPages'] # pages that won't be fetched (kept online) GETTRANSLATIONS = True # Set true if you want to get the translations too. 
MAXFAIL = 3 # max number of retries if download fails
VERBOSE = True # to display what's going on. Otherwise, runs totally silent.
WRITETHROUGH = True # if true, fetched files are constantly written to disk, in case of failure.

# END CONFIGURATION ##############################################

wikiindex = "/index.php?title="


def crawl(pagename=None):
    """downloads an entire wiki site

    pagename can be a single page name or a list of page names; in that
    case only those pages (and whatever they link to) are crawled, and the
    list of processed pages and images is returned.

    Without pagename, an interrupted crawl is resumed from wikifiles.txt
    (already processed entries) and todolist.txt (pending pages). If there
    is no todolist.txt, the crawl starts from the links of the INDEX page.
    In that mode the function returns 0.

    Pages listed in NORETRIEVE are skipped. When WRITETHROUGH is set, both
    lists are handed to writeList() after every scanned page.
    """
    todolist = []
    processed = []
    count = 1 # ordinal of the next page to scan, so one ahead of the fetched total
    if pagename:
        if not isinstance(pagename,list):
            pagename = [pagename]
        # work on a copy: the loop below empties todolist, and emptying the
        # caller's list would also make the final "if pagename" test fail
        todolist = list(pagename)
    else:
        if os.path.exists("wikifiles.txt"):
            if VERBOSE: print("Reading existing list...")
            with open("wikifiles.txt","r") as f:
                for line in f:
                    if line.strip() != "":
                        if VERBOSE: print("Adding ",line)
                        processed.append(line.strip())
        if os.path.exists("todolist.txt"):
            if VERBOSE: print("Reading existing todo list...")
            with open("todolist.txt","r") as f:
                for line in f:
                    if line.strip() != "":
                        todolist.append(line.strip())
        else:
            # nothing to resume: seed the crawl with the links of the index page
            indexpages = get(INDEX)[0]
            todolist.extend(indexpages)
    while todolist:
        targetpage = todolist.pop() # last in, first out
        if targetpage not in NORETRIEVE:
            if VERBOSE: print(count, ": Scanning ", targetpage)
            pages,images = get(targetpage)
            count += 1
            processed.append(targetpage)
            processed.extend(images)
            if VERBOSE: print("got",len(pages),"links")
            for p in pages:
                # queue only links that are neither pending nor already done
                if (p not in todolist) and (p not in processed):
                    todolist.append(p)
            if WRITETHROUGH:
                writeList(processed)
                writeList(todolist,"todolist.txt")
    if VERBOSE: print("Fetched ", count - 1, " pages")
    if not WRITETHROUGH:
        writeList(processed)
    if pagename:
        return processed
    return 0


def get(page):
    """downloads a single page, returns a (pages, images) tuple:
    the results of getlinks() and getimagelinks() on the cleaned html"""
    html = fetchpage(page)
    html = cleanhtml(html)
    pages = getlinks(html)
    images = getimagelinks(html)
    return pages,images


def cleanhtml(html):
    "cleans given html code from dirty script stuff"
    html = html.replace('\n','Wlinebreak') # removing linebreaks for regex processing
    html = re.compile('(.*)]+column-content+[^>]+>').sub('',html) # stripping before content
html = re.compile(']+column-one+[^>]+>.*').sub('',html) # stripping after content html = re.compile('').sub('',html) # removing comment tags html = re.compile(']*>.*?').sub('',html) # removing script tags html = re.compile('').sub('',html) # removing IE tags html = re.compile('
]*>.*?
').sub('',html) # removing nav div html = re.compile('

]*>.*?

').sub('',html) # removing print subtitle html = re.compile('Retrieved from').sub('Online version:',html) # changing online title html = re.compile('