#!/usr/bin/env python
#***************************************************************************
#*                                                                         *
#*   Copyright (c) 2009 Yorik van Havre <yorik@uncreated.net>              *
#*                                                                         *
#*   This program is free software; you can redistribute it and/or modify *
#*   it under the terms of the GNU Lesser General Public License (LGPL)   *
#*   as published by the Free Software Foundation; either version 2 of    *
#*   the License, or (at your option) any later version.                  *
#*   for detail see the LICENCE text file.                                *
#*                                                                         *
#*   This program is distributed in the hope that it will be useful,      *
#*   but WITHOUT ANY WARRANTY; without even the implied warranty of       *
#*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the        *
#*   GNU Library General Public License for more details.                 *
#*                                                                         *
#*   You should have received a copy of the GNU Library General Public    *
#*   License along with this program; if not, write to the Free Software  *
#*   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  *
#*   USA                                                                  *
#*                                                                         *
#***************************************************************************

__title__ = "buildwikiindex.py"
__author__ = "Yorik van Havre <yorik@uncreated.net>"
__url__ = "http://www.freecadweb.org"

"""
This script parses the contents of a wiki site and saves a file containing
the names of the pages and images to be downloaded.
"""
import sys, os, re, tempfile, getopt
from urllib2 import urlopen, HTTPError # Python 2 only; this script uses Python 2 syntax throughout

# CONFIGURATION #################################################

URL = "https://www.freecadweb.org/wiki" # default URL, if no other URL is given
INDEX = "Online_Help_Toc" # the start page from which to crawl the wiki
NORETRIEVE = ['Manual','Developer_hub','Power_users_hub','Users_hub',
              'Source_documentation','User_hub','Main_Page','About_this_site',
              'Interesting_links','Syndication_feeds','FreeCAD:General_disclaimer',
              'FreeCAD:About','FreeCAD:Privacy_policy','WikiPages'] # pages that won't be fetched (kept online)
GETTRANSLATIONS = False # set to True to also fetch the translations
MAXFAIL = 3 # max number of retries if a download fails
VERBOSE = True # display what's going on; otherwise the script runs silently
WRITETHROUGH = True # if True, the page lists are written to disk after every page, so progress survives a failure

# END CONFIGURATION ##############################################

wikiindex = "/index.php?title="
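
# Pages are fetched from URL + wikiindex + pagename; a hypothetical page
# "Draft_Line" would thus be retrieved from
# https://www.freecadweb.org/wiki/index.php?title=Draft_Line
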
def crawl(pagename=[]):
    "downloads an entire wiki site, or only the given page(s)"
    todolist = []
    processed = []
    count = 1
    if pagename:
        # crawl only the given page name(s)
        if not isinstance(pagename,list):
            pagename = [pagename]
        todolist = pagename
    else:
        # resume a previous run if its list files are present
        if os.path.exists("wikifiles.txt"):
            f = open("wikifiles.txt","r")
            if VERBOSE: print "Reading existing list..."
            for l in f.readlines():
                if l.strip() != "":
                    if VERBOSE: print "Adding ",l
                    processed.append(l.strip())
            f.close()
        if os.path.exists("todolist.txt"):
            f = open("todolist.txt","r")
            if VERBOSE: print "Reading existing todo list..."
            for l in f.readlines():
                if l.strip() != "":
                    todolist.append(l.strip())
            f.close()
        else:
            # otherwise start crawling from the index page
            indexpages,imgs = get(INDEX)
            todolist.extend(indexpages)
    while todolist:
        targetpage = todolist.pop()
        if targetpage not in NORETRIEVE:
            if VERBOSE: print count, ": Scanning ", targetpage
            pages,images = get(targetpage)
            count += 1
            processed.append(targetpage)
            processed.extend(images)
            if VERBOSE: print "got",len(pages),"links"
            for p in pages:
                if (p not in todolist) and (p not in processed):
                    todolist.append(p)
            if WRITETHROUGH:
                writeList(processed)
                writeList(todolist,"todolist.txt")
    if VERBOSE: print "Fetched ", count, " pages"
    if not WRITETHROUGH:
        writeList(processed)
    if pagename:
        return processed
    return 0
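
# crawl() can also be used from another script; a minimal sketch, where
# "Draft_Line" is only a hypothetical page name:
#
#   import buildwikiindex
#   done = buildwikiindex.crawl("Draft_Line") # returns the processed pages and images
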
def get(page):
    "downloads a single page and returns the pages and images it links to"
    html = fetchpage(page)
    html = cleanhtml(html)
    pages = getlinks(html)
    images = getimagelinks(html)
    return pages,images

def cleanhtml(html):
    "cleans the given html code of scripts, navigation and other unwanted markup"
    html = html.replace('\n','Wlinebreak') # fold linebreaks into a marker so the patterns below match across lines
    html = re.compile('(.*)<div[^>]+column-content+[^>]+>').sub('',html) # stripping everything before the content
    html = re.compile('<div[^>]+column-one+[^>]+>.*').sub('',html) # stripping everything after the content
    html = re.compile('<!--[^>]+-->').sub('',html) # removing comment tags
    html = re.compile('<script[^>]*>.*?</script>').sub('',html) # removing script tags
    html = re.compile('<!--\[if[^>]*>.*?endif\]-->').sub('',html) # removing IE conditional comments
    html = re.compile('<div id="jump-to-nav"[^>]*>.*?</div>').sub('',html) # removing nav div
    html = re.compile('<h3 id="siteSub"[^>]*>.*?</h3>').sub('',html) # removing print subtitle
    html = re.compile('Retrieved from').sub('Online version:',html) # changing online title
    html = re.compile('<div id="mw-normal-catlinks[^>]*>.*?</div>').sub('',html) # removing catlinks
    html = re.compile('<div class="NavHead.*?</div>').sub('',html) # removing nav stuff
    html = re.compile('<div class="NavContent.*?</div>').sub('',html) # removing nav stuff
    html = re.compile('<div class="NavEnd.*?</div>').sub('',html) # removing nav stuff
    html = re.compile('<div class="mw-pt-translate-header.*?</div>').sub('',html) # removing translation header
    if not GETTRANSLATIONS:
        html = re.compile('<div class="languages.*?</div>').sub('',html) # removing translation links
        html = re.compile('<div class="mw-pt-languages.*?</div>').sub('',html) # removing translation links
    html = re.compile('Wlinebreak').sub('\n',html) # restoring the original linebreaks
    return html

def getlinks(html):
    "returns a list of wikipage links found in the html code"
    global NORETRIEVE
    links = re.findall('<a[^>]*>.*?</a>',html)
    pages = []
    for l in links:
        # matches links of the form href="...php?title=Pagename"
        rg = re.findall('href=.*?php\?title=(.*?)"',l)
        if not rg:
            # matches links of the form href="/wiki/Pagename"
            rg = re.findall('href="\/wiki\/(.*?)"',l)
            if "images" in rg:
                rg = None
        if rg:
            rg = rg[0]
            if not "Command_Reference" in rg:
                if "#" in rg:
                    rg = rg.split('#')[0] # strip the section anchor
                # pages whose names contain special characters, and subpages
                # (translations), are added to the kept-online list
                if ":" in rg:
                    NORETRIEVE.append(rg)
                if "&" in rg:
                    NORETRIEVE.append(rg)
                if ";" in rg:
                    NORETRIEVE.append(rg)
                if "/" in rg:
                    if not GETTRANSLATIONS:
                        NORETRIEVE.append(rg)
                if not rg in NORETRIEVE:
                    pages.append(rg)
                    if VERBOSE: print "got link: ",rg
    return pages

def getimagelinks(html):
    "returns a list of image links found in the html code"
    imlinks = re.findall('<img.*?src="(.*?)"',html)
    imlinks = [l for l in imlinks if not l.startswith("http")] # skip external images
    return imlinks
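
# For example, a (hypothetical) local image tag such as
#   <img src="/wiki/images/a/ab/Draft_Line.png" ...>
# yields "/wiki/images/a/ab/Draft_Line.png", while externally hosted images
# (src starting with "http") are skipped.
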
def fetchpage(page):
    "retrieves the given page from the wiki, retrying up to MAXFAIL times"
    if VERBOSE: print "fetching: ",page
    failcount = 0
    while failcount < MAXFAIL:
        try:
            html = (urlopen(URL + wikiindex + page).read())
            return html
        except HTTPError:
            failcount += 1
    print 'Error: unable to fetch page ' + page
    sys.exit()

def cleanList(pagelist):
    "removes duplicates and redlinks (links to pages that don't exist yet) from the given list"
    npages = []
    for p in pagelist:
        if not p in npages:
            if not "redlink" in p:
                npages.append(p)
    return npages

def writeList(pages,filename="wikifiles.txt"):
    "writes the given list of pages to the given file, one name per line"
    pages = cleanList(pages)
    f = open(filename,"wb")
    for p in pages:
        f.write(p+"\n")
    f.close()
    if VERBOSE: print "written ",filename

if __name__ == "__main__":
    crawl(sys.argv[1:])