
Legacy `print` statements are syntax errors in Python 3, but the `print()` function works as expected in both Python 2 and Python 3.
349 lines
12 KiB
Python
Executable File
349 lines
12 KiB
Python
Executable File
#!/usr/bin/env python
|
|
|
|
#***************************************************************************
|
|
#* *
|
|
#* Copyright (c) 2009 Yorik van Havre <yorik@uncreated.net> *
|
|
#* *
|
|
#* This program is free software; you can redistribute it and/or modify *
|
|
#* it under the terms of the GNU Lesser General Public License (LGPL) *
|
|
#* as published by the Free Software Foundation; either version 2 of *
|
|
#* the License, or (at your option) any later version. *
|
|
#* for detail see the LICENCE text file. *
|
|
#* *
|
|
#* This program is distributed in the hope that it will be useful, *
|
|
#* but WITHOUT ANY WARRANTY; without even the implied warranty of *
|
|
#* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
|
|
#* GNU Library General Public License for more details. *
|
|
#* *
|
|
#* You should have received a copy of the GNU Library General Public *
|
|
#* License along with this program; if not, write to the Free Software *
|
|
#* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 *
|
|
#* USA *
|
|
#* *
|
|
#***************************************************************************
|
|
|
|
from __future__ import print_function
|
|
# Module metadata.
__title__ = "downloadwiki"
__author__ = "Yorik van Havre <yorik@uncreated.net>"
__url__ = "http://www.freecadweb.org"

# The description cannot act as a real module docstring because it is not the
# first statement of the file, so it is assigned to __doc__ explicitly instead
# of being left as a discarded string expression.
__doc__ = """
This script retrieves the contents of a wiki site from a pages list
"""
|
|
|
|
import sys, os, re, tempfile, getopt
|
|
from urllib2 import urlopen, HTTPError
|
|
|
|
# CONFIGURATION #################################################

DEFAULTURL = "https://www.freecadweb.org"  # default URL if no URL is passed
INDEX = "Online_Help_Toc"  # the start page from where to crawl the wiki

# Pages that won't be fetched (kept online). This must stay a mutable list:
# getlinks() appends further page names to it while crawling.
NORETRIEVE = [
    'Manual',
    'Developer_hub',
    'Power_users_hub',
    'Users_hub',
    'Source_documentation',
    'User_hub',
    'Main_Page',
    'About_this_site',
    'FreeCAD:General_disclaimer',
    'FreeCAD:About',
    'FreeCAD:Privacy_policy',
    'Introduction_to_python',
]
GETTRANSLATIONS = False  # Set true if you want to get the translations too.
MAXFAIL = 3  # max number of retries if download fails
VERBOSE = True  # to display what's going on. Otherwise, runs totally silent.

# END CONFIGURATION ##############################################

FOLDER = "./localwiki"      # directory the pages, images and css are written to
LISTFILE = "wikifiles.txt"  # text file read by crawl(), one page name per line
URL = DEFAULTURL            # base URL every download address is built from
wikiindex = "/wiki/index.php?title="  # appended to URL, followed by the page name
# Placeholder page written as default.html by crawl().
defaultfile = "<html><head><link type='text/css' href='wiki.css' rel='stylesheet'></head><body> </body></html>"
|
|
# Stylesheet written verbatim to FOLDER/wiki.css by crawl(); every generated
# page links to it.
css = """/* Basic CSS for offline wiki rendering */

body {
  font-family: Fira Sans,Arial,Helvetica,sans-serif;
  font-size: 14px;
  text-align: justify;
  /*background: #fff;
  color: #000;*/
  max-width: 800px;
}

h1 {
  font-size: 2.4em;
  font-weight: bold;
  padding: 5px;
  border-radius: 5px;
}

h2 {
  font-weight: normal;
  font-size: 1.6em;
  border-bottom: 1px solid #ddd;
}

h3 {
  padding-left: 20px;
}

img {
  max-width: 100%;
}

li {
  margin-top: 10px;
}

pre, .mw-code {
  text-align: left;
  /*background: #eee;*/
  padding: 5px 5px 5px 20px;
  font-family: mono;
  border-radius: 2px;
}

a:link, a:visited {
  font-weight: bold;
  text-decoration: none;
  color: #2969C4;
}

a:hover {
  text-decoration: underline;
}

.printfooter {
  font-size: 0.8em;
  color: #333333;
  border-top: 1px solid #333;
  margin-top: 20px;
}

.wikitable #toc {
  font-size: 0.8em;
}

.ct, .ctTitle, .ctOdd, .ctEven th {
  font-size: 1em;
  text-align: left;
  width: 190px;
  float: right;
  /*background: #eee;*/
  margin-top: 10px;
  border-radius: 2px;
}

.ct {
  margin-left: 15px;
  padding: 10px;
}
#mw-navigation {
  display:none; /*TODO remove on next build (included below)*/
}
"""
|
|
|
|
def crawl():
    """Download every page named in LISTFILE into FOLDER.

    Creates FOLDER when missing, writes wiki.css and default.html into it,
    reads the page list from LISTFILE (one name per line), fetches the INDEX
    page and then every listed page through get().

    Side effects: (re)initialises the module globals ``processed`` and
    ``locallist``.

    Returns 0 when the whole list has been walked.
    """
    global processed, locallist
    processed = []
    if VERBOSE:
        print("crawling ", URL, ", saving in ", FOLDER)
    if not os.path.isdir(FOLDER):
        os.mkdir(FOLDER)

    # 'with' guarantees the handles are closed even if a write fails
    with open(FOLDER + os.sep + "wiki.css", 'wb') as cssfile:
        cssfile.write(css)
    with open(FOLDER + os.sep + "default.html", 'wb') as dfile:
        dfile.write(defaultfile)

    with open(LISTFILE) as lfile:
        names = [line.rstrip("\r\n") for line in lfile]
    # a blank line is not a page name; fetching "" would request a bogus title
    locallist = [name for name in names if name]

    todolist = locallist[:]
    print("getting", len(todolist), "files...")
    get(INDEX)

    fetched = 0
    while todolist:
        targetpage = todolist.pop()
        fetched += 1
        if VERBOSE:
            print(fetched, ": Fetching ", targetpage)
        get(targetpage)

    if VERBOSE:
        # report the number of pages actually handled (was one too many)
        print("Fetched ", fetched, " pages")
        print("All done!")
    return 0
|
|
|
|
def get(page):
    """Download a single page or image unless a local copy already exists.

    Names ending in a known image extension are handed to fetchimage().
    Any other name is fetched, cleaned, has its links rewritten and is saved
    through output().

    Returns the list of page names linked from a freshly downloaded page, and
    an empty list when nothing was downloaded (image, already present, or
    fetch failure).
    """
    # Command_Reference category pages are stored under a simplified name
    localpage = page
    if "Command_Reference" in localpage:
        localpage = localpage.replace("Category:", "")
        localpage = localpage.replace("&pagefrom=", "+")
        localpage = localpage.replace("#mw-pages", "")

    if page.endswith((".png", ".jpg", ".svg", ".gif", "jpeg", ".PNG", ".JPG")):
        fetchimage(page)
        return []

    if exists(localpage):
        if VERBOSE:
            print(" skipping", page)
        return []

    html = fetchpage(page)
    if html is None:
        # fetchpage() gave up after MAXFAIL attempts and already printed the
        # error; there is nothing to clean or save
        return []

    html = cleanhtml(html)
    pages = getlinks(html)
    html = cleanlinks(html, pages)
    html = cleanimagelinks(html)
    output(html, page)
    return pages
|
|
|
|
def getlinks(html):
    """Return the wiki page names linked from the anchors found in *html*.

    A target is taken from an ``...php?title=NAME"`` href, or failing that
    from a ``/wiki/NAME"`` href. Anchors matching neither form are ignored.
    The result keeps document order and may contain duplicates.

    Side effect: targets (other than Command_Reference ones) that contain
    ':', ';' or '&', or '/' while GETTRANSLATIONS is off, are recorded in the
    global NORETRIEVE list so cleanlinks() keeps them pointing online. Each
    name is recorded once only.
    """
    pages = []
    for anchor in re.findall(r'<a[^>]*>.*?</a>', html):
        found = re.findall(r'href=.*?php\?title=(.*?)"', anchor)
        if not found:
            found = re.findall(r'href="\/wiki\/(.*?)"', anchor)
        if not found:
            continue
        target = found[0]
        if "Command_Reference" not in target:
            if "#" in target:
                # drop the in-page fragment, keep the page name
                target = target.split('#')[0]
            keep_online = (
                any(char in target for char in ":;&")
                or ("/" in target and not GETTRANSLATIONS)
            )
            # guard against the list growing by one entry per occurrence
            if keep_online and target not in NORETRIEVE:
                NORETRIEVE.append(target)
        pages.append(target)
    return pages
|
|
|
|
def getimagelinks(html):
    """Return the ``src`` value of every ``<img>`` tag in *html*, in order."""
    return re.findall(r'<img.*?src="(.*?)"', html)
|
|
|
|
def cleanhtml(html):
    """Strip the wiki chrome from a downloaded page and return the content.

    Line breaks are temporarily replaced by the marker ``Wlinebreak`` so that
    '.' in the patterns can span what were several lines; the marker is turned
    back into '\\n' at the end. Tabs are removed.

    The substitutions are order-dependent and are applied top to bottom.
    Translation link boxes are removed only when GETTRANSLATIONS is off.
    """
    html = html.replace('\n', 'Wlinebreak')  # removing linebreaks for regex processing
    html = html.replace('\t', '')            # removing tab marks

    substitutions = [
        (r'(.*)<div id="content+[^>]+>', ''),             # stripping before content
        (r'<div id="mw-head+[^>]+>.*', ''),               # stripping after content
        (r'<!--[^>]+-->', ''),                            # removing comment tags
        (r'<script[^>]*>.*?</script>', ''),               # removing script tags
        (r'<!--\[if[^>]*>.*?endif\]-->', ''),             # removing IE tags
        (r'<div id="jump-to-nav"[^>]*>.*?</div>', ''),    # removing nav div
        (r'<h3 id="siteSub"[^>]*>.*?</h3>', ''),          # removing print subtitle
        (r'Retrieved from', 'Online version:'),           # changing online title
        (r'<div id="mw-normal-catlinks.*?</div>', ''),    # removing catlinks
        (r'<div class="NavHead.*?</div>', ''),            # removing nav stuff
        (r'<div class="NavContent.*?</div>', ''),         # removing nav stuff
        (r'<div class="NavEnd.*?</div>', ''),             # removing nav stuff
        (r'<div id="mw-navigation.*?</div>', ''),         # removing nav stuff
        (r'<table id="toc.*?</table>', ''),               # removing toc
        # removing command box styling
        (r'width="100%" style="float: right; width: 230px; margin-left: 1em"', ''),
        (r'<div class="docnav.*?</div>Wlinebreak</div>', ''),      # removing docnav
        (r'<div class="mw-pt-translate-header.*?</div>', ''),      # removing translations links
    ]
    if not GETTRANSLATIONS:
        substitutions.append((r'<div class="languages.*?</div>', ''))        # removing translations links
        substitutions.append((r'<div class="mw-pt-languages.*?</div>', ''))  # removing translations links
    substitutions.append((r'Wlinebreak', '\n'))  # restoring original linebreaks

    for pattern, replacement in substitutions:
        html = re.sub(pattern, replacement, html)
    return html
|
|
|
|
|
|
def cleanlinks(html, pages=None):
    """Rewrite the page links in *html* for offline use and return the result.

    *pages* is the list of linked page names; when empty or omitted it is
    computed with getlinks(html). A link to a page listed in NORETRIEVE is
    pointed at the online wiki (URL + wikiindex + page); any other link is
    pointed at the local ``<page>.html`` file, with '/' replaced by '-'.
    Command_Reference links get extra clean-up so they match the file names
    used when those pages are saved.
    """
    if not pages:
        pages = getlinks(html)
    for page in pages:
        if page in NORETRIEVE:
            replacement = 'href="' + URL + wikiindex + page + '"'
        else:
            replacement = 'href="' + page.replace("/", "-") + '.html"'
        # The page name is literal text: escape it so characters such as
        # '(', '+' or '.' are not read as regex syntax, and substitute through
        # a function so backslashes in the replacement are not interpreted.
        pattern = re.compile('href="[^"]+' + re.escape(page) + '"')
        html = pattern.sub(lambda match, text=replacement: text, html)
        if "Command_Reference" in replacement:
            html = html.replace("Category:", "")
            html = html.replace("&pagefrom=", "+")
            html = html.replace("#mw-pages", ".html")
            html = html.replace("/wiki/index.php?title=Command_Reference", "Command_Reference")
    return html
|
|
|
|
def cleanimagelinks(html, links=None):
    """Replace every image path in *html* by its bare file name.

    *links* is the list of image paths to rewrite; when empty or omitted it is
    computed with getimagelinks(html). A path without any '/' is left as is.
    Returns the modified html.
    """
    if not links:
        links = getimagelinks(html)
    for link in links:
        # the greedy '.*/' consumes everything up to the last slash
        basename = re.findall(r'.*/(.*)', link)
        if basename:
            html = html.replace(link, basename[0])
    return html
|
|
|
|
def fetchpage(page):
    """Retrieve *page* from the wiki and return its raw content.

    The address is URL + wikiindex + page. A request that raises HTTPError is
    retried, up to MAXFAIL attempts in total; other exceptions propagate.

    Returns the downloaded data, or None (after printing an error) when every
    attempt failed.
    """
    print(" fetching: ", page)
    for _attempt in range(MAXFAIL):
        try:
            return urlopen(URL + wikiindex + page).read()
        except HTTPError:
            continue
    print('Error: unable to fetch page ' + page)
    return None
|
|
|
|
def fetchimage(imagelink):
    """Retrieve the image at *imagelink* from the wiki and save it in FOLDER.

    Links starting with "File:" are file description pages, not images, and
    are skipped. The local file name is the part of the link after the last
    '/'. Nothing is downloaded when that file already exists. A failing
    download is retried, up to MAXFAIL attempts in total; on success the file
    name is appended to the global ``processed`` list.
    """
    if imagelink.startswith("File:"):
        print("Skipping file page link")
        return

    # rsplit also copes with a link that has no '/' (indexing the regex result
    # raised IndexError in that case)
    filename = imagelink.rsplit("/", 1)[-1]

    if exists(filename, image=True):
        if VERBOSE:
            print(" skipping", filename)
        return

    path = local(filename, image=True)
    for _attempt in range(MAXFAIL):
        try:
            if VERBOSE:
                print(" fetching " + filename)
            data = urlopen(URL + imagelink).read()
            with open(path, 'wb') as imagefile:
                imagefile.write(data)
        except Exception:
            # best-effort retry; unlike a bare 'except:' this still lets
            # KeyboardInterrupt and SystemExit stop the crawl
            continue
        processed.append(filename)
        if VERBOSE:
            print(" saving", path)
        return
    print('Error: unable to fetch file ' + filename)
|
|
|
|
def local(page, image=False):
    """Return the path inside FOLDER for a given page or image.

    Images keep their name unchanged; pages get an '.html' suffix.
    """
    suffix = '' if image else '.html'
    return FOLDER + os.sep + page + suffix
|
|
|
|
def exists(page, image=False):
    """Tell whether the local copy of the given page or image already exists.

    '/' in the name is replaced by '-' before the path is built, matching the
    file names used when pages are saved.
    """
    return os.path.exists(local(page.replace("/", "-"), image))
|
|
|
|
def webroot(url):
    """Return the scheme and host part of *url*, e.g. 'https://example.org'.

    Both http and https addresses are accepted, with or without a path after
    the host. Raises IndexError when *url* contains no http(s) address.
    """
    return re.findall(r'(https?://[^/]*)', url)[0]
|
|
|
|
def output(html, page):
    """Wrap raw page content in a complete html document and save it.

    The title is the page name with '_' shown as spaces; it is used for both
    <title> and the leading <h1>. The document links to wiki.css. The target
    file is local(page) with '/' replaced by '-'; Command_Reference pages get
    the same name clean-up that get() applies before checking for a local
    copy.
    """
    title = page.replace("_", " ")
    header = "".join([
        "<html><head>",
        "<title>" + title + "</title>",
        '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">',
        "<link type='text/css' href='wiki.css' rel='stylesheet'>",
        "</head><body>",
        "<h1>" + title + "</h1>",
    ])
    footer = "</body></html>"
    document = header + html + footer

    filename = local(page.replace("/", "-"))
    if "Command_Reference" in filename:
        filename = filename.replace("Category:", "")
        filename = filename.replace("&pagefrom=", "+")
        filename = filename.replace("#mw-pages", "")
        filename = filename.replace(".html.html", ".html")

    print(" saving", filename)
    # 'with' closes the handle even when the write fails
    with open(filename, 'wb') as htmlfile:
        htmlfile.write(document)
|
|
|
|
if __name__ == "__main__":
    # crawl() returns 0 once the list has been processed; pass it on as the
    # process exit status
    sys.exit(crawl())
|
|
|