#!/usr/bin/env python
#***************************************************************************
#*                                                                         *
#*   Copyright (c) 2009 Yorik van Havre <yorik@uncreated.net>              *
#*                                                                         *
#*   This program is free software; you can redistribute it and/or modify  *
#*   it under the terms of the GNU Lesser General Public License (LGPL)    *
#*   as published by the Free Software Foundation; either version 2 of     *
#*   the License, or (at your option) any later version.                   *
#*   for detail see the LICENCE text file.                                 *
#*                                                                         *
#*   This program is distributed in the hope that it will be useful,       *
#*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
#*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
#*   GNU Library General Public License for more details.                  *
#*                                                                         *
#*   You should have received a copy of the GNU Library General Public     *
#*   License along with this program; if not, write to the Free Software   *
#*   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307   *
#*   USA                                                                   *
#*                                                                         *
#***************************************************************************

__title__ = "downloadwiki"
__author__ = "Yorik van Havre <yorik@uncreated.net>"
__url__ = "http://www.freecadweb.org"

"""
This script retrieves the contents of a wiki site from a pages list
"""

import sys, os, re, tempfile, getopt
from urllib2 import urlopen, HTTPError

# CONFIGURATION #################################################

DEFAULTURL = "https://www.freecadweb.org" # default URL if no URL is passed
INDEX = "Online_Help_Toc" # the start page from where to crawl the wiki
NORETRIEVE = ['Manual','Developer_hub','Power_users_hub','Users_hub',
              'Source_documentation','User_hub','Main_Page','About_this_site',
              'FreeCAD:General_disclaimer','FreeCAD:About',
              'FreeCAD:Privacy_policy','Introduction_to_python'] # pages that won't be fetched (kept online)
GETTRANSLATIONS = False # Set true if you want to get the translations too.
MAXFAIL = 3 # max number of retries if download fails
VERBOSE = True # to display what's going on. Otherwise, runs totally silent.
# END CONFIGURATION ##############################################

FOLDER = "./localwiki"
LISTFILE = "wikifiles.txt"
URL = DEFAULTURL
wikiindex = "/wiki/index.php?title="
defaultfile = "<html><head><link type='text/css' href='wiki.css' rel='stylesheet'></head><body> </body></html>"
css = """/* Basic CSS for offline wiki rendering */

body {
  font-family: Fira Sans,Arial,Helvetica,sans-serif;
  font-size: 14px;
  text-align: justify;
  /*background: #fff;
  color: #000;*/
  max-width: 800px;
}

h1 {
  font-size: 2.4em;
  font-weight: bold;
  padding: 5px;
  border-radius: 5px;
}

h2 {
  font-weight: normal;
  font-size: 1.6em;
  border-bottom: 1px solid #ddd;
}

h3 {
  padding-left: 20px;
}

img {
  max-width: 100%;
}

li {
  margin-top: 10px;
}

pre, .mw-code {
  text-align: left;
  /*background: #eee;*/
  padding: 5px 5px 5px 20px;
  font-family: monospace;
  border-radius: 2px;
}

a:link, a:visited {
  font-weight: bold;
  text-decoration: none;
  color: #2969C4;
}

a:hover {
  text-decoration: underline;
}

.printfooter {
  font-size: 0.8em;
  color: #333333;
  border-top: 1px solid #333;
  margin-top: 20px;
}

.wikitable #toc {
  font-size: 0.8em;
}

.ct, .ctTitle, .ctOdd, .ctEven th {
  font-size: 1em;
  text-align: left;
  width: 190px;
  float: right;
  /*background: #eee;*/
  margin-top: 10px;
  border-radius: 2px;
}

.ct {
  margin-left: 15px;
  padding: 10px;
}

#mw-navigation {
  display:none; /*TODO remove on next build (included below)*/
}
"""

def crawl():
    "downloads an entire wiki site"
    global processed
    processed = []
    if VERBOSE: print "crawling ", URL, ", saving in ", FOLDER
    if not os.path.isdir(FOLDER): os.mkdir(FOLDER)
    file = open(FOLDER + os.sep + "wiki.css",'wb')
    file.write(css)
    file.close()
    dfile = open(FOLDER + os.sep + "default.html",'wb')
    dfile.write(defaultfile)
    dfile.close()
    lfile = open(LISTFILE)
    global locallist
    locallist = []
    for l in lfile:
        locallist.append(l.replace("\n",""))
    lfile.close()
    todolist = locallist[:]
    print "getting",len(todolist),"files..."
    count = 1
    indexpages = get(INDEX) # fetch and save the index page itself; its links are not queued
    while todolist:
        targetpage = todolist.pop()
        if VERBOSE: print count, ": Fetching ", targetpage
        get(targetpage)
        count += 1
    if VERBOSE: print "Fetched ", count - 1, " pages" # count was advanced one past the last page
    if VERBOSE: print "All done!"
    return 0

def get(page):
    "downloads a single page, returns the other pages it links to"
    localpage = page
    if "Command_Reference" in localpage:
        localpage = localpage.replace("Category:","")
        localpage = localpage.replace("&pagefrom=","+")
        localpage = localpage.replace("#mw-pages","")
    pages = []
    if page[-4:] in [".png",".jpg",".svg",".gif","jpeg",".PNG",".JPG"]:
        fetchimage(page)
    elif not exists(localpage):
        html = fetchpage(page)
        html = cleanhtml(html)
        pages = getlinks(html)
        html = cleanlinks(html,pages)
        html = cleanimagelinks(html)
        output(html,page)
    else:
        if VERBOSE: print " skipping",page
    return pages

def getlinks(html):
    "returns a list of wikipage links in html file"
    links = re.findall('<a[^>]*>.*?</a>',html)
    pages = []
    for l in links:
        # rg = re.findall('php\?title=(.*)\" title',l)
        rg = re.findall('href=.*?php\?title=(.*?)"',l)
        if not rg:
            rg = re.findall('href="\/wiki\/(.*?)"',l)
        if rg:
            rg = rg[0]
            if not "Command_Reference" in rg:
                if "#" in rg:
                    rg = rg.split('#')[0]
                if ":" in rg:
                    NORETRIEVE.append(rg)
                if ";" in rg:
                    NORETRIEVE.append(rg)
                if "&" in rg:
                    NORETRIEVE.append(rg)
                if "/" in rg:
                    if not GETTRANSLATIONS:
                        NORETRIEVE.append(rg)
            pages.append(rg)
    return pages

def getimagelinks(html):
    "returns a list of image links found in an html file"
    return re.findall('<img.*?src="(.*?)"',html)

def cleanhtml(html):
    "cleans given html code from dirty script stuff"
    html = html.replace('\n','Wlinebreak') # removing linebreaks for regex processing
    html = html.replace('\t','') # removing tab marks
    html = re.compile('(.*)<div id=\"content+[^>]+>').sub('',html) # stripping before content
    html = re.compile('<div id="mw-head+[^>]+>.*').sub('',html) # stripping after content
    html = re.compile('<!--[^>]+-->').sub('',html) # removing comment tags
    html = re.compile('<script[^>]*>.*?</script>').sub('',html) # removing script tags
    html = re.compile('<!--\[if[^>]*>.*?endif\]-->').sub('',html) # removing IE tags
    html = re.compile('<div id="jump-to-nav"[^>]*>.*?</div>').sub('',html) # removing nav div
    html = re.compile('<h3 id="siteSub"[^>]*>.*?</h3>').sub('',html) # removing print subtitle
    html = re.compile('Retrieved from').sub('Online version:',html) # changing online title
    html = re.compile('<div id="mw-normal-catlinks.*?</div>').sub('',html) # removing catlinks
    html = re.compile('<div class="NavHead.*?</div>').sub('',html) # removing nav stuff
    html = re.compile('<div class="NavContent.*?</div>').sub('',html) # removing nav stuff
    html = re.compile('<div class="NavEnd.*?</div>').sub('',html) # removing nav stuff
    html = re.compile('<div id="mw-navigation.*?</div>').sub('',html) # removing nav stuff
    html = re.compile('<table id="toc.*?</table>').sub('',html) # removing toc
    html = re.compile('width=\"100%\" style=\"float: right; width: 230px; margin-left: 1em\"').sub('',html) # removing command box styling
    html = re.compile('<div class="docnav.*?</div>Wlinebreak</div>').sub('',html) # removing docnav
    html = re.compile('<div class="mw-pt-translate-header.*?</div>').sub('',html) # removing translations links
    if not GETTRANSLATIONS:
        html = re.compile('<div class="languages.*?</div>').sub('',html) # removing translations links
        html = re.compile('<div class="mw-pt-languages.*?</div>').sub('',html) # removing translations links
    html = re.compile('Wlinebreak').sub('\n',html) # restoring original linebreaks
    return html

def cleanlinks(html, pages=None):
    "cleans page links found in html"
    if not pages:
        pages = getlinks(html)
    for page in pages:
        if page in NORETRIEVE:
            output = 'href="' + URL + wikiindex + page + '"'
        else:
            output = 'href="' + page.replace("/","-") + '.html"'
        html = re.compile('href="[^"]+' + page + '"').sub(output,html)
        if "Command_Reference" in output:
            html = html.replace("Category:","")
            html = html.replace("&pagefrom=","+")
            html = html.replace("#mw-pages",".html")
            html = html.replace("/wiki/index.php?title=Command_Reference","Command_Reference")
    return html

def cleanimagelinks(html,links=None):
    "cleans image links in given html"
    if not links:
        links = getimagelinks(html)
    if links:
        for l in links:
            nl = re.findall('.*/(.*)',l)
            if nl:
                html = html.replace(l,nl[0])
            # fetchimage(l)
    return html

def fetchpage(page):
    "retrieves given page from the wiki"
    print " fetching: ",page
    failcount = 0
    while failcount < MAXFAIL:
        try:
            html = (urlopen(URL + wikiindex + page).read())
            return html
        except HTTPError:
            failcount += 1
    print 'Error: unable to fetch page ' + page

def fetchimage(imagelink):
    "retrieves given image from the wiki and saves it"
    if imagelink[0:5] == "File:":
        print "Skipping file page link"
        return
    filename = re.findall('.*/(.*)',imagelink)[0]
    if not exists(filename,image=True):
        failcount = 0
        while failcount < MAXFAIL:
            try:
                if VERBOSE: print " fetching " + filename
                data = (urlopen(URL + imagelink).read())
                path = local(filename,image=True)
                file = open(path,'wb')
                file.write(data)
                file.close()
            except:
                failcount += 1
            else:
                processed.append(filename)
                if VERBOSE: print " saving",local(filename,image=True)
                return
        print 'Error: unable to fetch file ' + filename
    else:
        if VERBOSE: print " skipping",filename

def local(page,image=False):
    "returns a local path for a given page/image"
    if image:
        return FOLDER + os.sep + page
    else:
        return FOLDER + os.sep + page + '.html'

def exists(page,image=False):
    "checks if given page/image already exists"
    path = local(page.replace("/","-"),image)
    if os.path.exists(path):
        return True
    return False

def webroot(url):
    "returns the scheme and host part of a URL"
    return re.findall('(https?://.*?)/',url)[0]

def output(html,page):
    "encapsulates raw html code into nice html body"
    title = page.replace("_"," ")
    header = "<html><head>"
    header += "<title>" + title + "</title>"
    header += '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
    header += "<link type='text/css' href='wiki.css' rel='stylesheet'>"
    header += "</head><body>"
    header += "<h1>" + title + "</h1>"
    footer = "</body></html>"
    html = header+html+footer
    filename = local(page.replace("/","-"))
    if "Command_Reference" in filename:
        filename = filename.replace("Category:","")
        filename = filename.replace("&pagefrom=","+")
        filename = filename.replace("#mw-pages","")
        filename = filename.replace(".html.html",".html")
    print " saving",filename
    file = open(filename,'wb')
    file.write(html)
    file.close()

if __name__ == "__main__":
    crawl()