Small fixes to wiki download scripts

Yorik van Havre 2012-05-13 14:56:24 -03:00
parent 5cbe7a4baa
commit af3eefbb5b
2 changed files with 41 additions and 7 deletions

First changed file:

@@ -42,6 +42,7 @@ NORETRIEVE = ['Manual','Developer_hub','Power_users_hub','Users_hub','Source_doc
 GETTRANSLATIONS = False # Set true if you want to get the translations too.
 MAXFAIL = 3 # max number of retries if download fails
 VERBOSE = True # to display what's going on. Otherwise, runs totally silent.
+WRITETHROUGH = True # if true, fetched files are constantly written to disk, in case of failure.
 # END CONFIGURATION ##############################################
@@ -52,8 +53,24 @@ def crawl():
     todolist = []
     processed = []
     count = 1
-    indexpages,imgs = get(INDEX)
-    todolist.extend(indexpages)
+    if os.path.exists("wikifiles.txt"):
+        f = open("wikifiles.txt","r")
+        if VERBOSE: print "Reading existing list..."
+        for l in f.readlines():
+            if l.strip() != "":
+                if VERBOSE: print "Adding ",l
+                processed.append(l.strip())
+        f.close()
+    if os.path.exists("todolist.txt"):
+        f = open("todolist.txt","r")
+        if VERBOSE: print "Reading existing todo list..."
+        for l in f.readlines():
+            if l.strip() != "":
+                todolist.append(l.strip())
+        f.close()
+    else:
+        indexpages,imgs = get(INDEX)
+        todolist.extend(indexpages)
     while todolist:
         targetpage = todolist.pop()
         if not targetpage in NORETRIEVE:
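The hunk above makes the crawler resumable: on startup it reloads the already-fetched pages from wikifiles.txt and the pending queue from todolist.txt, and only falls back to scanning the index when no saved todo list exists. A minimal standalone sketch of that load step, using the same file names as the diff (the load_list helper is hypothetical, not part of the script):

import os

def load_list(path):
    # Return the non-empty, stripped lines of path, or [] if it is missing.
    if not os.path.exists(path):
        return []
    f = open(path, "r")
    lines = [l.strip() for l in f.readlines() if l.strip() != ""]
    f.close()
    return lines

processed = load_list("wikifiles.txt")  # pages already fetched
todolist = load_list("todolist.txt")    # pages still queued; if empty, start from the index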
@@ -66,8 +83,12 @@ def crawl():
             for p in pages:
                 if (not (p in todolist)) and (not (p in processed)):
                     todolist.append(p)
+            if WRITETHROUGH:
+                writeList(processed)
+                writeList(todolist,"todolist.txt")
     if VERBOSE: print "Fetched ", count, " pages"
-    writeList(processed)
+    if not WRITETHROUGH:
+        writeList(processed)
     return 0

 def get(page):
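With WRITETHROUGH enabled, both lists are rewritten after every scanned page, so an interrupted run loses at most the page in flight; with it disabled, the list is written once at the end, as before. A rough sketch of that checkpoint loop, relying on the script's writeList and a hypothetical process_page stand-in for the fetch-and-scan step:

def crawl_with_checkpoints(todolist, processed):
    while todolist:
        page = todolist.pop()
        process_page(page)       # hypothetical: fetch the page, queue new links
        processed.append(page)
        # persist both lists so a crash can resume from this exact point
        writeList(processed)
        writeList(todolist, "todolist.txt")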
@@ -136,12 +157,22 @@ def fetchpage(page):
             failcount += 1
     print 'Error: unable to fetch page ' + page

-def writeList(pages):
-    f = open("wikifiles.txt","wb")
+def cleanList(pagelist):
+    "cleans the list"
+    npages = []
+    for p in pagelist:
+        if not p in npages:
+            if not "redlink" in p:
+                npages.append(p)
+    return npages
+
+def writeList(pages,filename="wikifiles.txt"):
+    pages = cleanList(pages)
+    f = open(filename,"wb")
     for p in pages:
         f.write(p+"\n")
     f.close()
-    if VERBOSE: print "written wikifiles.txt"
+    if VERBOSE: print "written ",filename

 if __name__ == "__main__":
     crawl()
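cleanList deduplicates with a linear "p in npages" scan, which is quadratic overall but fine for a wiki-sized list; "redlink" entries are dropped because MediaWiki uses redlink URLs for pages that do not exist yet. An equivalent order-preserving sketch that uses a set for O(1) membership tests (an alternative formulation, not what the commit does):

def clean_list(pages):
    seen = set()
    result = []
    for p in pages:
        # drop duplicates and "redlink" URLs (links to nonexistent pages)
        if p not in seen and "redlink" not in p:
            seen.add(p)
            result.append(p)
    return result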

Second changed file:

@@ -236,6 +236,9 @@ def fetchpage(page):

 def fetchimage(imagelink):
     "retrieves given image from the wiki and saves it"
+    if imagelink[0:5] == "File:":
+        print "Skipping file page link"
+        return
     filename = re.findall('.*/(.*)',imagelink)[0]
     print "saving",filename
     if not exists(filename,image=True):
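Links beginning with "File:" are MediaWiki file description pages rather than direct image URLs, so the filename regex below the new guard would extract the wrong thing for them. A quick check of what that regex does on a plausible direct image URL (the URL itself is made up for illustration):

import re

link = "http://example.org/wiki/images/a/a2/Example.jpg"  # hypothetical URL
# '.*/' greedily eats everything up to the last slash; the group keeps the rest
print(re.findall('.*/(.*)', link)[0])  # -> Example.jpg

page_link = "File:Example.jpg"
print(page_link[0:5] == "File:")       # -> True, so fetchimage now skips it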
@@ -263,7 +266,7 @@ def local(page,image=False):

 def exists(page,image=False):
     "checks if given page/image already exists"
-    path = local(page,image)
+    path = local(page.replace("/","-"),image)
     if os.path.exists(path): return True
     return False
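Wiki subpage names such as "Manual/Introduction" contain a slash, which local() would otherwise treat as a directory separator in the output path; flattening it to a dash makes exists() look for the same file the downloader presumably writes. A sketch of that mapping, assuming local() simply joins the page name onto an output folder (the folder name and .html suffix are assumptions, not taken from the diff):

import os

def local_path(page, folder="./localwiki"):
    # hypothetical simplification of local(): one flat file per page
    return os.path.join(folder, page.replace("/", "-") + ".html")

print(local_path("Manual/Introduction"))  # -> ./localwiki/Manual-Introduction.html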