From 19e70b81b5d8b2ea2a86eaec2d451edece413782 Mon Sep 17 00:00:00 2001 From: Dan Stillman Date: Thu, 29 Apr 2010 21:54:01 +0000 Subject: [PATCH] Update Internet Archive translator to work with new bucket scheme --- translators/Internet Archive.js | 200 +++++++++++++++++++------------- 1 file changed, 118 insertions(+), 82 deletions(-) diff --git a/translators/Internet Archive.js b/translators/Internet Archive.js index 58ec1d44f..3884278d8 100644 --- a/translators/Internet Archive.js +++ b/translators/Internet Archive.js @@ -8,26 +8,13 @@ "maxVersion":"", "priority":100, "inRepository":true, - "lastUpdated":"2008-07-24 05:30:00" + "lastUpdated":"2010-04-29 21:53:40" } function detectWeb(doc, url) { var mediaType = "1"; - - // iterate through links under item/bucket name to check for zoterocommons (the collection name) - var links = doc.evaluate('//div/p/span/a', doc, null, XPathResult.ANY_TYPE, null); - var link = null; - while (next_link = links.iterateNext()) { - link = next_link.textContent; - if (link.match(/zoterocommons/)) { - mediaType = "commons"; - Zotero.debug("IA TRANS: scraping commons"); - } - } - if (mediaType == "commons") return "commons"; - - else if (doc.evaluate('//h3', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) { + if (doc.evaluate('//h3', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) { mediaType = doc.evaluate('//h3', doc, null, XPathResult.ANY_TYPE, null).iterateNext().textContent; } else if (doc.evaluate('//div[@class="box"][@id="spotlight"]/h1', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) { @@ -58,6 +45,10 @@ function associateData (newItem, dataTags, field, zoteroField) { } } +var detailsURL = 'http://www.archive.org/details'; +var downloadURL = 'http://www.archive.org/download'; +var apiURL = 'http://s3.us.archive.org'; + function scrape(doc, url) { var namespace = doc.documentElement.namespaceURI; @@ -233,83 +224,125 @@ function scrape(doc, url) { } -function processRDFs(doc, url, articles) { - var namespace = doc.documentElement.namespaceURI; - var nsResolver = namespace ? function(prefix) { - if (prefix == 'x') return namespace; else return null; - } : null; - - var httpLink = doc.evaluate('//a[text()="HTTP"]/@href', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent; +function processBuckets(doc, url, ids) { + var httpLink = doc.evaluate('//a[text()="HTTP"]/@href', doc, null, XPathResult.ANY_TYPE, null).iterateNext().textContent; - for (var i=0 ; i/, '')); + var files = xml.file; + + var attachments = []; + var titles = []; + // loop through files listed in bucket contents file + for each(var f in files) { + var fileName = f.@name.toString(); + + // Skip derivative files other than OCRed PDFs + if (f.@source.toString() != 'original' && !fileName.match(/_text\.pdf$/)) { + Zotero.debug("Skipping " + fileName); + continue; + } + + // Skip default files + if (fileName.indexOf(id) == 0) { + continue; + } + + // TEMP -- shouldn't be necessary after IA changes + if (fileName.match(/\.zip(_meta\.txt)?$/)) { + Zotero.debug("Skipping " + fileName); + continue; + } + + var title = f.title.toString(); + if (!title) { + title = fileName; + } + + attachments.push(fileName); + titles.push(title); + } + + for (var i=0; i