From 6626eba844e847a2490b7508973d1a3abbf68e4e Mon Sep 17 00:00:00 2001 From: Simon Kornblith Date: Mon, 7 Aug 2006 05:15:30 +0000 Subject: [PATCH] addresses #83, figure out how to implement OpenURL OpenURL lookup now works for books. this means that all that's necessary to add scrapable book metadata to a page is an ISBN, as shown below: also, we can now scrape Open WorldCat and Wikipedia Book Sources pages with no specialized code involved. i'm still looking for a better way of looking up journal article metadata. it's currently implemented with CrossRef, but CrossRef simply will not work without a DOI, and is also incomplete (only holds the last name of the first author). --- .../content/scholar/xpcom/ingester.js | 296 +++++++++++++++++- .../content/scholar/xpcom/utilities.js | 8 + scrapers.sql | 220 +++++-------- 3 files changed, 390 insertions(+), 134 deletions(-) diff --git a/chrome/chromeFiles/content/scholar/xpcom/ingester.js b/chrome/chromeFiles/content/scholar/xpcom/ingester.js index b3f922fe5..fdebecc8f 100644 --- a/chrome/chromeFiles/content/scholar/xpcom/ingester.js +++ b/chrome/chromeFiles/content/scholar/xpcom/ingester.js @@ -147,6 +147,8 @@ Scholar.OpenURL = new function() { this.resolve = resolve; this.discoverResolvers = discoverResolvers; this.createContextObject = createContextObject; + this.parseContextObject = parseContextObject; + this.lookupContextObject = lookupContextObject; /* * Returns a URL to look up an item in the OpenURL resolver @@ -224,7 +226,7 @@ Scholar.OpenURL = new function() { co += "&id="+escape(identifier); } } else { - var co = "ctx_ver=Z39.88-2004"; + var co = "url_ver=Z39.88-2004&ctx_ver=Z39.88-2004"; for each(identifier in identifiers) { co += "&rft_id="+escape(identifier); @@ -300,6 +302,298 @@ Scholar.OpenURL = new function() { return co; } + /* + * Generates an item in the format returned by item.fromArray() given an + * OpenURL version 1.0 contextObject + */ + function parseContextObject(co) { + var coParts = co.split("&"); + + var item = new Array(); + item.creators = new Array(); + + // get type + item.itemType = _determineResourceType(coParts); + if(!item.itemType) { + return false; + } + + var pagesKey = ""; + + for each(part in coParts) { + var keyVal = part.split("="); + var key = keyVal[0]; + var value = unescape(keyVal[1].replace(/\+|%2[bB]/g, " ")); + if(!value) { + continue; + } + + if(key == "rft_id") { + var firstEight = value.substr(0, 8).toLowerCase(); + if(firstEight == "info:doi") { + item.DOI = value; + } else if(firstEight == "urn:isbn") { + item.ISBN = value.substr(9); + } + } else if(key == "rft.btitle") { + if(item.itemType == "book") { + item.title = value; + } else if(item.itemType == "bookSection") { + item.publicationTitle = value; + } + } else if(key == "rft.atitle" && item.itemType != "book") { + item.title = value; + } else if(key == "rft.jtitle" && item.itemType == "journal") { + item.publcation = value; + } else if(key == "rft.stitle" && item.itemType == "journal") { + item.journalAbbreviation = value; + } else if(key == "rft.date") { + item.date = value; + } else if(key == "rft.volume") { + item.volume = value; + } else if(key == "rft.issue") { + item.issue = value; + } else if(key == "rft.pages") { + pagesKey = key; + item.pages = value; + } else if(key == "rft.spage") { + if(pagesKey != "rft.pages") { + pagesKey = key; + // make pages look like start-end + if(pagesKey == "rft.epage") { + if(value != item.pages) { + item.pages = value+"-"+item.pages; + } + } else { + item.pages = value; + } + } + } else if(key == "rft.epage") { + if(pagesKey != "rft.pages") { + pagesKey = key; + // make pages look like start-end + if(pagesKey == "rft.spage") { + if(value != item.pages) { + item.pages = +item.pages+"-"+value; + } + } else { + item.pages = value; + } + } + } else if(key == "issn" || (key == "eissn" && !item.ISSN)) { + item.ISSN = value; + } else if(key == "rft.aulast") { + var lastCreator = item.creators[item.creators.length-1]; + if(item.creators.length && !lastCreator.lastName && !lastCreator.institutional) { + lastCreator.lastName = value; + } else { + item.creators.push({lastName:value}); + } + } else if(key == "rft.aufirst") { + var lastCreator = item.creators[item.creators.length-1]; + if(item.creators.length && !lastCreator.firstName && !lastCreator.institutional) { + lastCreator.firstName = value; + } else { + item.creators.push({firstName:value}); + } + } else if(key == "rft.au") { + item.creators.push(Scholar.cleanAuthor(value, "author", true)); + } else if(key == "rft.aucorp") { + item.creators.push({lastName:value, institutional:true}); + } else if(key == "rft.isbn" && !item.ISBN) { + item.ISBN = value; + } else if(key == "rft.pub") { + item.publisher = value; + } else if(key == "rft.place") { + item.place = value; + } else if(key == "rft.edition") { + item.edition = value; + } else if(key == "rft.series") { + item.seriesTitle = value; + } + } + + return item; + } + + /* + * Looks up additional information on an item in the format returned by + * item.fromArray() in CrossRef or Open WorldCat given an OpenURL version + * 1.0 contextObject + */ + function lookupContextObject(co, done, error) { + // CrossRef requires a url_ver to work right + if(co.indexOf("url_ver=Z39.88-2004") == -1) { + co = "url_ver=Z39.88-2004&"+co; + } + + var type = _determineResourceType(co.split("&")); + if(!type) { + return false; + } + + if(type == "journal") { + // look up journals in CrossRef + Scholar.Utilities.HTTP.doGet("http://www.crossref.org/openurl/?"+co+"&noredirect=true", null, function(req) { + var items = _processCrossRef(req.responseText); + done(items); + }); + } else { + // look up books in Open WorldCat + Scholar.Utilities.HTTP.processDocuments(null, ["http://partneraccess.oclc.org/wcpa/servlet/OpenUrl?"+co], function(browser) { + var doc = browser.contentDocument; + // find new COinS in the Open WorldCat page + items = _processOWC(doc); + + if(items) { // we got a single item page; return the item + done(items); + } else { // assume we have a search results page + var items = new Array(); + + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == 'x') return namespace; else return null; + } : null; + + // first try to get only books + var elmts = doc.evaluate('//table[@class="tableLayout"]/tbody/tr/td[@class="content"]/table[@class="tableResults"]/tbody/tr[td/img[@alt="Book"]]/td/div[@class="title"]/a', doc, nsResolver, Components.interfaces.nsIDOMXPathResult.ANY_TYPE,null); + var elmt = elmts.iterateNext(); + if(!elmt) { // if that fails, look for other options + var elmts = doc.evaluate('//table[@class="tableLayout"]/tbody/tr/td[@class="content"]/table[@class="tableResults"]/tbody/tr[td/img[@alt="Book"]]/td/div[@class="title"]/a', doc, nsResolver, Components.interfaces.nsIDOMXPathResult.ANY_TYPE,null); + elmt = elmts.iterateNext() + } + + var urlsToProcess = new Array(); + do { + urlsToProcess.push(elmt.href); + } while(elmt = elmts.iterateNext()); + + Scholar.Utilities.HTTP.processDocuments(null, urlsToProcess, function(browser) { + // per URL + var newItems = _processOWC(browser.contentDocument); + if(newItems) { + items = items.concat(newItems); + } + }, function() { // done + done(items); + }, function() { // error + error(); + }); + } + }, null, function() { + error(); + }); + } + } + + /* + * Processes the XML format returned by CrossRef + */ + function _processCrossRef(xmlOutput) { + xmlOutput = xmlOutput.replace(/<\?xml[^>]*\?>/, ""); + + // parse XML with E4X + var qr = new Namespace("http://www.crossref.org/qrschema/2.0"); + try { + var xml = new XML(xmlOutput); + } catch(e) { + return false; + } + + // ensure status is valid + var status = xml.qr::body.qr::query.@status.toString(); + if(status != "resolved" && status != "multiresolved") { + return false; + } + + var query = xml.qr::body.qr::query; + var item = new Array(); + item.creators = new Array(); + + // try to get a DOI + item.DOI = query.qr::doi.(@type=="journal_article").toString(); + if(!item.DOI) { + item.DOI = query.qr::doi.(@type=="book_title").toString(); + } + if(!item.DOI) { + item.DOI = query.qr::doi.(@type=="book_content").toString(); + } + + // try to get an ISSN (no print/electronic preferences) + item.ISSN = query.qr::issn.toString(); + // get title + item.title = query.qr::article_title.toString(); + // get publicationTitle + item.publicationTitle = query.qr::journal_title.toString(); + // get author + item.creators.push(Scholar.Utilities.cleanAuthor(query.qr::author.toString(), "author", true)); + // get volume + item.volume = query.qr::volume.toString(); + // get issue + item.issue = query.qr::issue.toString(); + // get year + item.date = query.qr::year.toString(); + // get edition + item.edition = query.qr::edition_number.toString(); + // get first page + item.pages = query.qr::first_page.toString(); + + return [item]; + } + + /* + * Parses a document object referring to an Open WorldCat entry for its + * OpenURL contextObject, then returns an item generated from this + * contextObject + */ + function _processOWC(doc) { + var spanTags = doc.getElementsByTagName("span"); + for(var i=0; i 1) { + var selectArray = new Array(); + + for(var i in newItems) { + selectArray[i] = newItems.title; + } + selectArray = Scholar.selectItems(selectArray); + for(var i in selectArray) { + addAsItem(newItems[i]); + } + } else if(newItems.length) { + addAsItem(newItems[0]); } - - return item; } function doWeb(doc, url) { var newItems = new Array(); + var needFullItems = new Array(); var spanTags = doc.getElementsByTagName("span"); @@ -2486,28 +2438,30 @@ function doWeb(doc, url) { var spanClasses = spanClass.split(" "); if(Scholar.Utilities.inArray("Z3988", spanClasses)) { var spanTitle = spanTags[i].getAttribute("title"); - if(spanTitle.indexOf("rft_val_fmt=info:ofi/fmt:kev:mtx:journal") != -1 - || spanTitle.indexOf("rft_val_fmt=info:ofi/fmt:kev:mtx:book") != -1) { - newItems.push(parseContextObject(spanTitle)); + var newItem = Scholar.Utilities.parseContextObject(spanTitle); + if(newItem) { + if(newItem.title && newItem.creators.length) { + // title and creators are minimum data to avoid looking up + newItems.push(newItem); + } else { + // retrieve full item + newItem.contextObject = spanTitle; + needFullItems.push(newItem); + } } } } } - if(newItems.length > 1) { - var selectArray = new Array(); - - for(var i in newItems) { - selectArray[i] = newItems.title; - } - selectArray = Scholar.selectItems(selectArray); - for(var i in selectArray) { - newItems[i].complete(); - } + if(needFullItems.length) { + // retrieve full items asynchronously + Scholar.wait(); + retrieveNextCOinS(needFullItems, newItems); } else { - newItems[0].complete(); + completeCOinS(newItems); } }'); + REPLACE INTO "translators" VALUES ('3e684d82-73a3-9a34-095f-19b112d88bbf', '2006-06-26 16:01:00', 4, 'Google Books Scraper', 'Simon Kornblith', '^http://books\.google\.com/books\?(.*vid=.*\&id=.*|.*q=.*)', 'function detect(doc, url) { var re = new RegExp(''^http://books\\.google\\.com/books\\?vid=([^&]+).*\\&id=([^&]+)'', ''i'');