diff --git a/scrapers.sql b/scrapers.sql index c3efc28b2..b6b78d34c 100644 --- a/scrapers.sql +++ b/scrapers.sql @@ -1,4 +1,4 @@ --- 115 +-- 116 -- ***** BEGIN LICENSE BLOCK ***** -- @@ -22,7 +22,7 @@ -- Set the following timestamp to the most recent scraper update date -REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-11-27 12:00:00')); +REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-12-06 23:30:00')); REPLACE INTO translators VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '1.0.0b3.r1', '', '2006-11-26 09:05:00', 1, 100, 4, 'Amazon', 'Sean Takats', '^http://(?:www\.)amazon', 'function detectWeb(doc, url) { @@ -1007,21 +1007,44 @@ function doWeb(doc, url) { Zotero.wait(); }'); -REPLACE INTO translators VALUES ('add7c71c-21f3-ee14-d188-caf9da12728b', '1.0.0b3.r1', '', '2006-10-25 18:40:43', 1, 100, 4, 'SIRSI 2003+', 'Simon Kornblith', '/uhtbin/cgisirsi', +REPLACE INTO translators VALUES ('add7c71c-21f3-ee14-d188-caf9da12728b', '1.0.0b3.r1', '', '2006-12-06 23:30:00', 1, 100, 4, 'SIRSI', 'Sean Takats', '/uhtbin/cgisirsi', 'function detectWeb(doc, url) { var namespace = doc.documentElement.namespaceURI; var nsResolver = namespace ? function(prefix) { if (prefix == ''x'') return namespace; else return null; } : null; + var xpath = ''//tr[th[@class="viewmarctags"]][td[@class="viewmarctags"]]''; if(doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) { + Zotero.Utilities.debug("SIRSI detectWeb: viewmarctags"); return "book"; } + var xpath = ''//input[@name="VOPTIONS"]''; + if(doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) { + Zotero.Utilities.debug("SIRSI detectWeb: VOPTIONS"); + return "book"; + } + var elmts = doc.evaluate(''/html/body/form//text()'', doc, nsResolver, + XPathResult.ANY_TYPE, null); + while(elmt = elmts.iterateNext()) { + if(Zotero.Utilities.superCleanString(elmt.nodeValue) == "Viewing record") { + Zotero.Utilities.debug("SIRSI detectWeb: Viewing record"); + return "book"; + } + } + var xpath = ''//td[@class="searchsum"]/table''; if(doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) { + Zotero.Utilities.debug("SIRSI detectWeb: searchsum"); return "multiple"; } + var xpath = ''//form[@name="hitlist"]/table/tbody/tr''; + if(doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) { + Zotero.Utilities.debug("SIRSI detectWeb: hitlist"); + return "multiple"; + } + // var xpath = ''//input[@type="checkbox"]'' }', 'function scrape(doc) { var namespace = doc.documentElement.namespaceURI; @@ -1102,51 +1125,176 @@ REPLACE INTO translators VALUES ('add7c71c-21f3-ee14-d188-caf9da12728b', '1.0.0b return true; } -function doWeb(doc, url) { +function doWeb(doc, url){ var namespace = doc.documentElement.namespaceURI; var nsResolver = namespace ? function(prefix) { if (prefix == ''x'') return namespace; else return null; } : null; - - if(!scrape(doc)) { - var checkboxes = new Array(); - var urls = new Array(); - var availableItems = new Array(); - - var tableRows = doc.evaluate(''//td[@class="searchsum"]/table[//input[@value="Details"]]'', doc, nsResolver, XPathResult.ANY_TYPE, null); - var tableRow = tableRows.iterateNext(); // skip first row - // Go through table rows - while(tableRow = tableRows.iterateNext()) { - var input = doc.evaluate(''.//input[@value="Details"]'', tableRow, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); - var text = Zotero.Utilities.getNodeString(doc, tableRow, ''.//label/strong//text()'', nsResolver); - if(text) { - availableItems[input.name] = text; + + var sirsiNew = true; //toggle between SIRSI -2003 and SIRSI 2003+ + var xpath = ''//td[@class="searchsum"]/table''; + if(doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) { + Zotero.Utilities.debug("SIRSI doWeb: searchsum"); + sirsiNew = true; + } else if (doc.evaluate(''//form[@name="hitlist"]/table/tbody/tr'', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) { + Zotero.Utilities.debug("SIRSI doWeb: hitlist"); + sirsiNew = false; + } else if (doc.evaluate(''//tr[th[@class="viewmarctags"]][td[@class="viewmarctags"]]'', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) { + Zotero.Utilities.debug("SIRSI doWeb: viewmarctags"); + sirsiNew = true; + } else if (doc.evaluate(''//input[@name="VOPTIONS"]'', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) { + Zotero.Utilities.debug("SIRSI doWeb: VOPTIONS"); + sirsiNew = false; + } else { + var elmts = doc.evaluate(''/html/body/form//text()'', doc, nsResolver, + XPathResult.ANY_TYPE, null); + while(elmt = elmts.iterateNext()) { + if(Zotero.Utilities.superCleanString(elmt.nodeValue) == "Viewing record") { + Zotero.Utilities.debug("SIRSI doWeb: Viewing record"); + sirsiNew = false; } } - - var items = Zotero.selectItems(availableItems); - - if(!items) { - return true; + } + + if (sirsiNew) { //executes Simon''s SIRSI 2003+ scraper code + Zotero.Utilities.debug("Running SIRSI 2003+ code"); + if(!scrape(doc)) { + var checkboxes = new Array(); + var urls = new Array(); + var availableItems = new Array(); + var tableRows = doc.evaluate(''//td[@class="searchsum"]/table[//input[@value="Details"]]'', doc, nsResolver, XPathResult.ANY_TYPE, null); + var tableRow = tableRows.iterateNext(); // skip first row + // Go through table rows + while(tableRow = tableRows.iterateNext()) { + var input = doc.evaluate(''.//input[@value="Details"]'', tableRow, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + var text = Zotero.Utilities.getNodeString(doc, tableRow, ''.//label/strong//text()'', nsResolver); + if(text) { + availableItems[input.name] = text; + } + } + var items = Zotero.selectItems(availableItems); + if(!items) { + return true; + } + var hostRe = new RegExp("^http(?:s)?://[^/]+"); + var m = hostRe.exec(doc.location.href); + Zotero.Utilities.debug("href: " + doc.location.href); + var hitlist = doc.forms.namedItem("hitlist"); + var baseUrl = m[0]+hitlist.getAttribute("action")+"?first_hit="+hitlist.elements.namedItem("first_hit").value+"&last_hit="+hitlist.elements.namedItem("last_hit").value; + var uris = new Array(); + for(var i in items) { + uris.push(baseUrl+"&"+i+"=Details"); + } + Zotero.Utilities.processDocuments(uris, function(doc) { scrape(doc) }, + function() { Zotero.done() }, null); + Zotero.wait(); + } + } else{ //executes Simon''s SIRSI -2003 translator code + Zotero.Utilities.debug("Running SIRSI 2003+ code"); + var uri = doc.location.href; + var recNumbers = new Array(); + var xpath = ''//form[@name="hitlist"]/table/tbody/tr''; + var elmts = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null); + var elmt = elmts.iterateNext(); + if(elmt) { // Search results page + var uriRegexp = /^http:\/\/[^\/]+/; + var m = uriRegexp.exec(uri); + var postAction = doc.forms.namedItem("hitlist").getAttribute("action"); + var newUri = m[0]+postAction.substr(0, postAction.length-1)+"40"; + var titleRe = /
\s*(.*[^\s])\s*
/i; + var items = new Array(); + do { + var checkbox = doc.evaluate(''.//input[@type="checkbox"]'', elmt, nsResolver, + XPathResult.ANY_TYPE, null).iterateNext(); + // Collect title + var title = Zotero.Utilities.getNodeString(doc, elmt, "./td[2]/text()", nsResolver); + if(checkbox && title) { + items[checkbox.name] = Zotero.Utilities.cleanString(title); + } + } while(elmt = elmts.iterateNext()); + items = Zotero.selectItems(items); + + if(!items) { + return true; + } + + for(var i in items) { + recNumbers.push(i); + } + } else { // Normal page + // this regex will fail about 1/100,000,000 tries + var uriRegexp = /^((.*?)\/([0-9]+?))\//; + var m = uriRegexp.exec(uri); + var newUri = m[1]+"/40" + + var elmts = doc.evaluate(''/html/body/form'', doc, nsResolver, + XPathResult.ANY_TYPE, null); + while(elmt = elmts.iterateNext()) { + var initialText = doc.evaluate(''.//text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + if(initialText && initialText.nodeValue && Zotero.Utilities.superCleanString(initialText.nodeValue) == "Viewing record") { + recNumbers.push(doc.evaluate(''./b[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue); + break; + } + } + // begin Emory compatibility + var elmts = doc.evaluate(''//input[@name="first_hit"]'', doc, nsResolver, + XPathResult.ANY_TYPE, null); + while (elmt = elmts.iterateNext()) { + recNumbers.length = 0; + var recNumber = doc.evaluate(''./@value'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue + recNumbers.push(recNumber); + break; + } + // end Emory compatibility } - - var hostRe = new RegExp("^http://[^/]+"); - var m = hostRe.exec(doc.location.href); - var hitlist = doc.forms.namedItem("hitlist"); - var baseUrl = m[0]+hitlist.getAttribute("action")+"?first_hit="+hitlist.elements.namedItem("first_hit").value+"&last_hit="+hitlist.elements.namedItem("last_hit").value; - - var uris = new Array(); - for(var i in items) { - uris.push(baseUrl+"&"+i+"=Details"); - } - - Zotero.Utilities.processDocuments(uris, function(doc) { scrape(doc) }, - function() { Zotero.done() }, null); - - Zotero.wait(); + var translator = Zotero.loadTranslator("import"); + translator.setTranslator("a6ee60df-1ddc-4aae-bb25-45e0537be973"); + var marc = translator.getTranslatorObject(); + Zotero.Utilities.loadDocument(newUri+''?marks=''+recNumbers.join(",")+''&shadow=NO&format=FLAT+ASCII&sort=TITLE&vopt_elst=ALL&library=ALL&display_rule=ASCENDING&duedate_code=l&holdcount_code=t&DOWNLOAD_x=22&DOWNLOAD_y=12&address=&form_type='', function(doc) { + var pre = doc.getElementsByTagName("pre"); + var text = pre[0].textContent; + var documents = text.split("*** DOCUMENT BOUNDARY ***"); + for(var j=1; j\s*(.*[^\s])\s*
/i; - - var items = new Array(); - - do { - var checkbox = doc.evaluate(''.//input[@type="checkbox"]'', elmt, nsResolver, - XPathResult.ANY_TYPE, null).iterateNext(); - // Collect title - var title = Zotero.Utilities.getNodeString(doc, elmt, "./td[2]/text()", nsResolver); - - if(checkbox && title) { - items[checkbox.name] = Zotero.Utilities.cleanString(title); - } - } while(elmt = elmts.iterateNext()); - - - items = Zotero.selectItems(items); - - if(!items) { - return true; - } - - for(var i in items) { - recNumbers.push(i); - } - } else { // Normal page - var uriRegexp = /^(.*)(\/[0-9]+)$/; - var m = uriRegexp.exec(uri); - var newUri = m[1]+"/40" - - var elmts = doc.evaluate(''/html/body/form/p'', doc, nsResolver, - XPathResult.ANY_TYPE, null); - while(elmt = elmts.iterateNext()) { - var initialText = doc.evaluate(''./text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); - if(initialText && initialText.nodeValue && Zotero.Utilities.superCleanString(initialText.nodeValue) == "Viewing record") { - recNumbers.push(doc.evaluate(''./b[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue); - break; - } - } - } - - var translator = Zotero.loadTranslator("import"); - translator.setTranslator("a6ee60df-1ddc-4aae-bb25-45e0537be973"); - var marc = translator.getTranslatorObject(); - - Zotero.Utilities.loadDocument(newUri+''?marks=''+recNumbers.join(",")+''&shadow=NO&format=FLAT+ASCII&sort=TITLE&vopt_elst=ALL&library=ALL&display_rule=ASCENDING&duedate_code=l&holdcount_code=t&DOWNLOAD_x=22&DOWNLOAD_y=12&address=&form_type='', function(doc) { - var pre = doc.getElementsByTagName("pre"); - var text = pre[0].textContent; - - var documents = text.split("*** DOCUMENT BOUNDARY ***"); - - for(var j=1; j