From 3dcd3c8ca72246a675809635da82d6873cb374e2 Mon Sep 17 00:00:00 2001 From: Avram Lyon Date: Sun, 30 Jan 2011 14:18:59 +0000 Subject: [PATCH] Trans: Adding new TalisPrism translator by Emma Reisz and Will Smith. --- translators/TalisPrism.js | 434 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 434 insertions(+) create mode 100644 translators/TalisPrism.js diff --git a/translators/TalisPrism.js b/translators/TalisPrism.js new file mode 100644 index 000000000..4fbe0b08a --- /dev/null +++ b/translators/TalisPrism.js @@ -0,0 +1,434 @@ +{ + "translatorID":"53f8d182-4edc-4eab-b5a1-141698a20202", + "label":"TalisPrism", + "creator":"William Smith and Emma Reisz", + "target":"/TalisPrism/(browseResults|doSearch)", + "minVersion":"1.0.0b4.r5", + "maxVersion":"", + "priority":100, + "inRepository":"1", + "translatorType":4, + "lastUpdated":"2010-11-15 11:35:54" +} + +/* TalisPrism translator. + Version 1.1 + By William Smith (http://www.willsmith.org/contactme) + and Emma Reisz + +TalisPrism is a library management system used by a number of universities +and public bodies in the UK, Ireland and elsewhere. +For example: http://qu-prism.qub.ac.uk/TalisPrism/ +and http://http://star.shef.ac.uk/TalisPrism/ + +This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + +*/ + + +// TalisPrism doesn't use metadata so everything must be scraped. + +function detectWeb(doc, url){ + + /* Can't differentiate multiple from single results by URL + as single search results have a search URL but display as browse. + Can't scrape the titles to differentiate between single and multiple as the display format + is too different to be scraped consistently. + Instead we differentiate by URL but make an exception for a solo result. + */ + var search=searchTest(doc, url); + + if (search==1) { + var doctype = 'multiple'; + } else {doctype=docType(doc, url); + } + return doctype; +} + +function docType (doc,url){ + //Need xpaths to detect type. + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == "x" ) return namespace; else return null; + } : null; + + // Best way to identify item type on an entry page is by its icon. + if (getXPath(doc, '//img[@alt="sound - disc"]/@alt').length) { + doctype = 'audioRecording'; + } else if (getXPath(doc, '//img[@alt="Book"]/@alt').length) { + doctype = 'book'; + } else if (getXPath(doc, '//img[@alt="video - disc"]/@alt').length) { + doctype = 'videoRecording'; + } else { + doctype = 'document'; + } + return doctype; +} + + +function searchTest (doc, url){ + + //Need xpaths to differentiate search and item pages. + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == "x" ) return namespace; else return null; + } : null; + + var searchPage; + var search; + if (url.match(/doSearch/)) { + var resultCount; + var resultCountElements = new Array(); + var resultCountText; + var resultCountPath = '//table/tbody/tr/td/table/tbody/tr/td[1]/font/span[@class="text"]/font'; + var resultCountObject = doc.evaluate(resultCountPath, doc, nsResolver, XPathResult.ANY_TYPE, null); + while (resultCountText = resultCountObject.iterateNext()) { + resultCountElements.push(resultCountText.textContent); + } + resultCount=resultCountElements[0]; + if (resultCount == 1) { + search=0; + } else { + search=1; + } + } else { + var pageCount; + var pageCountElements = new Array(); + var pageCountText; + var pageCountPath= '//tbody/tr/td[2]/font/span[@class="text"]/table/tbody/tr[2]/td/font/span[@class="text"]/table/tbody/tr/td[4]'; + var pageCountObject = doc.evaluate(pageCountPath, doc, nsResolver, XPathResult.ANY_TYPE, null); + while (pageCountText = pageCountObject.iterateNext()) { + pageCountElements.push(pageCountText.textContent); + } + pageCount=pageCountElements[0]; + if (pageCount==undefined){ + search=0; + } else if (pageCount.match(/Page/)){ + search=1 + } else { + search=0; + } + } + return search; +} + +function getXPath ( doc, field ) { + xpath = field; + + content = doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null).iterateNext(); + + if (content) + return content.textContent; + else + return ''; + +} + +//TalisPrism displays with labels. The getField function searches for the next different field after a label. + +function getField (doc, field) { + + xpath='//span[@class="text"]'; + + content = doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null); + + while (c = content.iterateNext()) + { + if (c.textContent == field) + { + // OK, find the next field + while (val = content.iterateNext()) { + + if (val && val.textContent != c.textContent) + { + return val.textContent; + } + } + } + } + return ''; +} + +function multiscrape(doc, url) { + url=doc.documentURI; + var item; + var doctype = docType(doc, url); + item = new Zotero.Item(doctype); + scrape(doc,url, item); +} + + +function soloscrape(doc, url) { + url=doc.documentURI; + var item; + item = new Zotero.Item(doctype); + scrape(doc,url, item); + return ''; +} + + +function scrape(doc, url, item){ + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == 'x') return namespace; else return null; + } : null; + + // The fields often contain multiple data types and need some cleanup. + var title = getField(doc, 'Title'); + + if (title.length == 0) { + title = 'Unknown Title'; + } + // If title includes a forward slash, omit the last bit. + if (title.match('/')) { + title = title.substring(0, title.lastIndexOf('/')); + } + title = title.replace(/^\s+|\s+$/g, ''); + item.title = title; + + var author = getField(doc, 'Author'); + if (author.length) { + item.creators.push(Zotero.Utilities.cleanAuthor(author, "author", 1)); + } else { + author = getField(doc, 'Other Author(s) / Title(s)'); + if (author.length) { + item.creators.push(Zotero.Utilities.cleanAuthor(author, "author", 1)); + } + } + + + // Place, publisher and publish date are in the same field. Format is usually "Place : Publisher, yyyy". + + var publishing = getField(doc, 'Publisher'); + if (publishing.length == 0) { + publishing = getField(doc, 'Published'); + } + if (publishing.length == 0) { + publishing = getField(doc, 'Publication details'); + } + + if (publishing.match(/(13|14|15|16|17|18|19|20)\d\d/)) { + var pos = publishing.search(/(13|14|15|16|17|18|19|20)\d\d/); + item.date = publishing.substring(pos, publishing.lastIndexOf('.')).match(/\d\d\d\d/); + var place = publishing.substring(0, publishing.indexOf(':')); + item.place = place.replace(/^\s+|\s+$/g, ''); + var publisher = publishing.substring(publishing.indexOf(':')+1, pos); + item.publisher = publisher.replace(/^\s+|\s+$|\,\s+$/g, ''); + } + + + var isbn = getField(doc, 'ISBN'); + if (isbn.length == 0) { + isbn = getField(doc, 'ISBN, etc.'); + } + + isbn=isbn.replace(/^\D+|\D+$/g, ""); + item.ISBN = isbn.substring(0).match(/\d+/); + + var series = getField(doc, 'Series'); + var pos2 =series.lastIndexOf(';'); + if (pos2==-1){ + item.series=series.replace(/^\s+|\s+$/g, ''); + }else{ + var seriesName = series.substring(0, pos2); + item.series = seriesName.replace(/^\s+|\s+$/g, ''); + var seriesNumber = series.substring(pos2+1); + item.seriesNumber = seriesNumber.replace(/^\s+|\s+$/g, ''); + } + + item.edition = getField(doc, 'Edition'); + + var physical = getField(doc, 'Physical details'); + var numPages = physical.substring(0, physical.indexOf(':')); + item.numPages = numPages.replace(/^\s+|\s+$/g, ''); + + var physicaldetails = physical.substring(physical.indexOf(':')+1, physical.lastIndexOf('.')); + physicaldetails = physicaldetails.replace(/^\s+|\s+$/g, ''); + + var databasedetails = getField(doc, 'Cited/indexed in'); + databasedetails = databasedetails.replace(/^\s+|\s+$/g, ''); + + item.extra = databasedetails + physicaldetails + + item.attachments.push({url:url, title:"Snapshot of Library Page", mimeType:"text/html"}); + + var doctitle + doctitle = doc.title + if (doctitle == "TalisPrism"){ + item.libraryCatalog =url.substring(url.indexOf('http'), url.indexOf('/TalisPrism')); + } else { + item.libraryCatalog = doctitle + } + + + /* We need to XPath to the call number as we cannot be sure about the previous cell, + so the label method won't work. Some items have multiple call numbers, + but a generalised XPath which retrieves multiple sets of location data (tr[2], tr[3] etc.) + also retrieves tr [1], which contains all the rest of the bibliographic entry. + The size of tr[1] varies and there is no consistent final item, + so instead of using a general XPath, we scrape tr[2], tr[3] and tr[4] successively into an array; + tr[5] is also scraped into the array, but if non-null, 'See record for additional call numbers.' + is returned as the final shelfmark. Note that each call number is itself scraped into an + array ('shelfmarkElements'), as we need both the Library and Shelfmark elements. + */ + + var shelfmark = new Array(); + var callNumber = ""; + + //Need to test whether the search page has a sidebar showing as this shifts the classmarks. + + var authorModePath='//td/table/tbody/tr/td[1]/font/span[@class="text"]/table/tbody/tr[2]/td/font/span[@class="text"]/font/b/span[@class="text"]/table/tbody/tr/td[2]'; + var authorModeObject=doc.evaluate(authorModePath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + var browseModePath='//td/table/tbody/tr/td[1]/font/span[@class="text"]/table/tbody/tr/td[2]/font/span[@class="text"]/table/tbody/tr/td[1]'; + var browseModeObject=doc.evaluate(browseModePath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + var shelfmarkPath = new Array(); + shelfmarkPath[0] = '//td[2]/font/span[@class="text"]/table/tbody/tr/td/font/span[@class="text"]/table/tbody/tr[2]/td'; + shelfmarkPath[1] = '//td[2]/font/span[@class="text"]/table/tbody/tr/td/font/span[@class="text"]/table/tbody/tr[3]/td'; + shelfmarkPath[2] = '//td[2]/font/span[@class="text"]/table/tbody/tr/td/font/span[@class="text"]/table/tbody/tr[4]/td'; + shelfmarkPath[3] = '//td[2]/font/span[@class="text"]/table/tbody/tr/td/font/span[@class="text"]/table/tbody/tr[5]/td'; + var shelfmarkText; + if (authorModeObject==null||authorModeObject.innerHTML==null){ + if (browseModeObject==null||browseModeObject.innerHTML==null){ + for (var i=0; i < 4; i ++){ + var shelfmarkObject = new Array(); + var shelfmarkElements = new Array(); + shelfmarkObject[i] = doc.evaluate(shelfmarkPath[i], doc, nsResolver, XPathResult.ANY_TYPE, null); + while (shelfmarkText = shelfmarkObject[i].iterateNext()) { + shelfmarkElements.push(shelfmarkText.textContent); + } + shelfmark[i]=shelfmarkElements[0]+" "+shelfmarkElements[1]; + //Need to remove junk text scraped when there is a request button in the call number field. + shelfmark[i] = shelfmark[i].replace(/\s*\/*(?:xc_d.write.*\;)/, ''); + } + } else if (browseModeObject.innerHTML.match(/arrow/)) { + for (var i=0; i < 4; i ++){ + var shelfmarkObject = new Array(); + var shelfmarkElements = new Array(); + shelfmarkObject[i] = doc.evaluate(shelfmarkPath[i], doc, nsResolver, XPathResult.ANY_TYPE, null); + while (shelfmarkText = shelfmarkObject[i].iterateNext()) { + shelfmarkElements.push(shelfmarkText.textContent); + } + shelfmark[i]=shelfmarkElements[1]+" "+shelfmarkElements[2]; + shelfmark[i] = shelfmark[i].replace(/\s*\/*(?:xc_d.write.*\;)/, ''); + } + } + }else if (authorModeObject.innerHTML.match(/arrow/)){ + for (var i=0; i < 4; i ++){ + var shelfmarkObject = new Array(); + var shelfmarkElements = new Array(); + shelfmarkObject[i] = doc.evaluate(shelfmarkPath[i], doc, nsResolver, XPathResult.ANY_TYPE, null); + while (shelfmarkText = shelfmarkObject[i].iterateNext()) { + shelfmarkElements.push(shelfmarkText.textContent); + } + shelfmark[i]=shelfmarkElements[1]+" "+shelfmarkElements[2]; + shelfmark[i] = shelfmark[i].replace(/\s*\/*(?:xc_d.write.*\;)/, ''); + } + } + if (shelfmark[0] != "undefined undefined"){ + callNumber = shelfmark[0]; + } + for (var i=1; i<3; i++){ + if (shelfmark[i] != "undefined undefined"){ + callNumber = callNumber + "; " + shelfmark[i]; + } + } + if (shelfmark[3] != "undefined undefined"){ + callNumber = callNumber + ". See record for additional call numbers."; + } + + item.callNumber = callNumber; + + var link = getField (doc, 'Link to'); + if (link.length == 0) { + var linkPath='//span[@class="text"]/table/tbody/tr/td/table/tbody/tr/td[2]/font/span[@class="text"]/table/tbody/tr/td/font/span[@class="text"]/table/tbody/tr/td[2]/font/span[@class="text"]/a'; + var linkObject=doc.evaluate(linkPath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + if (linkObject==null){ + } else { + var linkTitle=linkObject.textContent; + var linkLink=linkObject.href; + if (linkTitle=="Link to electronic text"){ + link=linkLink; + } + } + } + item.url = link; + + item.complete(); + return ''; +} + +function doWeb(doc, url) { + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == "x" ) return namespace; else return null; + } : null; + + var articles = new Array (); + var names = new Array (); + var items = new Object (); + var nextTitle; + doctype=detectWeb(doc, url); + + /* Typically scrapers process both search pages and item pages in the same way; + the processDocuments function is used, calling the scraped result link URLs for a search page, + and for an item page calling the item page's own URL. + But Talis displays solo search results with an unstable URL and with no link to an item page. + So we cannot call the URL for a solo search result as it will yield a null page. + Instead we must process solo search results directly without using processDocuments. + We want to process item pages in the same way as solo search pages because + waiting for the URL on an item page to be called noticeably slows down the scrape. + */ + + var indexPath ='//span[@class="text"]/x:table/x:tbody/x:tr/x:td/x:table/x:tbody/x:tr/x:td[1]' + var index; + var indexElements = new Array(); + var indexText; + var indexObject = doc.evaluate(indexPath, doc, nsResolver, XPathResult.ANY_TYPE, null); + while (indexText = indexObject.iterateNext()) { + indexElements.push(indexText.textContent); + } + index=indexElements[0]; + index1=indexElements[1]; + if (doctype == "multiple" && index.match(/Index/) && index1 == ""){ + var titlePath = '//td[3]/font/span[@class="text"]/table/tbody/tr/td/font/span[@class="text"]/table/tbody/tr/td[1]/font/span[@class="text"]/a'; + var titles = doc.evaluate(titlePath, doc, nsResolver, XPathResult.ANY_TYPE, null); + while (nextTitle = titles.iterateNext()) { + items[nextTitle.href] = nextTitle.textContent; + names.push(nextTitle.textContent); + } + items = Zotero.selectItems(items); + for (var i in items) { + articles.push(i); + } + Zotero.Utilities.processDocuments(articles, multiscrape, function(){Zotero.done();}); + + } else if (doctype == "multiple") { + var titlePath = '//td[4]/font/span[@class="text"]/table/tbody/tr/td/font/span[@class="text"]/table/tbody/tr/td[1]/font/span[@class="text"]/a'; + var titles = doc.evaluate(titlePath, doc, nsResolver, XPathResult.ANY_TYPE, null); + while (nextTitle = titles.iterateNext()) { + items[nextTitle.href] = nextTitle.textContent; + names.push(nextTitle.textContent); + } + items = Zotero.selectItems(items); + for (var i in items) { + articles.push(i); + } + Zotero.Utilities.processDocuments(articles, multiscrape, function(){Zotero.done();}); + } + else { + soloscrape(doc, url); + } + Zotero.wait(); + +} \ No newline at end of file