diff --git a/scrapers.sql b/scrapers.sql
index df4efde6e..b0116de3e 100644
--- a/scrapers.sql
+++ b/scrapers.sql
@@ -1,4 +1,4 @@
--- 165
+-- 166
 
 -- ***** BEGIN LICENSE BLOCK *****
 -- 
@@ -22,7 +22,7 @@
 
 
 -- Set the following timestamp to the most recent scraper update date
-REPLACE INTO version VALUES ('repository', STRFTIME('%s', '2007-01-20 00:20:00'));
+REPLACE INTO version VALUES ('repository', STRFTIME('%s', '2007-01-23 23:15:00'));
 
 REPLACE INTO translators VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '1.0.0b3.r1', '', '2006-12-15 03:40:00', 1, 100, 4, 'Amazon.com', 'Sean Takats', '^https?://(?:www\.)?amazon',
 'function detectWeb(doc, url) {
@@ -6288,6 +6288,197 @@ function doWeb(doc, url) {
 	Zotero.wait();
 }');
 
+
+-- Sudoc (www.sudoc.abes.fr) web translator. The first code string defines
+-- detectWeb; the second defines scrape and doWeb.
+REPLACE INTO translators VALUES ('1b9ed730-69c7-40b0-8a06-517a89a3a278', '1.0.0b3.r1', '', '2007-01-23 23:15:00', 0, 100, 4, 'Sudoc', 'Sean Takats', '^http://www\.sudoc\.abes\.fr',
+'function detectWeb(doc, url) {
+	// Returns "multiple" on a result list, an item type on a record page,
+	// and nothing on any other page.
+	var namespace = doc.documentElement.namespaceURI;
+	var nsResolver = namespace ? function(prefix) {
+		if (prefix == ''x'') return namespace; else return null;
+	} : null;
+
+	var xpath = ''//table/tbody/tr/td[1][@class="preslabel"]/strong'';
+	var multxpath = ''//a[@id="InitialFocusPoint"]'';
+
+	if (doc.evaluate(multxpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) {
+		return "multiple";
+	}
+
+	var elt = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+	if (!elt) {
+		return;
+	}
+
+	// "." stands in for the accented e of the French label so the pattern
+	// stays ASCII-only; "Num?ro" made the m optional and never matched it.
+	var numRegexp = /(Num.ro.*de.*notice|Record.*number)/;
+	if (!numRegexp.test(elt.textContent)) {
+		return;
+	}
+
+	// This is a record page; the icon shown with the record gives its type.
+	var imgXpath = ''/html/body/table/tbody/tr/td[1]/p/img/@src'';
+	var imgNode = doc.evaluate(imgXpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+	var imgsrc = imgNode ? imgNode.nodeValue : "";
+	if (imgsrc.indexOf("icon_thesis.gif") != -1) {
+		return "thesis";
+	}
+	if (imgsrc.indexOf("icon_art.gif") != -1) {
+		return "journalArticle";
+	}
+	// icon_per.gif, icon_books.gif, and unknown or missing icons
+	return "book";
+}',
+'function scrape(doc) {
+	// Builds one item from a record page and saves it with complete().
+	var namespace = doc.documentElement.namespaceURI;
+	var nsResolver = namespace ? function(prefix) {
+		if (prefix == ''x'') return namespace; else return null;
+	} : null;
+
+	var newItem = new Zotero.Item();
+
+	// Same icon-to-type mapping as detectWeb; "book" is the fallback.
+	newItem.itemType = "book";
+	var imgXpath = ''/html/body/table/tbody/tr/td[1]/p/img/@src'';
+	var imgNode = doc.evaluate(imgXpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+	var imgsrc = imgNode ? imgNode.nodeValue : "";
+	if (imgsrc.indexOf("icon_thesis.gif") != -1) {
+		newItem.itemType = "thesis";
+	} else if (imgsrc.indexOf("icon_art.gif") != -1) {
+		newItem.itemType = "journalArticle";
+	}
+
+	// Each matched row is read as a label cell (td[1]) and a value cell (td[2]).
+	var rowXpath = ''//tr[td[@class="preslabel"]]'';
+	var tableRows = doc.evaluate(rowXpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
+	var tableRow;
+	while (tableRow = tableRows.iterateNext()) {
+		var labelCell = doc.evaluate(''./td[1]'', tableRow, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+		var valueCell = doc.evaluate(''./td[2]'', tableRow, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+		if (!labelCell || !valueCell) {
+			continue;
+		}
+		var field = Zotero.Utilities.superCleanString(labelCell.textContent);
+		var value = valueCell.textContent;
+
+		if (field == "Titre" || field == "Title") {
+			Zotero.debug("title = " + value);
+			// Drop bracketed notes; keep only the text before " / ".
+			newItem.title = value.replace(/(\[[^\]]+\])/g, "").split(" / ")[0];
+		}
+		if (field.substr(0,6) == "Auteur" || field.substr(0,6) == "Author") {
+			var authors = doc.evaluate(''./td[2]/a'', tableRow, nsResolver, XPathResult.ANY_TYPE, null);
+			var author;
+			while (author = authors.iterateNext()) {
+				// Keep only the text before " (" in each author link.
+				var authorName = author.textContent.split(" (")[0];
+				newItem.creators.push(Zotero.Utilities.cleanAuthor(authorName, 1, true));
+			}
+		}
+		if (field.substr(0,4) == "Date") {
+			newItem.date = value;
+		}
+		if (field.substr(0,7) == "Editeur" || field.substr(0,9) == "Publisher") {
+			// Value is split as "place : publisher, rest".
+			var pubParts = value.split(" : ");
+			newItem.place = pubParts[0];
+			if (pubParts[1]) {
+				newItem.publisher = pubParts[1].split(", ")[0];
+			}
+		}
+		// ISBN and ISSN rows each go to their own field.
+		if (field.substr(0,4) == "ISBN") {
+			newItem.ISBN = value.split(" (")[0];
+		}
+		if (field.substr(0,4) == "ISSN") {
+			newItem.ISSN = value.split(" (")[0];
+		}
+		if (field == "Description") {
+			var pageCount = value.match(/([0-9]+) (?:[pP])/);
+			if (pageCount) {
+				newItem.pages = pageCount[1];
+			}
+		}
+		if (field.substr(0,5) == "Serie" || field.substr(0,10) == "Collection") {
+			newItem.series = value;
+		}
+		if (field.substr(0,6) == "Sujets" || field.substr(0,8) == "Subjects") {
+			var subjectElmts = doc.evaluate(''./td[2]/a'', tableRow, nsResolver, XPathResult.ANY_TYPE, null);
+			var subject;
+			while (subject = subjectElmts.iterateNext()) {
+				// A heading such as "A -- B" yields the tags "A" and "B".
+				newItem.tags = newItem.tags.concat(subject.textContent.split(" -- "));
+			}
+		}
+		if (field == "In" || field == "Dans") {
+			// Host publication: title first, then optional pages and ISSN/ISBN.
+			var jtitle = value.replace(/(\[[^\]]+\])/g, "");
+			jtitle = jtitle.split(" / ")[0];
+			jtitle = jtitle.split(" - ")[0];
+			newItem.publicationTitle = jtitle;
+			var pageRange = value.match(/(?:[Pp]\. )([0-9\-]+)/);
+			if (pageRange) {
+				newItem.pages = pageRange[1];
+			}
+			// Store the number under whichever label (ISSN or ISBN) precedes it.
+			var stdNumber = value.match(/(ISSN|ISBN) ([0-9Xx\-]+)/);
+			if (stdNumber) {
+				newItem[stdNumber[1]] = stdNumber[2];
+			}
+		}
+		// TODO Pages, Notes, Description, Language, Annexes
+	}
+	newItem.complete();
+}
+
+function doWeb(doc, url) {
+	// Scrapes a record page directly; on a result list, lets the user choose
+	// entries and scrapes each chosen record.
+	var namespace = doc.documentElement.namespaceURI;
+	var nsResolver = namespace ? function(prefix) {
+		if (prefix == ''x'') return namespace; else return null;
+	} : null;
+
+	var multxpath = ''//a[@id="InitialFocusPoint"]'';
+	if (!doc.evaluate(multxpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) {
+		scrape(doc);
+		return;
+	}
+
+	// Result links are prefixed with the page base href to form record URLs.
+	var newUrl = doc.evaluate(''//base/@href'', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue;
+	var elmts = doc.evaluate(''//tr/td[3]/a'', doc, nsResolver, XPathResult.ANY_TYPE, null);
+	var elmt;
+	var links = new Array();
+	var availableItems = new Array();
+	// while (not do/while) so that a list with no result links is not dereferenced.
+	while (elmt = elmts.iterateNext()) {
+		var link = elmt.getAttribute("href");
+		if (!link) {
+			continue;
+		}
+		links.push(link);
+		availableItems.push(elmt.textContent);
+	}
+	var items = Zotero.selectItems(availableItems);
+
+	if (!items) {
+		return true;
+	}
+	var uris = new Array();
+	for (var i in items) {
+		uris.push(newUrl + links[i]);
+	}
+	Zotero.Utilities.processDocuments(uris, function(doc) { scrape(doc) },
+		function() { Zotero.done(); }, null);
+	Zotero.wait();
+}');
+
+
 REPLACE INTO translators VALUES ('e07e9b8c-0e98-4915-bb5a-32a08cb2f365', '1.0.0b3.r1', '', '2006-10-02 17:00:00', 1, 100, 8, 'Open WorldCat', 'Simon Kornblith', 'http://partneraccess.oclc.org/',
 'function detectSearch(item) {
 	if(item.itemType == "book" || item.itemType == "bookSection") {