diff --git a/scrapers.sql b/scrapers.sql index 8ac636334..1597bf3a9 100644 --- a/scrapers.sql +++ b/scrapers.sql @@ -22,7 +22,7 @@ -- Set the following timestamp to the most recent scraper update date -REPLACE INTO version VALUES ('repository', STRFTIME('%s', '2008-02-22 16:30:00')); +REPLACE INTO version VALUES ('repository', STRFTIME('%s', '2008-02-22 20:30:00')); REPLACE INTO translators VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '1.0.0b4.r1', '', '2007-06-21 20:00:00', '1', '100', '4', 'Amazon.com', 'Sean Takats', '^https?://(?:www\.)?amazon', 'function detectWeb(doc, url) { @@ -1579,127 +1579,92 @@ function doWeb(doc, url) { } }'); -REPLACE INTO translators VALUES ('0abd577b-ec45-4e9f-9081-448737e2fd34', '1.0.0b4.r5', '', '2008-02-06 21:00:00', '0', '100', '12', 'DSpace', 'Ramesh Srigiriraju', '', -'function detectWeb(doc, url) { - var namespace=doc.documentElement.namespaceURI; - var nsResolver=namespace?function(prefix) { - return (prefix=="x")?namespace:null; - }:null; - var xpath=''//tr[@class="navigationBarItem"]/td/a[contains(@href, "/mydspace")]''; - if(doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) { - var singpath=''//center/table[@class="itemDisplayTable"]''; - if(doc.evaluate(singpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) - return "book"; - var searchpath=''//h3[text()="Item hits:"]''; - var browsdate=''//form[@method="get"][@action="browse-date"]''; - var browstitl=''//form[@method="get"][@action="browse-title"]''; - var other=''//tr/th[@class="oddRowEvenCol"]''; - if(doc.evaluate(searchpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext() - ||doc.evaluate(browsdate, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext() - ||doc.evaluate(browstitl, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext() - ||doc.evaluate(other, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) - return "multiple"; +REPLACE INTO translators VALUES 
('c073446d-b90b-47be-a29d-785995d38d87', '1.0.0b4.r5', '', '2008-02-22 20:30:00', '0', '100', '4', 'DSpace', 'Michael Berkowitz', 'dspace',
+'function detectWeb(doc, url) {
+	// itemDisplayTable marks a single record page; a miscTable with a second column marks a result list.
+	if (doc.evaluate(''//center/table[@class="itemDisplayTable"]'', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) {
+		return "document";
+	} else if (doc.evaluate(''//table[@class="miscTable"]//td[2]'', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) {
+		return "multiple";
 	}
 }',
-'function scrape(doc) {
-	var namespace=doc.documentElement.namespaceURI;
-	var nsResolver=namespace?function(prefix) {
-		return (prefix=="x")?namespace:null;
-	}:null;
-	var singpath=''//center/table[@class="itemDisplayTable"]'';
-	if(doc.evaluate(singpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) {
-		var labelpath=''//tr/td[@class="metadataFieldLabel"]'';
-		var labels=doc.evaluate(labelpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
-		var valpath=''//tr/td[@class="metadataFieldValue"]'';
-		var values=doc.evaluate(valpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
-		var temp
-		var newItem=new Zotero.Item("book");
-		while(temp=labels.iterateNext()) {
-			var field=temp.textContent;
-			field=Zotero.Utilities.cleanString(field);
-			var temp2=values.iterateNext();
-			var value=temp2.textContent;
-			if(field.substring(0, 5)=="Title")
-				newItem.title=value;
-			else if(field.substring(0, 6)=="Author") {
-				var authtext=temp2.innerHTML;
-				authtext=authtext.replace(/
/g, "\n");
-				var authors=authtext.split("\n");
-				for(var i=0; i<=authors.length-1; i++) {
-					var comma=authors[i].indexOf(",");
-					if(comma!=-1)
-						newItem.creators.push({lastName:authors[i].substring(0, comma),
-							firstName:authors[i].substring(comma+2), creatorType:"author"});
-					else
-						newItem.creators.push({lastName:authors[i], creatorType:"author"});
-				}
-			}
-			else if(field.substring(0, 4)=="URI")
-				newItem.url=value;
-			else if(field.substring(0, 10)=="Issue Date")
-				newItem.date=value;
-			else if(field.substring(0, 9)=="Publisher")
-				newItem.publisher=value;
-			else if(field.substring(0, 8)=="Abstract")
-				newItem.abstract=value;
-			else if(field.substring(0, 11)=="Description") {
-				var pullre=new RegExp("\\|Pages ([^\\|]+)\\|");
-				Zotero.debug(pullre);
-				var matches=pullre.exec(value);
-				if(matches)
-					newItem.pages=matches[1];
-			}
-		}
-		var pdfpath=''//tr[td/text()="Adobe PDF"]/td/a'';
-		var pdflink=doc.evaluate(pdfpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
-		if(pdflink)
-			newItem.attachments.push({url:pdflink.href, title:newItem.title, mimeType:"application/pdf"});
-		newItem.complete();
-	}
+'// Maps the value of the DSpace "type" field to a Zotero item type; unlisted types become "document".
+var itemTypes = {
+	"Article":"journalArticle",
+	"Book":"book",
+	"Thesis":"thesis",
+	"Working Paper":"report",
+	"Technical Report":"report"
 }
 
-function doWeb(doc, url) {
-	var namespace=doc.documentElement.namespaceURI;
-	var nsResolver=namespace?function(prefix) {
-		return (prefix=="x")?namespace:null;
-	}:null;
-	var singpath=''//center/table[@class="itemDisplayTable"]'';
-	if(doc.evaluate(singpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext())
-		scrape(doc);
-	var searchpath=''//h3[text()="Item hits:"]'';
-	var browsdate=''//form[@method="get"][@action="browse-date"]'';
-	var browstitl=''//form[@method="get"][@action="browse-title"]'';
-	var other=''//tr/th[@class="oddRowEvenCol"]'';
-	if(doc.evaluate(searchpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()
-		||doc.evaluate(browsdate, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()
-		||doc.evaluate(browstitl, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()
-		||doc.evaluate(other, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) {
-		var headerpath=''//tr[th/@class="oddRowEvenCol"]/th'';
-		var headers=doc.evaluate(headerpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
-		var header;
-		var index=1;
-		while(header=headers.iterateNext())
-			if(header.textContent.substring(0, 5)=="Title")
-				break;
-			else
-				index++;
-		var titlpath=''//tbody[tr/th/@class="oddRowOddCol"]/tr/td[''+index+'']//a'';
-		var titles=doc.evaluate(titlpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
-		var title;
-		var items=new Array();
-		while(title=titles.iterateNext())
-			items[title.href]=title.textContent;
-		items=Zotero.selectItems(items);
-		var urls=new Array();
-		for(var link in items)
-			urls.push(link);
-		Zotero.Utilities.processDocuments(urls, function(doc) {
-			scrape(doc);
-		}, function() {
-			Zotero.done();
-		});
+function doWeb(doc, url) {
+	// Builds the ?mode=full URL of each selected record (or of the current record) and scrapes it.
+	var records = new Array();
+	if (detectWeb(doc, url) == "multiple") {
+		var items = new Object();
+		var rows = doc.evaluate(''//table[@class="miscTable"]/tbody/tr/td[2]/a'', doc, null, XPathResult.ANY_TYPE, null);
+		var row;
+		while (row = rows.iterateNext()) {
+			items[row.href] = row.textContent;
+		}
+		items = Zotero.selectItems(items);
+		if (!items) return true; // nothing selected (dialog cancelled): stop without scraping
+		for (var i in items) {
+			records.push(i + ''?mode=full'');
+		}
+	} else {
+		records = [url.match(/^([^?]+)\??/)[1] + "?mode=full"];
 	}
-	Zotero.wait();
+	Zotero.debug(records);
+	Zotero.Utilities.processDocuments(records, function(newDoc) {
+		Zotero.debug(newDoc.location.href);
+		var fields = newDoc.evaluate(''//table[@class="itemDisplayTable"]/tbody/tr/td[1]'', newDoc, null, XPathResult.ANY_TYPE, null);
+		var data = newDoc.evaluate(''//table[@class="itemDisplayTable"]/tbody/tr/td[2]'', newDoc, null, XPathResult.ANY_TYPE, null);
+		var field2, datum2;
+		var newItem = new Zotero.Item();
+		// td[1] holds the field name (dc. prefix stripped below), td[2] its value; walk both columns in step.
+		while ((field2 = fields.iterateNext()) && (datum2 = data.iterateNext())) {
+			var field = field2.textContent.replace(/^dc\./, "");
+			var datum = datum2.textContent;
+			if (field == "contributor.author") {
+				newItem.creators.push(Zotero.Utilities.cleanAuthor(datum, "author", true)); // "Last, First"; also copes with a missing comma
+			} else if (field == "identifier.uri") {
+				newItem.url = datum;
+			} else if (field == "title") {
+				newItem.title = datum;
+			} else if (field == "type") {
+				newItem.itemType = itemTypes[datum] || "document";
+			} else if (field == "description.abstract") {
+				newItem.abstractNote = datum;
+			} else if (field == "date.available") {
+				newItem.date = datum.replace(/T.*$/, "");
+			} else if (field == "subject") {
+				newItem.tags.push(datum);
+			} else if (field == "publisher") {
+				newItem.publisher = datum;
+			} else if (field == "identifier.issn") {
+				newItem.ISSN = datum;
+			} else if (field == "relation.ispartofseries") {
+				var num = datum.match(/\d+/); // null when the series text carries no number
+				if (num && datum.match(/Vol/)) {
+					newItem.volume = num[0];
+				} else if (num && datum.match(/No/)) {
+					newItem.issue = num[0];
+				}
+			} else if (field == "rights") {
+				newItem.rights = datum;
+			}
+		}
+		var pdf = newDoc.evaluate(''//td[@class="standard"]/a'', newDoc, null, XPathResult.ANY_TYPE, null).iterateNext();
+		newItem.attachments = [{url:newDoc.location.href, title:"DSpace Snapshot", mimeType:"text/html"}];
+		if (pdf) { // records without a file link still get the snapshot
+			newItem.attachments.push({url:pdf.href, title:"DSpace PDF", mimeType:"application/pdf"});
+		}
+		Zotero.debug(newItem);
+		newItem.complete();
+	}, function() {Zotero.done();});
+	// processDocuments loads pages asynchronously: hold the translation open until Zotero.done() is called.
+	Zotero.wait();
 }');
 
 REPLACE INTO translators VALUES ('7987b420-e8cb-4bea-8ef7-61c2377cd686', '1.0.0b4.r1', '', '2008-02-06 20:00:00', '0', '100', '4', 'NASA ADS', 'Asa Kusuma and Ramesh Srigiriraju', 
'http://(ukads|cdsads|ads|adsabs|esoads|adswww|www.ads)\.(inasan|iucaa.ernet|nottingham.ac|harvard|eso|u-strasbg|nao.ac|astro.puc|bao.ac|on|kasi.re|grangenet|lipi.go|mao.kiev)\.(edu|org|net|fr|jp|cl|id|uk|cn|ua|in|ru|br|kr)/(?:cgi-bin|abs)/',