-First draft of a DSpace translator using Dublin Core metadata.

This commit is contained in:
Michael Berkowitz 2008-02-22 19:17:53 +00:00
parent 81cbc50622
commit 7d8806fa97

View File

@ -22,7 +22,7 @@
-- Set the following timestamp to the most recent scraper update date
REPLACE INTO version VALUES ('repository', STRFTIME('%s', '2008-02-22 16:30:00'));
REPLACE INTO version VALUES ('repository', STRFTIME('%s', '2008-02-22 20:30:00'));
REPLACE INTO translators VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '1.0.0b4.r1', '', '2007-06-21 20:00:00', '1', '100', '4', 'Amazon.com', 'Sean Takats', '^https?://(?:www\.)?amazon',
'function detectWeb(doc, url) {
@ -1579,127 +1579,92 @@ function doWeb(doc, url) {
}
}');
REPLACE INTO translators VALUES ('0abd577b-ec45-4e9f-9081-448737e2fd34', '1.0.0b4.r5', '', '2008-02-06 21:00:00', '0', '100', '12', 'DSpace', 'Ramesh Srigiriraju', '',
'function detectWeb(doc, url) {
var namespace=doc.documentElement.namespaceURI;
var nsResolver=namespace?function(prefix) {
return (prefix=="x")?namespace:null;
}:null;
var xpath=''//tr[@class="navigationBarItem"]/td/a[contains(@href, "/mydspace")]'';
if(doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) {
var singpath=''//center/table[@class="itemDisplayTable"]'';
if(doc.evaluate(singpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext())
return "book";
var searchpath=''//h3[text()="Item hits:"]'';
var browsdate=''//form[@method="get"][@action="browse-date"]'';
var browstitl=''//form[@method="get"][@action="browse-title"]'';
var other=''//tr/th[@class="oddRowEvenCol"]'';
if(doc.evaluate(searchpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()
||doc.evaluate(browsdate, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()
||doc.evaluate(browstitl, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()
||doc.evaluate(other, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext())
return "multiple";
REPLACE INTO translators VALUES ('c073446d-b90b-47be-a29d-785995d38d87', '1.0.0b4.r5', '', '2008-02-22 20:30:00', '0', '100', '4', 'DSpace', 'Michael Berkowitz', 'dspace',
'function detectWeb(doc, url) {
if (doc.evaluate(''//center/table[@class="itemDisplayTable"]'', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) {
return "document";
} else if (doc.evaluate(''//table[@class="miscTable"]//td[2]'', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) {
return "multiple";
}
}',
'function scrape(doc) {
var namespace=doc.documentElement.namespaceURI;
var nsResolver=namespace?function(prefix) {
return (prefix=="x")?namespace:null;
}:null;
var singpath=''//center/table[@class="itemDisplayTable"]'';
if(doc.evaluate(singpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) {
var labelpath=''//tr/td[@class="metadataFieldLabel"]'';
var labels=doc.evaluate(labelpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
var valpath=''//tr/td[@class="metadataFieldValue"]'';
var values=doc.evaluate(valpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
var temp
var newItem=new Zotero.Item("book");
while(temp=labels.iterateNext()) {
var field=temp.textContent;
field=Zotero.Utilities.cleanString(field);
var temp2=values.iterateNext();
var value=temp2.textContent;
if(field.substring(0, 5)=="Title")
newItem.title=value;
else if(field.substring(0, 6)=="Author") {
var authtext=temp2.innerHTML;
authtext=authtext.replace(/<br>/g, "\n");
var authors=authtext.split("\n");
for(var i=0; i<=authors.length-1; i++) {
var comma=authors[i].indexOf(",");
if(comma!=-1)
newItem.creators.push({lastName:authors[i].substring(0, comma),
firstName:authors[i].substring(comma+2), creatorType:"author"});
else
newItem.creators.push({lastName:authors[i], creatorType:"author"});
}
}
else if(field.substring(0, 4)=="URI")
newItem.url=value;
else if(field.substring(0, 10)=="Issue Date")
newItem.date=value;
else if(field.substring(0, 9)=="Publisher")
newItem.publisher=value;
else if(field.substring(0, 8)=="Abstract")
newItem.abstract=value;
else if(field.substring(0, 11)=="Description") {
var pullre=new RegExp("\\|Pages ([^\\|]+)\\|");
Zotero.debug(pullre);
var matches=pullre.exec(value);
if(matches)
newItem.pages=matches[1];
}
}
var pdfpath=''//tr[td/text()="Adobe PDF"]/td/a'';
var pdflink=doc.evaluate(pdfpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
if(pdflink)
newItem.attachments.push({url:pdflink.href, title:newItem.title, mimeType:"application/pdf"});
newItem.complete();
}
'var itemTypes = {
// Lookup table keyed by the text of the "type" metadata row on a DSpace
// record page; each value is the Zotero item type assigned for that text.
// doWeb reads this table and uses "document" for any text not listed here.
"Article":"journalArticle",
"Book":"book",
"Thesis":"thesis",
// Both report flavours collapse onto the single Zotero "report" type.
"Working Paper":"report",
"Technical Report":"report"
}
function doWeb(doc, url) {
var namespace=doc.documentElement.namespaceURI;
var nsResolver=namespace?function(prefix) {
return (prefix=="x")?namespace:null;
}:null;
var singpath=''//center/table[@class="itemDisplayTable"]'';
if(doc.evaluate(singpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext())
scrape(doc);
var searchpath=''//h3[text()="Item hits:"]'';
var browsdate=''//form[@method="get"][@action="browse-date"]'';
var browstitl=''//form[@method="get"][@action="browse-title"]'';
var other=''//tr/th[@class="oddRowEvenCol"]'';
if(doc.evaluate(searchpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()
||doc.evaluate(browsdate, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()
||doc.evaluate(browstitl, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()
||doc.evaluate(other, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) {
var headerpath=''//tr[th/@class="oddRowEvenCol"]/th'';
var headers=doc.evaluate(headerpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
var header;
var index=1;
while(header=headers.iterateNext())
if(header.textContent.substring(0, 5)=="Title")
break;
else
index++;
var titlpath=''//tbody[tr/th/@class="oddRowOddCol"]/tr/td[''+index+'']//a'';
var titles=doc.evaluate(titlpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
var title;
var items=new Array();
while(title=titles.iterateNext())
items[title.href]=title.textContent;
items=Zotero.selectItems(items);
var urls=new Array();
for(var link in items)
urls.push(link);
Zotero.Utilities.processDocuments(urls, function(doc) {
scrape(doc);
}, function() {
Zotero.done();
});
function doWeb(doc, url) {
var records = new Array();
if (detectWeb(doc, url) == "multiple") {
var items = new Object();
var xpath = ''//table[@class="miscTable"]/tbody/tr/td[2]/a'';
var rows = doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null);
var row;
while (row = rows.iterateNext()) {
items[row.href] = row.textContent;
}
items = Zotero.selectItems(items);
for (var i in items) {
records.push(i + ''?mode=full'');
}
} else {
records = [url.match(/^([^?]+)\??/)[1] + "?mode=full"];
}
Zotero.wait();
Zotero.debug(records);
Zotero.Utilities.processDocuments(records, function(newDoc) {
Zotero.debug(newDoc.location.href);
var values = new Object();
var fields = newDoc.evaluate(''//table[@class="itemDisplayTable"]/tbody/tr/td[1]'', newDoc, null, XPathResult.ANY_TYPE, null);
var data = newDoc.evaluate(''//table[@class="itemDisplayTable"]/tbody/tr/td[2]'', newDoc, null, XPathResult.ANY_TYPE, null);
var field2;
var datum2;
var newItem = new Zotero.Item();
while ((field2 = fields.iterateNext()) && (datum2 = data.iterateNext())) {
var field = field2.textContent.replace(/^dc\./, "");
var datum = datum2.textContent;
if (field == "contributor.author") {
var name = datum.split(",");
newItem.creators.push({firstName:name[1], lastName:name[0], creatorType:"author"});
} else if (field == "dentifier.uri") {
newItem.url = datum;
} else if (field == "title") {
newItem.title = datum;
} else if (field == "type") {
if (itemTypes[datum]) {
newItem.itemType = itemTypes[datum];
} else {
newItem.itemType = "document";
}
} else if (field == "description.abstract") {
newItem.abstractNote = datum;
} else if (field == "date.available") {
newItem.date = datum.replace(/T.*$/, "");
} else if (field == "subject") {
newItem.tags.push(datum);
} else if (field == "publisher") {
newItem.publisher = datum;
} else if (field == "identifier.issn") {
newItem.ISSN = datum;
} else if (field == "relation.ispartofseries") {
if (datum.match(/Vol/)) {
newItem.volume = datum.match(/\d+/)[0];
} else if (datum.match(/No/)) {
newItem.issue = datum.match(/\d+/)[0];
}
} else if (field == "rights") {
newItem.rights = datum;
}
}
var pdf = newDoc.evaluate(''//td[@class="standard"]/a'', newDoc, null, XPathResult.ANY_TYPE, null).iterateNext().href;
newItem.attachments = [
{url:newDoc.location.href, title:"DSpace Snapshot", mimeType:"text/html"},
{url:pdf, title:"DSpace PDF", mimeType:"application/pdf"}
];
Zotero.debug(newItem);
newItem.complete();
}, function() {Zotero.done;});
}');
REPLACE INTO translators VALUES ('7987b420-e8cb-4bea-8ef7-61c2377cd686', '1.0.0b4.r1', '', '2008-02-06 20:00:00', '0', '100', '4', 'NASA ADS', 'Asa Kusuma and Ramesh Srigiriraju', 'http://(ukads|cdsads|ads|adsabs|esoads|adswww|www.ads)\.(inasan|iucaa.ernet|nottingham.ac|harvard|eso|u-strasbg|nao.ac|astro.puc|bao.ac|on|kasi.re|grangenet|lipi.go|mao.kiev)\.(edu|org|net|fr|jp|cl|id|uk|cn|ua|in|ru|br|kr)/(?:cgi-bin|abs)/',