-First draft of a DSpace translator using Dublin Core metadata.
This commit is contained in:
parent
81cbc50622
commit
7d8806fa97
199
scrapers.sql
199
scrapers.sql
|
@ -22,7 +22,7 @@
|
|||
|
||||
|
||||
-- Set the following timestamp to the most recent scraper update date
|
||||
REPLACE INTO version VALUES ('repository', STRFTIME('%s', '2008-02-22 16:30:00'));
|
||||
REPLACE INTO version VALUES ('repository', STRFTIME('%s', '2008-02-22 20:30:00'));
|
||||
|
||||
REPLACE INTO translators VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '1.0.0b4.r1', '', '2007-06-21 20:00:00', '1', '100', '4', 'Amazon.com', 'Sean Takats', '^https?://(?:www\.)?amazon',
|
||||
'function detectWeb(doc, url) {
|
||||
|
@ -1579,127 +1579,92 @@ function doWeb(doc, url) {
|
|||
}
|
||||
}');
|
||||
|
||||
REPLACE INTO translators VALUES ('0abd577b-ec45-4e9f-9081-448737e2fd34', '1.0.0b4.r5', '', '2008-02-06 21:00:00', '0', '100', '12', 'DSpace', 'Ramesh Srigiriraju', '',
|
||||
'function detectWeb(doc, url) {
|
||||
var namespace=doc.documentElement.namespaceURI;
|
||||
var nsResolver=namespace?function(prefix) {
|
||||
return (prefix=="x")?namespace:null;
|
||||
}:null;
|
||||
var xpath=''//tr[@class="navigationBarItem"]/td/a[contains(@href, "/mydspace")]'';
|
||||
if(doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) {
|
||||
var singpath=''//center/table[@class="itemDisplayTable"]'';
|
||||
if(doc.evaluate(singpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext())
|
||||
return "book";
|
||||
var searchpath=''//h3[text()="Item hits:"]'';
|
||||
var browsdate=''//form[@method="get"][@action="browse-date"]'';
|
||||
var browstitl=''//form[@method="get"][@action="browse-title"]'';
|
||||
var other=''//tr/th[@class="oddRowEvenCol"]'';
|
||||
if(doc.evaluate(searchpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()
|
||||
||doc.evaluate(browsdate, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()
|
||||
||doc.evaluate(browstitl, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()
|
||||
||doc.evaluate(other, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext())
|
||||
return "multiple";
|
||||
REPLACE INTO translators VALUES ('c073446d-b90b-47be-a29d-785995d38d87', '1.0.0b4.r5', '', '2008-02-22 20:30:00', '0', '100', '4', 'DSpace', 'Michael Berkowitz', 'dspace',
|
||||
'function detectWeb(doc, url) {
|
||||
if (doc.evaluate(''//center/table[@class="itemDisplayTable"]'', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) {
|
||||
return "document";
|
||||
} else if (doc.evaluate(''//table[@class="miscTable"]//td[2]'', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) {
|
||||
return "multiple";
|
||||
}
|
||||
}',
|
||||
'function scrape(doc) {
|
||||
var namespace=doc.documentElement.namespaceURI;
|
||||
var nsResolver=namespace?function(prefix) {
|
||||
return (prefix=="x")?namespace:null;
|
||||
}:null;
|
||||
var singpath=''//center/table[@class="itemDisplayTable"]'';
|
||||
if(doc.evaluate(singpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) {
|
||||
var labelpath=''//tr/td[@class="metadataFieldLabel"]'';
|
||||
var labels=doc.evaluate(labelpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
|
||||
var valpath=''//tr/td[@class="metadataFieldValue"]'';
|
||||
var values=doc.evaluate(valpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
|
||||
var temp
|
||||
var newItem=new Zotero.Item("book");
|
||||
while(temp=labels.iterateNext()) {
|
||||
var field=temp.textContent;
|
||||
field=Zotero.Utilities.cleanString(field);
|
||||
var temp2=values.iterateNext();
|
||||
var value=temp2.textContent;
|
||||
if(field.substring(0, 5)=="Title")
|
||||
newItem.title=value;
|
||||
else if(field.substring(0, 6)=="Author") {
|
||||
var authtext=temp2.innerHTML;
|
||||
authtext=authtext.replace(/<br>/g, "\n");
|
||||
var authors=authtext.split("\n");
|
||||
for(var i=0; i<=authors.length-1; i++) {
|
||||
var comma=authors[i].indexOf(",");
|
||||
if(comma!=-1)
|
||||
newItem.creators.push({lastName:authors[i].substring(0, comma),
|
||||
firstName:authors[i].substring(comma+2), creatorType:"author"});
|
||||
else
|
||||
newItem.creators.push({lastName:authors[i], creatorType:"author"});
|
||||
}
|
||||
}
|
||||
else if(field.substring(0, 4)=="URI")
|
||||
newItem.url=value;
|
||||
else if(field.substring(0, 10)=="Issue Date")
|
||||
newItem.date=value;
|
||||
else if(field.substring(0, 9)=="Publisher")
|
||||
newItem.publisher=value;
|
||||
else if(field.substring(0, 8)=="Abstract")
|
||||
newItem.abstract=value;
|
||||
else if(field.substring(0, 11)=="Description") {
|
||||
var pullre=new RegExp("\\|Pages ([^\\|]+)\\|");
|
||||
Zotero.debug(pullre);
|
||||
var matches=pullre.exec(value);
|
||||
if(matches)
|
||||
newItem.pages=matches[1];
|
||||
}
|
||||
}
|
||||
var pdfpath=''//tr[td/text()="Adobe PDF"]/td/a'';
|
||||
var pdflink=doc.evaluate(pdfpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
|
||||
if(pdflink)
|
||||
newItem.attachments.push({url:pdflink.href, title:newItem.title, mimeType:"application/pdf"});
|
||||
newItem.complete();
|
||||
}
|
||||
'var itemTypes = {
|
||||
"Article":"journalArticle",
|
||||
"Book":"book",
|
||||
"Thesis":"thesis",
|
||||
"Working Paper":"report",
|
||||
"Technical Report":"report"
|
||||
}
|
||||
|
||||
function doWeb(doc, url) {
|
||||
var namespace=doc.documentElement.namespaceURI;
|
||||
var nsResolver=namespace?function(prefix) {
|
||||
return (prefix=="x")?namespace:null;
|
||||
}:null;
|
||||
var singpath=''//center/table[@class="itemDisplayTable"]'';
|
||||
if(doc.evaluate(singpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext())
|
||||
scrape(doc);
|
||||
var searchpath=''//h3[text()="Item hits:"]'';
|
||||
var browsdate=''//form[@method="get"][@action="browse-date"]'';
|
||||
var browstitl=''//form[@method="get"][@action="browse-title"]'';
|
||||
var other=''//tr/th[@class="oddRowEvenCol"]'';
|
||||
if(doc.evaluate(searchpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()
|
||||
||doc.evaluate(browsdate, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()
|
||||
||doc.evaluate(browstitl, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()
|
||||
||doc.evaluate(other, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) {
|
||||
var headerpath=''//tr[th/@class="oddRowEvenCol"]/th'';
|
||||
var headers=doc.evaluate(headerpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
|
||||
var header;
|
||||
var index=1;
|
||||
while(header=headers.iterateNext())
|
||||
if(header.textContent.substring(0, 5)=="Title")
|
||||
break;
|
||||
else
|
||||
index++;
|
||||
var titlpath=''//tbody[tr/th/@class="oddRowOddCol"]/tr/td[''+index+'']//a'';
|
||||
var titles=doc.evaluate(titlpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
|
||||
var title;
|
||||
var items=new Array();
|
||||
while(title=titles.iterateNext())
|
||||
items[title.href]=title.textContent;
|
||||
items=Zotero.selectItems(items);
|
||||
var urls=new Array();
|
||||
for(var link in items)
|
||||
urls.push(link);
|
||||
Zotero.Utilities.processDocuments(urls, function(doc) {
|
||||
scrape(doc);
|
||||
}, function() {
|
||||
Zotero.done();
|
||||
});
|
||||
function doWeb(doc, url) {
|
||||
var records = new Array();
|
||||
if (detectWeb(doc, url) == "multiple") {
|
||||
var items = new Object();
|
||||
var xpath = ''//table[@class="miscTable"]/tbody/tr/td[2]/a'';
|
||||
var rows = doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null);
|
||||
var row;
|
||||
while (row = rows.iterateNext()) {
|
||||
items[row.href] = row.textContent;
|
||||
}
|
||||
items = Zotero.selectItems(items);
|
||||
for (var i in items) {
|
||||
records.push(i + ''?mode=full'');
|
||||
}
|
||||
} else {
|
||||
records = [url.match(/^([^?]+)\??/)[1] + "?mode=full"];
|
||||
}
|
||||
Zotero.wait();
|
||||
Zotero.debug(records);
|
||||
Zotero.Utilities.processDocuments(records, function(newDoc) {
|
||||
Zotero.debug(newDoc.location.href);
|
||||
var values = new Object();
|
||||
var fields = newDoc.evaluate(''//table[@class="itemDisplayTable"]/tbody/tr/td[1]'', newDoc, null, XPathResult.ANY_TYPE, null);
|
||||
var data = newDoc.evaluate(''//table[@class="itemDisplayTable"]/tbody/tr/td[2]'', newDoc, null, XPathResult.ANY_TYPE, null);
|
||||
var field2;
|
||||
var datum2;
|
||||
var newItem = new Zotero.Item();
|
||||
while ((field2 = fields.iterateNext()) && (datum2 = data.iterateNext())) {
|
||||
var field = field2.textContent.replace(/^dc\./, "");
|
||||
var datum = datum2.textContent;
|
||||
if (field == "contributor.author") {
|
||||
var name = datum.split(",");
|
||||
newItem.creators.push({firstName:name[1], lastName:name[0], creatorType:"author"});
|
||||
} else if (field == "dentifier.uri") {
|
||||
newItem.url = datum;
|
||||
} else if (field == "title") {
|
||||
newItem.title = datum;
|
||||
} else if (field == "type") {
|
||||
if (itemTypes[datum]) {
|
||||
newItem.itemType = itemTypes[datum];
|
||||
} else {
|
||||
newItem.itemType = "document";
|
||||
}
|
||||
} else if (field == "description.abstract") {
|
||||
newItem.abstractNote = datum;
|
||||
} else if (field == "date.available") {
|
||||
newItem.date = datum.replace(/T.*$/, "");
|
||||
} else if (field == "subject") {
|
||||
newItem.tags.push(datum);
|
||||
} else if (field == "publisher") {
|
||||
newItem.publisher = datum;
|
||||
} else if (field == "identifier.issn") {
|
||||
newItem.ISSN = datum;
|
||||
} else if (field == "relation.ispartofseries") {
|
||||
if (datum.match(/Vol/)) {
|
||||
newItem.volume = datum.match(/\d+/)[0];
|
||||
} else if (datum.match(/No/)) {
|
||||
newItem.issue = datum.match(/\d+/)[0];
|
||||
}
|
||||
} else if (field == "rights") {
|
||||
newItem.rights = datum;
|
||||
}
|
||||
}
|
||||
var pdf = newDoc.evaluate(''//td[@class="standard"]/a'', newDoc, null, XPathResult.ANY_TYPE, null).iterateNext().href;
|
||||
newItem.attachments = [
|
||||
{url:newDoc.location.href, title:"DSpace Snapshot", mimeType:"text/html"},
|
||||
{url:pdf, title:"DSpace PDF", mimeType:"application/pdf"}
|
||||
];
|
||||
Zotero.debug(newItem);
|
||||
newItem.complete();
|
||||
}, function() {Zotero.done;});
|
||||
}');
|
||||
|
||||
REPLACE INTO translators VALUES ('7987b420-e8cb-4bea-8ef7-61c2377cd686', '1.0.0b4.r1', '', '2008-02-06 20:00:00', '0', '100', '4', 'NASA ADS', 'Asa Kusuma and Ramesh Srigiriraju', 'http://(ukads|cdsads|ads|adsabs|esoads|adswww|www.ads)\.(inasan|iucaa.ernet|nottingham.ac|harvard|eso|u-strasbg|nao.ac|astro.puc|bao.ac|on|kasi.re|grangenet|lipi.go|mao.kiev)\.(edu|org|net|fr|jp|cl|id|uk|cn|ua|in|ru|br|kr)/(?:cgi-bin|abs)/',
|
||||
|
|
Loading…
Reference in New Issue
Block a user