Migrate ScienceDirect to use processAsync(); something is strange with the DOI XPath, so use the data from RIS and a regex instead

This commit is contained in:
Matt Burton 2009-06-30 17:20:10 +00:00
parent 5e080b78f0
commit fb104160d5

View File

@ -58,34 +58,63 @@ function doWeb(doc, url) {
Zotero.debug('no items'); Zotero.debug('no items');
return; return;
} }
Zotero.Utilities.processDocuments(articles, function(newDoc) {
var doi = newDoc.evaluate('//div[@class="articleHeaderInner"][@id="articleHeader"]/a[contains(text(), "doi")]', newDoc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent.substr(4);
var tempPDF = newDoc.evaluate('//a[@class="noul" and div/div[contains(text(), "PDF")]]', newDoc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); var sets = [];
if (!tempPDF) { // PDF xpath failed, lets try another for each (article in articles) {
tempPDF = newDoc.evaluate('//a[@class="noul" and contains(text(), "PDF")]', newDoc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); sets.push({article:article});
if (!tempPDF) { // second PDF xpath failed set PDF to null to avoid item.attachments }
var PDF = null; var first = function(set, next) {
var article = set.article;
Zotero.Utilities.processDocuments(article, function(newDoc) {
var tempPDF = newDoc.evaluate('//a[@class="noul" and div/div[contains(text(), "PDF")]]', newDoc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
if (!tempPDF) { // PDF xpath failed, lets try another
tempPDF = newDoc.evaluate('//a[@class="noul" and contains(text(), "PDF")]', newDoc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
if (!tempPDF) { // second PDF xpath failed set PDF to null to avoid item.attachments
var PDF = null;
} else {
var PDF = tempPDF.href; // second xpath succeeded, use that link
}
} else { } else {
var PDF = tempPDF.href; // second xpath succeeded, use that link var PDF = tempPDF.href; // first xpath succeeded, use that link
} }
} else {
var PDF = tempPDF.href; // first xpath succeeded, use that link
}
var url = newDoc.location.href; var url = newDoc.location.href;
var get = newDoc.evaluate('//a[img[contains(@src, "exportarticle_a.gif")]]', newDoc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().href; var get = newDoc.evaluate('//a[img[contains(@src, "exportarticle_a.gif")]]', newDoc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().href;
// if the PDF is available make it an attachment otherwise only use snapshot. // if the PDF is available make it an attachment otherwise only use snapshot.
if (PDF) { if (PDF) {
var attachments = [ var attachments = [
{url:url, title:"ScienceDirect Snapshot", mimeType:"text/html"}, {url:url, title:"ScienceDirect Snapshot", mimeType:"text/html"},
{url:PDF, title:"ScienceDirect Full Text PDF", mimeType:"application/pdf"} // Sometimes PDF is null...I hope that is ok {url:PDF, title:"ScienceDirect Full Text PDF", mimeType:"application/pdf"} // Sometimes PDF is null...I hope that is ok
]; ];
} else { } else {
var attachments = [ var attachments = [
{url:url, title:"ScienceDirect Snapshot", mimeType:"text/html"}, {url:url, title:"ScienceDirect Snapshot", mimeType:"text/html"},
]; ];
} }
// This does not work, not sure why.
//var doi = newDoc.evaluate('//a[contains(text(), "doi")]/text()', newDoc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
//Zotero.debug(doi);
//doi = doi.textContent.substr(4);
// pass these values to the next function
//set.doi = doi;
set.url = url;
set.get = get;
set.attachments = attachments;
next();
});
};
var second = function(set, next) {
var url = set.url;
var get = set.get;
Zotero.Utilities.HTTP.doGet(get, function(text) { Zotero.Utilities.HTTP.doGet(get, function(text) {
var md5 = text.match(/<input type=hidden name=md5 value=([^>]+)>/)[1]; var md5 = text.match(/<input type=hidden name=md5 value=([^>]+)>/)[1];
var acct = text.match(/<input type=hidden name=_acct value=([^>]+)>/)[1]; var acct = text.match(/<input type=hidden name=_acct value=([^>]+)>/)[1];
@ -101,26 +130,50 @@ function doWeb(doc, url) {
} }
var post = "_ob=DownloadURL&_method=finish&_acct=" + acct + "&_userid=" + userid + "&_docType=FLA&" + docID + "&md5=" + md5 + "&count=1&JAVASCRIPT_ON=Y&format=cite-abs&citation-type=RIS&Export=Export&x=26&y=17"; var post = "_ob=DownloadURL&_method=finish&_acct=" + acct + "&_userid=" + userid + "&_docType=FLA&" + docID + "&md5=" + md5 + "&count=1&JAVASCRIPT_ON=Y&format=cite-abs&citation-type=RIS&Export=Export&x=26&y=17";
var baseurl = url.match(/https?:\/\/[^/]+\//)[0]; var baseurl = url.match(/https?:\/\/[^/]+\//)[0];
Zotero.Utilities.HTTP.doPost(baseurl + 'science', post, function(text) {
var translator = Zotero.loadTranslator("import");
translator.setTranslator("32d59d2d-b65a-4da4-b0a3-bdd3cfb979e7");
translator.setString(text);
translator.setHandler("itemDone", function(obj, item) {
item.attachments = attachments;
if(item.notes[0]) { set.post = post;
item.abstractNote = item.notes[0].note; set.baseurl = baseurl;
item.notes = new Array();
} next();
if (doi) {
item.DOI = doi;
}
item.complete();
});
translator.translate();
}, false, 'windows-1252');
}); });
}, function() {Zotero.done();});
};
/**
 * Pipeline step that posts the prepared export request and imports the response.
 *
 * Reads `baseurl`, `post` and `attachments` from `set`, POSTs `post` to
 * `baseurl + 'science'`, and feeds the response text to an import translator.
 * Each item the translator produces gets the prepared attachments, has its
 * first note moved into `abstractNote`, and is then completed.
 *
 * @param {Object} set - Per-article state; must carry `baseurl`, `post` and `attachments`.
 * @param {Function} next - Continuation invoked once translation has been started.
 */
var third = function(set, next) {
    var baseurl = set.baseurl;
    var post = set.post;
    var attachments = set.attachments;
    Zotero.Utilities.HTTP.doPost(baseurl + 'science', post, function(text) {
        // NOTE(review): the GUID is presumably the RIS import translator, since the
        // export request asks for citation-type=RIS -- confirm.
        var translator = Zotero.loadTranslator("import");
        translator.setTranslator("32d59d2d-b65a-4da4-b0a3-bdd3cfb979e7");
        translator.setString(text);
        translator.setHandler("itemDone", function(obj, item) {
            item.attachments = attachments;
            // The first note is used as the abstract; notes are then cleared.
            if (item.notes && item.notes[0]) {
                item.abstractNote = item.notes[0].note;
                item.notes = [];
            }
            // Only trim when a DOI was actually imported; calling substr on a
            // missing DOI would throw and the item would never be completed.
            // NOTE(review): substr(10) presumably drops a fixed 10-character
            // prefix in the exported DOI field -- confirm against real output.
            if (item.DOI) {
                item.DOI = item.DOI.substr(10);
            }
            item.complete();
        });
        translator.translate();
        // NOTE(review): next() runs right after translate() returns; this assumes
        // translation has finished (or may safely overlap) by then -- confirm.
        next();
    }, false, 'windows-1252');
};
var functioncallbacks = [first, second, third];
Zotero.Utilities.processAsync(sets, functioncallbacks, function() {Zotero.done()});
} else { } else {
var articles = new Array(); var articles = new Array();
if (detectWeb(doc, url) == "multiple") { if (detectWeb(doc, url) == "multiple") {
@ -149,54 +202,89 @@ function doWeb(doc, url) {
Zotero.debug('no items'); Zotero.debug('no items');
return; return;
} }
Zotero.Utilities.processDocuments(articles, function(doc2) {
var item = new Zotero.Item("journalArticle");
item.repository = "ScienceDirect"; var sets = [];
item.url = doc2.location.href; for each (article in articles) {
var title = doc2.title.match(/^[^-]+\-([^:]+):(.*)$/); sets.push({article:article});
item.title = Zotero.Utilities.trimInternal(title[2]); }
item.publicationTitle = Zotero.Utilities.trimInternal(title[1]);
voliss = doc2.evaluate('//div[@class="pageText"][@id="sdBody"]/table/tbody/tr/td[1]', doc2, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent;
if (voliss.match(/Volume\s+\d+/)) item.volume = voliss.match(/Volume\s+(\d+)/)[1]; var first = function(set, next) {
if (voliss.match(/Issues?\s+[^,]+/)) item.issue = voliss.match(/Issues?\s+([^,]+)/)[1];
if (voliss.match(/(J|F|M|A|S|O|N|D)\w+\s+\d{4}/)) item.date = voliss.match(/(J|F|M|A|S|O|N|D)\w+\s+\d{4}/)[0]; var article = set.article;
if (voliss.match(/Pages?\s+[^,^\s]+/)) item.pages = voliss.match(/Pages?\s+([^,^\s]+)/)[1];
item.DOI = doc2.evaluate('//div[@class="articleHeaderInner"][@id="articleHeader"]/a[contains(text(), "doi")]', doc2, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent.substr(4); Zotero.Utilities.processDocuments(article, function(doc2) {
var abspath = '//div[@class="articleHeaderInner"][@id="articleHeader"]/div[@class="articleText"]/p'; var item = new Zotero.Item("journalArticle");
var absx = doc2.evaluate(abspath, doc2, nsResolver, XPathResult.ANY_TYPE, null); item.repository = "ScienceDirect";
var ab; item.url = doc2.location.href;
item.abstractNote = "" var title = doc2.title.match(/^[^-]+\-([^:]+):(.*)$/);
while (ab = absx.iterateNext()) { item.title = Zotero.Utilities.trimInternal(title[2]);
item.abstractNote += Zotero.Utilities.trimInternal(ab.textContent) + " "; item.publicationTitle = Zotero.Utilities.trimInternal(title[1]);
} voliss = doc2.evaluate('//div[@class="pageText"][@id="sdBody"]/table/tbody/tr/td[1]', doc2, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent;
if (item.abstractNote.substr(0, 7) == "Summary") { if (voliss.match(/Volume\s+\d+/)) item.volume = voliss.match(/Volume\s+(\d+)/)[1];
item.abstractNote = item.abstractNote.substr(9); if (voliss.match(/Issues?\s+[^,]+/)) item.issue = voliss.match(/Issues?\s+([^,]+)/)[1];
} if (voliss.match(/(J|F|M|A|S|O|N|D)\w+\s+\d{4}/)) item.date = voliss.match(/(J|F|M|A|S|O|N|D)\w+\s+\d{4}/)[0];
var tagpath = '//div[@class="articleText"]/p[strong[starts-with(text(), "Keywords:")]]'; if (voliss.match(/Pages?\s+[^,^\s]+/)) item.pages = voliss.match(/Pages?\s+([^,^\s]+)/)[1];
if (doc2.evaluate(tagpath, doc2, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) { // why doesn't this work?
if (doc2.evaluate(tagpath, doc2, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent.split(":")[1]) { //item.DOI = doc2.evaluate('//a[contains(text(), "doi")]/text()', doc2, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent.substr(4);
var tags = doc2.evaluate(tagpath, doc2, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent.split(":")[1].split(";"); var abspath = '//div[@class="articleHeaderInner"][@id="articleHeader"]/div[@class="articleText"]/p';
for (var i in tags) { var absx = doc2.evaluate(abspath, doc2, nsResolver, XPathResult.ANY_TYPE, null);
item.tags.push(Zotero.Utilities.trimInternal(tags[i])); var ab;
item.abstractNote = "";
while (ab = absx.iterateNext()) {
item.abstractNote += Zotero.Utilities.trimInternal(ab.textContent) + " ";
}
if (item.abstractNote.substr(0, 7) == "Summary") {
item.abstractNote = item.abstractNote.substr(9);
}
var tagpath = '//div[@class="articleText"]/p[strong[starts-with(text(), "Keywords:")]]';
if (doc2.evaluate(tagpath, doc2, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) {
if (doc2.evaluate(tagpath, doc2, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent.split(":")[1]) {
var tags = doc2.evaluate(tagpath, doc2, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent.split(":")[1].split(";");
for (var i in tags) {
item.tags.push(Zotero.Utilities.trimInternal(tags[i]));
}
} }
} }
} item.attachments.push({url:doc2.location.href, title:"ScienceDirect Snapshot", mimeType:"text/html"});
item.attachments.push({url:doc2.location.href, title:"ScienceDirect Snapshot", mimeType:"text/html"});
set.item = item;
next();
});
};
var second = function(set, next) {
var item = set.item;
Zotero.Utilities.HTTP.doGet(item.url, function(text) { Zotero.Utilities.HTTP.doGet(item.url, function(text) {
var aus = text.match(/<strong>\s+<p>.*<\/strong>/)[0].replace(/<sup>/g, "$").replace(/<\/sup>/g, "$"); item.DOI = text.match(/>doi:([^<]*)/)[1];
aus = aus.replace(/\$[^$]*\$/g, "");
aus = aus.replace(/<a[^>]*>/g, "$").replace(/<\/a[^>]*>/g, "$"); try {
aus = aus.replace(/\$[^$]*\$/g, ""); var aus = text.match(/<strong>\s+<p>.*<\/strong>/)[0].replace(/<sup>/g, "$").replace(/<\/sup>/g, "$");
aus = Zotero.Utilities.cleanTags(aus); aus = aus.replace(/\$[^$]*\$/g, "");
aus = aus.split(/(,|and)/); aus = aus.replace(/<a[^>]*>/g, "$").replace(/<\/a[^>]*>/g, "$");
for (var a in aus) { aus = aus.replace(/\$[^$]*\$/g, "");
if (aus[a] != "," && aus[a] != "and" && aus[a].match(/\w+/)) { aus = Zotero.Utilities.cleanTags(aus);
item.creators.push(Zotero.Utilities.cleanAuthor(Zotero.Utilities.unescapeHTML(Zotero.Utilities.trimInternal(aus[a]), "author"))); aus = aus.split(/(,|and)/);
for (var a in aus) {
if (aus[a] != "," && aus[a] != "and" && aus[a].match(/\w+/)) {
item.creators.push(Zotero.Utilities.cleanAuthor(Zotero.Utilities.unescapeHTML(Zotero.Utilities.trimInternal(aus[a]), "author")));
}
} }
} catch(e) {
Zotero.debug("No Authors listed.");
} }
item.complete(); item.complete();
next();
}); });
}, function() {Zotero.done();}); };
var functioncallbacks = [first, second];
Zotero.Utilities.processAsync(sets, functioncallbacks, function() {Zotero.done()});
} }
Zotero.wait(); Zotero.wait();
} }