Trans: Nearly brand-new ACM translator submitted by John McCaffery
This commit is contained in:
parent
f4f7ab93e9
commit
9d6c7740c5
|
@ -1,138 +1,408 @@
|
|||
{
|
||||
"translatorID":"e78d20f7-488-4023-831-dfe39679f3f",
|
||||
"translatorType":4,
|
||||
"label":"ACM",
|
||||
"creator":"Simon Kornblith and Michael Berkowitz",
|
||||
"target":"https?://[^/]*portal\\.acm\\.org[^/]*/(?:results\\.cfm|citation\\.cfm)",
|
||||
"minVersion":"1.0.0b3.r1",
|
||||
"maxVersion":"",
|
||||
"priority":100,
|
||||
"inRepository":true,
|
||||
"lastUpdated":"2009-05-05 07:15:00"
|
||||
"translatorID":"f3f092bf-ae09-4be6-8855-a22ddd817925",
|
||||
"label":"ACM Digital Library",
|
||||
"creator":"Simon Kornblith, Michael Berkowitz and John McCaffery",
|
||||
"target":"^https?://[^/]*portal\\.acm\\.org[^/]*/(?:results\\.cfm|citation\\.cfm)",
|
||||
"minVersion":"1.0",
|
||||
"maxVersion":"",
|
||||
"priority":100,
|
||||
"inRepository":"1",
|
||||
"translatorType":4,
|
||||
"lastUpdated":"2010-11-10 23:55:19"
|
||||
}
|
||||
|
||||
/**
 * XPath expressions used to locate elements on ACM Digital Library pages.
 * They are kept together here so that a change in the site's markup only
 * needs to be tracked in one place.
 */

/**
 * The XPath for all the search result <a> elements
 */
var searchResultX = '//td[@colspan="3"]/a[@class="medium-text" and @target="_self"]';

/**
 * The XPath for all the journal TOC <a> elements
 */
var tocResultX = '//td[@colspan="1"]/span[@style]/a[contains(@href,"citation.cfm")]';

/**
 * The XPath for the tag elements in a justified format tags list
 */
var justifiedTagX = '//div[@id="divtags"]/p/a';

/**
 * The XPath for the tag elements in an un-justified format tags list
 */
var unjustifiedTagX = '//div[@id="divtags"]/a';

/**
 * The XPath for the "more tags" link element.
 * The href contains single quotes, hence the concatenation of differently
 * quoted string literals.
 */
var moreTagsX = '//a[@href="javascript:ColdFusion.Window.show(' + "'thetags'" + ')"]';

/**
 * The XPath for the tag elements in the "more tags" popup
 */
var moreTagX = '//a/span[@class="small-text"]';

/**
 * The XPath for the title heading element - not strictly necessary, more helpful for debugging
 */
var titleX = '//div[@class="large-text"]/h1[@class="mediumb-text"]/strong';

/**
 * The XPath for the "Table of Contents" headline of a journal issue
 */
var tocX = "//div[@id='citationdetails']//h5[@class='medium-text' and contains(.,'Table of Contents')]";
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Scan to see what type of page this is
|
||||
* @param doc The XML document describing the page
|
||||
* @param url The URL of the page being scanned
|
||||
* @return What type of article this page is (multiple, journal or conference proceedings)
|
||||
*/
|
||||
function detectWeb(doc, url) {
|
||||
var nsResolver = getNsResolver(doc);
|
||||
var title = getText(titleX, doc, nsResolver);
|
||||
Zotero.debug("Title: " + title);
|
||||
|
||||
if(url.indexOf("/results.cfm") != -1) {
|
||||
var items = Zotero.Utilities.getItemArray(doc, doc, '^https?://[^/]+/citation.cfm\\?[^#]+$');
|
||||
// hack to return multiple if there are items
|
||||
for(var i in items) {
|
||||
return "multiple";
|
||||
}
|
||||
} else {
|
||||
var onClick = doc.evaluate('//a[substring(text(), 5, 7) = "EndNote"]', doc, null, XPathResult.ANY_TYPE,
|
||||
null).iterateNext().getAttribute("onClick");
|
||||
if(onClick.match("proceeding.article")) {
|
||||
Zotero.debug("Multiple items detected");
|
||||
return "multiple";
|
||||
} else if (url.indexOf("/citation.cfm") != -1) {
|
||||
Zotero.debug("Single item detected");
|
||||
return getArticleType(doc, url, nsResolver);
|
||||
/*
|
||||
var type = getArticleType(doc, url, nsResolver);
|
||||
if (type .indexOf("conferencePaper") != -1) {
|
||||
return "conferencePaper";
|
||||
} else {
|
||||
} else
|
||||
return "journalArticle";
|
||||
}
|
||||
}*/
|
||||
}
|
||||
}
|
||||
|
||||
var urls = new Array();
|
||||
|
||||
// this handles sequential loading, since first we need to process a document (to get the abstract), then
|
||||
// get the Refer metadata, then process the next document, etc.
|
||||
function getNext() {
|
||||
if(urls.length) {
|
||||
var url = urls.shift();
|
||||
Zotero.Utilities.processDocuments([url], function(doc) { scrape(doc); });
|
||||
} else {
|
||||
Zotero.done();
|
||||
}
|
||||
}
|
||||
|
||||
function scrape(doc) {
|
||||
var onClick = doc.evaluate('//a[substring(text(), 5, 7) = "EndNote"]', doc, null, XPathResult.ANY_TYPE,
|
||||
null).iterateNext().getAttribute("onClick");
|
||||
var m = onClick.match(/'([^']+)'/);
|
||||
|
||||
if (doc.evaluate('//div[@class="abstract"]/p[@class="abstract"]', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) {
|
||||
var abstract = doc.evaluate('//div[@class="abstract"]/p[@class="abstract"]', doc, null, XPathResult.ANY_TYPE, null).iterateNext();
|
||||
if (!abstract.textContent.match(/\w+/)) {
|
||||
var abstract = doc.evaluate('//div[@class="abstract"]/p[2]', doc, null, XPathResult.ANY_TYPE, null).iterateNext();
|
||||
}
|
||||
if(abstract) abstract = Zotero.Utilities.trimInternal(abstract.textContent);
|
||||
}
|
||||
var snapshot = doc.location.href;
|
||||
var attachments = new Array();
|
||||
var url;
|
||||
var typeLinks = doc.evaluate('//td[@class="smaller-text"]/a[img]', doc, null,
|
||||
XPathResult.ANY_TYPE, null);
|
||||
var typeLink;
|
||||
while(typeLink = typeLinks.iterateNext()) {
|
||||
var linkText = typeLink.textContent.toLowerCase();
|
||||
linkText = linkText.replace(/(\t|\n| )/g, "");
|
||||
if(linkText == "pdf") {
|
||||
attachments.push({title:"ACM Full Text PDF", mimeType:"application/pdf", url:typeLink.href});
|
||||
url = typeLink.href;
|
||||
} else if(linkText == "html") {
|
||||
url = snapshot = typeLink.href;
|
||||
}
|
||||
}
|
||||
|
||||
attachments.push({title:"ACM Snapshot", mimeType:"text/html", url:snapshot});
|
||||
|
||||
var keywords = new Array();
|
||||
var keywordLinks = doc.evaluate('//p[@class="keywords"]/a', doc, null,
|
||||
XPathResult.ANY_TYPE, null);
|
||||
var keywordLink;
|
||||
while(keywordLink = keywordLinks.iterateNext()) {
|
||||
keywords.push(Zotero.Utilities.trimInternal(keywordLink.textContent.toLowerCase()));
|
||||
}
|
||||
var doi = "";
|
||||
var doiElmt = doc.evaluate('/html/body/div/table/tbody/tr[4]/td/table/tbody/tr/td/table/tbody/tr[3]/td[2][@class="small-text"]/a', doc, null, XPathResult.ANY_TYPE, null).iterateNext()
|
||||
if (doiElmt){
|
||||
var match = doiElmt.textContent.match(/org\/(.*)/);
|
||||
if (match){
|
||||
doi = match[1];
|
||||
}
|
||||
}
|
||||
|
||||
Zotero.Utilities.HTTP.doGet("http://portal.acm.org/"+m[1], function(text) {
|
||||
// split() may no longer be necessary
|
||||
var m = text.split(/<\/?pre[^>]*>/ig);
|
||||
if (m[1]) {
|
||||
var text = m[1];
|
||||
}
|
||||
// unescape HTML for extended characters
|
||||
function unescapeHTML(str, p1){
|
||||
return Zotero.Utilities.unescapeHTML("&#"+p1);
|
||||
}
|
||||
text = text.replace(/\\&\\#([^;]+;)/g, unescapeHTML);
|
||||
// load Refer translator
|
||||
var translator = Zotero.loadTranslator("import");
|
||||
translator.setTranslator("881f60f2-0802-411a-9228-ce5f47b64c7d");
|
||||
translator.setString(text);
|
||||
translator.setHandler("itemDone", function(obj, item) {
|
||||
if(abstract) item.abstractNote = abstract;
|
||||
item.attachments = attachments;
|
||||
item.tags = keywords;
|
||||
item.DOI = doi;
|
||||
item.url = snapshot;
|
||||
item.complete();
|
||||
});
|
||||
translator.translate();
|
||||
|
||||
getNext();
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse the page
|
||||
* @param doc The XML document describing the page
|
||||
* @param url The URL of the page being scanned
|
||||
*/
|
||||
function doWeb(doc, url) {
|
||||
if(url.indexOf("/results.cfm") != -1) {
|
||||
var items = Zotero.Utilities.getItemArray(doc, doc, '^https?://[^/]+/citation.cfm\\?[^#]+$');
|
||||
|
||||
items = Zotero.selectItems(items);
|
||||
if(!items) return true;
|
||||
|
||||
for(var url in items) {
|
||||
urls.push(url);
|
||||
}
|
||||
|
||||
getNext();
|
||||
} else {
|
||||
scrape(doc);
|
||||
Zotero.debug("test do");
|
||||
var nsResolver = getNsResolver(doc, url);
|
||||
|
||||
//If there are multiple pages
|
||||
if (getArticleType(doc, url) == "multiple") {
|
||||
//If this is a search results page
|
||||
if (url.indexOf("results.cfm") != -1)
|
||||
scrapeMulti(doc, url, nsResolver, "search");
|
||||
else if(getText(tocX, doc, nsResolver) =="Table of Contents")
|
||||
scrapeMulti(doc, url, nsResolver, "toc");
|
||||
Zotero.wait();
|
||||
} //If this is a single page
|
||||
else
|
||||
scrape(doc, url, nsResolver);
|
||||
}
|
||||
|
||||
/**
 * Scrape search results and journal tables of contents.
 * Collects every result link on the page, lets the user pick from them and
 * then hands the chosen pages to scrape().
 * @param doc The XML document describing the page
 * @param url The URL of the page being scanned
 * @param nsResolver the namespace resolver function
 * @param type Type of result-- "search" or "toc"; any other value is treated like "search"
 * @return true when the user selected nothing, otherwise undefined
 */
function scrapeMulti(doc, url, nsResolver, type) {
	var resultPath;
	switch (type) {
		case "toc":
			Zotero.debug("Scraping journal TOC");
			resultPath = doc.evaluate(tocResultX, doc, null, XPathResult.ANY_TYPE, null);
			break;
		case "search":
			Zotero.debug("Scraping search");
			// deliberate fall-through: search pages use the default XPath
		default:
			resultPath = doc.evaluate(searchResultX, doc, null, XPathResult.ANY_TYPE, null);
	}

	// Map of result URL -> link text. The iterator must only be advanced
	// here: calling iterateNext() anywhere else (e.g. for a debug message)
	// would silently drop that result from the list.
	var node;
	var urls = {};
	while ((node = resultPath.iterateNext())) {
		urls[node.href] = node.textContent;
	}

	var items = Zotero.selectItems(urls);
	if (!items) return true;

	// Only the keys (URLs) of the selection are needed from here on
	var selected = [];
	for (var i in items) selected.push(i);

	Zotero.Utilities.processDocuments(selected, scrape, function() { Zotero.done(); });
}
|
||||
|
||||
/**
 * Scrape a single page.
 * Details that are not part of the BibTeX export (keywords, attachments,
 * abstract, item type, full journal title) are read from the page itself;
 * everything else comes from the BibTeX record, which is parsed by the
 * BibTeX import translator.
 * @param doc The XML document describing the page
 */
function scrape(doc) {
	var url = doc.location.href;
	var nsResolver = getNsResolver(doc);

	//Get all the details not scraped from the bibtex file
	var tags = scrapeKeywords(doc);
	var attachments = scrapeAttachments(doc, url);
	var abs = scrapeAbstract(doc);
	var type = getArticleType(doc, url, nsResolver);
	var journal = getText("//meta[@name='citation_journal_title']/@content", doc, nsResolver);

	//Get the bibtex reference for this document as a string
	var bibtex = scrapeBibtex(url, nsResolver);
	if (!bibtex) {
		// scrapeBibtex returns null when no record was found; there is
		// nothing to hand to the import translator in that case
		Zotero.debug("No BibTeX record could be retrieved for " + url);
		return;
	}

	//Use the bibtex translator to parse the bibtex string
	var translator = Zotero.loadTranslator("import");
	translator.setTranslator("9cb70025-a888-4a29-a210-93ec52da40d4");
	translator.setString(bibtex);

	//Set the function to run when the bibtex string has been parsed
	translator.setHandler("itemDone", function(obj, newItem) {
		//Store all details not parsed from the bibtex
		if (abs) newItem.abstractNote = abs;
		newItem.tags = tags;
		newItem.attachments = attachments;
		newItem.itemType = type;

		// The page's meta tag carries the journal title read above; when it
		// differs from the BibTeX title, keep the BibTeX one as abbreviation
		if (journal && journal != newItem.publicationTitle) {
			newItem.journalAbbreviation = newItem.publicationTitle;
			newItem.publicationTitle = journal;
		}

		// If the URL is just a DOI, clear it. Both fields are optional in
		// the parsed record, so guard before calling string methods.
		if (newItem.url && newItem.url.match(/^http:\/\/doi\.acm\.org\//)) newItem.url = "";
		if (newItem.DOI) newItem.DOI = newItem.DOI.replace(/^http:\/\/doi\.acm\.org\//, '');

		var acmid = bibtex.match(/acmid = {(\d+)}/);
		if (acmid) newItem.extra = "ACM ID: " + acmid[1];

		//Complete the parsing of the page
		newItem.complete();
	});

	//Trigger the translation
	translator.translate();
}
|
||||
|
||||
/**
 * Scrape all keywords attached to this document.
 * The "more tags" popup is tried first; when it is absent or yields no
 * keywords, the tag list embedded in the page is used instead (justified
 * layout first, then the un-justified one).
 * @param doc The XML document describing the page
 * @return an array of all keywords attached to this document (lower-cased); empty when none were found
 */
function scrapeKeywords(doc) {
	Zotero.debug("Scraping Keywords");

	//Try scraping keywords from the "more keywords" popup
	var keywords = scrapeMoreTagsKeywords(doc);
	// An empty array is truthy, so test the length as well; otherwise an
	// empty popup would hide the tags listed on the page itself
	if (keywords && keywords.length) return keywords;

	keywords = [];

	//Otherwise look for the keywords - check justified format
	var keywordPath = doc.evaluate(justifiedTagX, doc, null, XPathResult.ANY_TYPE, null);
	var keywordNode = keywordPath.iterateNext();

	//If justified format didn't work check unjustified
	if (!keywordNode) {
		keywordPath = doc.evaluate(unjustifiedTagX, doc, null, XPathResult.ANY_TYPE, null);
		keywordNode = keywordPath.iterateNext();
	}

	//Iterate through all the keywords
	while (keywordNode) {
		keywords.push(Zotero.Utilities.trimInternal(keywordNode.textContent.toLowerCase()));
		Zotero.debug("Keyword: " + keywordNode.textContent.toLowerCase());
		keywordNode = keywordPath.iterateNext();
	}

	return keywords;
}
|
||||
|
||||
/**
 * Scrape keywords from a "more tags" popup
 * @param doc The XML document describing the page
 * @return an array of all the keywords (lower-cased) found via the popup
 *         XPath, which will be used as the tags for the document, or null
 *         when the page has no "more tags" link
 */
function scrapeMoreTagsKeywords(doc) {
	//Look for the link that opens the "more tags" popup
	var morePath = doc.evaluate(moreTagsX, doc, null, XPathResult.ANY_TYPE, null);
	var moreNode = morePath ? morePath.iterateNext() : null;

	//If there is no "more tags" popup
	if (!moreNode) {
		return null;
	}

	var keywords = [];
	var keywordPath = doc.evaluate(moreTagX, doc, null, XPathResult.ANY_TYPE, null);
	var keywordNode;

	//Iterate through all the keywords
	while ((keywordNode = keywordPath.iterateNext())) {
		keywords.push(Zotero.Utilities.trimInternal(keywordNode.textContent.toLowerCase()));
		Zotero.debug("Keyword: " + keywordNode.textContent.toLowerCase());
	}

	return keywords;
}
|
||||
|
||||
/**
 * Scrape all the relevant attachments from the page.
 * Firstly adds a snapshot of the ACM page itself, then looks for any links to the full text
 * @param doc The XML document describing the page
 * @param url The URL of the page being scanned
 * @return an array of all the attachments; the page snapshot is always the first entry
 */
function scrapeAttachments(doc, url) {
	Zotero.debug("Scraping attachments");

	//Start with the snapshot of this page
	var attachments = [
		{title:"ACM Snapshot", mimeType:"text/html", url:url}
	];

	//XPath for the full text links
	var textPath = doc.evaluate('//a[@name="FullTextPdf" or @name="FullTextHtml" or @name="FullText Html"]', doc, null, XPathResult.ANY_TYPE, null);

	var textNode;
	//Iterate through all the links
	while ((textNode = textPath.iterateNext())) {
		var textURL = textNode.href;

		if (textNode.name == "FullTextPdf") {
			//The full text is a pdf
			Zotero.debug("Text PDF: " + textURL);
			attachments.push({title:"ACM Full Text PDF", mimeType:"application/pdf", url:textURL});
		} else {
			//Every other name matched by the XPath is an HTML link
			Zotero.debug("Text Page: " + textURL);
			attachments.push({title:"ACM Full Text HTML", mimeType:"text/html", url:textURL});
		}
	}

	return attachments;
}
|
||||
|
||||
/**
 * Scrape the abstract from the page
 * @param doc The XML document describing the page
 * @return whatever getText() yields for the abstract container (the first
 *         <div> whose style attribute is exactly "display: inline;")
 */
function scrapeAbstract(doc) {
	Zotero.debug("Scraping abstract");
	return getText('//div[@style="display: inline;"]', doc);
}
|
||||
|
||||
/**
 * Get the text of the bibtex format reference.
 * The record is fetched from the site's export page for the document id
 * taken from the given URL and read from the first <pre> element there.
 * @param url The URL of the page being scanned
 * @param nsResolver the namespace resolver function
 * @return the bibtex reference as a trimmed string, or null when the export
 *         page could not be retrieved or contains no <pre> element
 */
function scrapeBibtex(url, nsResolver) {
	Zotero.debug("Scraping full details from bibtex");

	//Get the ID of this document
	var id = getId(url);
	//The link of the bibtex popup
	var bibtex = "http://portal.acm.org/exportformats.cfm?id=" + id + "&expformat=bibtex";
	Zotero.debug("Bibtex: " + bibtex);

	//Get the xml document which will be loaded into the popup box
	var texDoc = Zotero.Utilities.retrieveDocument(bibtex);
	if (!texDoc) {
		// Without a document there is nothing to evaluate the XPath on
		Zotero.debug("Unable to retrieve bibtex document: " + bibtex);
		return null;
	}

	//Find the node with the bibtex text in it
	var node = texDoc.evaluate('//pre', texDoc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
	if (node == null || node.textContent == null) {
		return null;
	}

	var ref = node.textContent;
	Zotero.debug("\nref : " + ref);

	return Zotero.Utilities.trim(Zotero.Utilities.trimInternal(ref));
}
|
||||
|
||||
/**
 * Get the unique identifier of this document.
 * The identifier is the value of the "id" query parameter. When that value
 * has the form "<journal>.<document>", either part can be requested.
 * @param url The URL of the page being scanned
 * @param journal [optional] whether to get the ID of the journal the document is in (true)
 *                or of the document itself (false, the default)
 * @return a string containing the identifier of the document or journal the document is in;
 *         an empty string when the URL has no "id" parameter
 */
function getId(url, journal) {
	if (journal === undefined)
		journal = false;

	// Read the "id" parameter by name rather than by a fixed offset, so the
	// result does not depend on it being the first parameter, and stop at
	// the next parameter or a fragment
	var match = /[?&]id=([^&#]*)/.exec(url);
	if (!match)
		return "";

	var id = match[1];
	var dotIndex = id.indexOf('.');
	if (dotIndex == -1)
		return id;

	// Part before the first dot for the journal, part after it for the document
	return journal ? id.substring(0, dotIndex) : id.substring(dotIndex + 1);
}
|
||||
|
||||
/**
 * Find out what kind of document this is
 * @param doc The XML document describing the page
 * @param url The URL of the page being scanned
 * @param nsResolver the namespace resolver function
 * @return "multiple" for search result pages and pages with a table of
 *         contents headline; otherwise "conferencePaper", "magazineArticle"
 *         or "journalArticle", depending on the label found on the page
 *         ("journalArticle" is the fallback)
 */
function getArticleType(doc, url, nsResolver) {
	var toc = doc.evaluate(tocX, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
	if (url.indexOf("results.cfm") != -1 || toc) {
		Zotero.debug("Type: multiple");
		return "multiple";
	}

	//XPath for the table cell which names the kind of publication
	var text = getText('//td[@nowrap="nowrap" and @style="padding-bottom: 0px;"]', doc, nsResolver);
	Zotero.debug("Type: " + text);

	if (text.indexOf("Proceeding") != -1) {
		return "conferencePaper";
	}
	if (text.indexOf("Magazine") != -1) {
		return "magazineArticle";
	}
	return "journalArticle";
}
|
||||
|
||||
/**
 * Get the text from the first node defined by the given xPathString
 * @param pathString the XPath indicating which node to get the text from
 * @param doc The XML document describing the page
 * @param nsResolver the namespace resolver function
 * @return the text in the defined node, or an empty string if the node was
 *         not found or has no text content (a debug message is logged in
 *         that case)
 */
function getText(pathString, doc, nsResolver) {
	var path = doc.evaluate(pathString, doc, nsResolver, XPathResult.ANY_TYPE, null);
	var node = path.iterateNext();

	// "== null" covers both null and undefined
	if (node == null || node.textContent == null) {
		Zotero.debug("Unable to retrieve text for XPath: "+pathString);
		return "";
	}

	return node.textContent;
}
|
||||
|
||||
/**
 * Get a function for returning the namespace of a given document given its prefix
 * @param doc The XML document describing the page
 * @return a function that maps the prefix 'x' to the namespace URI of the
 *         document element and every other prefix to null, or null when
 *         the document element has no namespace URI
 */
function getNsResolver(doc) {
	var namespace = doc.documentElement.namespaceURI;
	if (!namespace) {
		return null;
	}

	return function(prefix) {
		return prefix == 'x' ? namespace : null;
	};
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user