From 9d6c7740c57f9bfea5320746150cf072f40f43f4 Mon Sep 17 00:00:00 2001
From: Avram Lyon
Date: Tue, 16 Nov 2010 19:17:59 +0000
Subject: [PATCH] Trans: Nearly brand-new ACM translator submitted by John McCaffery

---
 translators/ACM.js | 513 ++++++++++++++++++++++++++++++++++-----------
 1 file changed, 386 insertions(+), 127 deletions(-)

diff --git a/translators/ACM.js b/translators/ACM.js
index 598b420b2..6c003fd6b 100644
--- a/translators/ACM.js
+++ b/translators/ACM.js
@@ -1,138 +1,397 @@
 {
-	"translatorID":"e78d20f7-488-4023-831-dfe39679f3f",
-	"translatorType":4,
-	"label":"ACM",
-	"creator":"Simon Kornblith and Michael Berkowitz",
-	"target":"https?://[^/]*portal\\.acm\\.org[^/]*/(?:results\\.cfm|citation\\.cfm)",
-	"minVersion":"1.0.0b3.r1",
-	"maxVersion":"",
-	"priority":100,
-	"inRepository":true,
-	"lastUpdated":"2009-05-05 07:15:00"
+	"translatorID":"f3f092bf-ae09-4be6-8855-a22ddd817925",
+	"label":"ACM Digital Library",
+	"creator":"Simon Kornblith, Michael Berkowitz and John McCaffery",
+	"target":"^https?://[^/]*portal\\.acm\\.org[^/]*/(?:results\\.cfm|citation\\.cfm)",
+	"minVersion":"1.0",
+	"maxVersion":"",
+	"priority":100,
+	"inRepository":"1",
+	"translatorType":4,
+	"lastUpdated":"2010-11-10 23:55:19"
 }
 
+/**
+ * The XPath for all the search result elements
+ */
+var searchResultX = '//td[@colspan="3"]/a[@class="medium-text" and @target="_self"]';
+/**
+ * The XPath for all the journal TOC elements
+ */
+var tocResultX = '//td[@colspan="1"]/span[@style]/a[contains(@href,"citation.cfm")]';
+
+/**
+ * The XPath for the tag elements in a justified format tags list
+ */
+var justifiedTagX = '//div[@id="divtags"]/p/a';
+/**
+ * The XPath for the tag elements in an un-justified format tags list
+ */
+var unjustifiedTagX = '//div[@id="divtags"]/a';
+/**
+ * The XPath for the "more tags" link element
+ */
+var moreTagsX = "//a[@href=\"javascript:ColdFusion.Window.show('thetags')\"]";
+/**
+ * The XPath for the tag elements in the "more tags" popup
+ */
+var moreTagX = '//a/span[@class="small-text"]';
+/**
+ * The XPath for the title heading element - not strictly necessary, more helpful for debugging
+ */
+var titleX = '//div[@class="large-text"]/h1[@class="mediumb-text"]/strong';
+/**
+ * The XPath for the "Table of Contents" headline of a journal issue
+ */
+var tocX = "//div[@id='citationdetails']//h5[@class='medium-text' and contains(.,'Table of Contents')]";
+
+/**
+ * Scan to see what type of page this is
+ * @param doc The XML document describing the page
+ * @param url The URL of the page being scanned
+ * @return "multiple" for search results and journal tables of contents, otherwise the
+ *         item type reported by getArticleType(); undefined for any other URL
+ */
 function detectWeb(doc, url) {
+	var nsResolver = getNsResolver(doc);
+	var title = getText(titleX, doc, nsResolver);
+	Zotero.debug("Title: " + title);
+
 	if(url.indexOf("/results.cfm") != -1) {
-		var items = Zotero.Utilities.getItemArray(doc, doc, '^https?://[^/]+/citation.cfm\\?[^#]+$');
-		// hack to return multiple if there are items
-		for(var i in items) {
-			return "multiple";
-		}
-	} else {
-		var onClick = doc.evaluate('//a[substring(text(), 5, 7) = "EndNote"]', doc, null, XPathResult.ANY_TYPE,
-			null).iterateNext().getAttribute("onClick");
-		if(onClick.match("proceeding.article")) {
-			return "conferencePaper";
-		} else {
-			return "journalArticle";
-		}
+		Zotero.debug("Multiple items detected");
+		return "multiple";
+	} else if (url.indexOf("/citation.cfm") != -1) {
+		Zotero.debug("Single item detected");
+		return getArticleType(doc, url, nsResolver);
 	}
 }
 
-var urls = new Array();
-
-// this handles sequential loading, since first we need to process a document (to get the abstract), then
-// get the Refer metadata, then process the next document, etc.
-function getNext() {
-	if(urls.length) {
-		var url = urls.shift();
-		Zotero.Utilities.processDocuments([url], function(doc) { scrape(doc); });
-	} else {
-		Zotero.done();
-	}
-}
-
-function scrape(doc) {
-	var onClick = doc.evaluate('//a[substring(text(), 5, 7) = "EndNote"]', doc, null, XPathResult.ANY_TYPE,
-		null).iterateNext().getAttribute("onClick");
-	var m = onClick.match(/'([^']+)'/);
-
-	if (doc.evaluate('//div[@class="abstract"]/p[@class="abstract"]', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) {
-		var abstract = doc.evaluate('//div[@class="abstract"]/p[@class="abstract"]', doc, null, XPathResult.ANY_TYPE, null).iterateNext();
-		if (!abstract.textContent.match(/\w+/)) {
-			var abstract = doc.evaluate('//div[@class="abstract"]/p[2]', doc, null, XPathResult.ANY_TYPE, null).iterateNext();
-		}
-		if(abstract) abstract = Zotero.Utilities.trimInternal(abstract.textContent);
-	}
-	var snapshot = doc.location.href;
-	var attachments = new Array();
-	var url;
-	var typeLinks = doc.evaluate('//td[@class="smaller-text"]/a[img]', doc, null,
-		XPathResult.ANY_TYPE, null);
-	var typeLink;
-	while(typeLink = typeLinks.iterateNext()) {
-		var linkText = typeLink.textContent.toLowerCase();
-		linkText = linkText.replace(/(\t|\n| )/g, "");
-		if(linkText == "pdf") {
-			attachments.push({title:"ACM Full Text PDF", mimeType:"application/pdf", url:typeLink.href});
-			url = typeLink.href;
-		} else if(linkText == "html") {
-			url = snapshot = typeLink.href;
-		}
-	}
-
-	attachments.push({title:"ACM Snapshot", mimeType:"text/html", url:snapshot});
-
-	var keywords = new Array();
-	var keywordLinks = doc.evaluate('//p[@class="keywords"]/a', doc, null,
-		XPathResult.ANY_TYPE, null);
-	var keywordLink;
-	while(keywordLink = keywordLinks.iterateNext()) {
-		keywords.push(Zotero.Utilities.trimInternal(keywordLink.textContent.toLowerCase()));
-	}
-	var doi = "";
-	var doiElmt = doc.evaluate('/html/body/div/table/tbody/tr[4]/td/table/tbody/tr/td/table/tbody/tr[3]/td[2][@class="small-text"]/a', doc, null, XPathResult.ANY_TYPE, null).iterateNext()
-	if (doiElmt){
-		var match = doiElmt.textContent.match(/org\/(.*)/);
-		if (match){
-			doi = match[1];
-		}
-	}
-
-	Zotero.Utilities.HTTP.doGet("http://portal.acm.org/"+m[1], function(text) {
-		// split() may no longer be necessary
-		var m = text.split(/<\/?pre[^>]*>/ig);
-		if (m[1]) {
-			var text = m[1];
-		}
-		// unescape HTML for extended characters
-		function unescapeHTML(str, p1){
-			return Zotero.Utilities.unescapeHTML("&#"+p1);
-		}
-		text = text.replace(/\\&\\#([^;]+;)/g, unescapeHTML);
-		// load Refer translator
-		var translator = Zotero.loadTranslator("import");
-		translator.setTranslator("881f60f2-0802-411a-9228-ce5f47b64c7d");
-		translator.setString(text);
-		translator.setHandler("itemDone", function(obj, item) {
-			if(abstract) item.abstractNote = abstract;
-			item.attachments = attachments;
-			item.tags = keywords;
-			item.DOI = doi;
-			item.url = snapshot;
-			item.complete();
-		});
-		translator.translate();
-
-		getNext();
-	});
-}
-
+/**
+ * Parse the page
+ * @param doc The XML document describing the page
+ * @param url The URL of the page being scanned
+ */
 function doWeb(doc, url) {
-	if(url.indexOf("/results.cfm") != -1) {
-		var items = Zotero.Utilities.getItemArray(doc, doc, '^https?://[^/]+/citation.cfm\\?[^#]+$');
-
-		items = Zotero.selectItems(items);
-		if(!items) return true;
-
-		for(var url in items) {
-			urls.push(url);
-		}
-
-		getNext();
-	} else {
-		scrape(doc);
+	var nsResolver = getNsResolver(doc);
+
+	if (getArticleType(doc, url, nsResolver) == "multiple") {
+		//getArticleType() only answers "multiple" for a search results URL or for a page
+		//with a "Table of Contents" heading, so whatever is not a search is a journal TOC
+		var type = (url.indexOf("results.cfm") != -1) ? "search" : "toc";
+		scrapeMulti(doc, url, nsResolver, type);
+	} else {
+		//If this is a single page
+		scrape(doc);
+	}
+}
+
+/**
+ * Scrape search results and journal tables of contents
+ * Lets the user pick from the listed articles and scrapes every selected page.
+ * @param doc The XML document describing the page
+ * @param url The URL of the page being scanned
+ * @param nsResolver the namespace resolver function
+ * @param type Type of result-- "search" or "toc"
+ * @return true when nothing was selected, otherwise nothing
+ */
+function scrapeMulti(doc, url, nsResolver, type) {
+	var resultX;
+	if (type == "toc") {
+		Zotero.debug("Scraping journal TOC");
+		resultX = tocResultX;
+	} else {
+		Zotero.debug("Scraping search");
+		resultX = searchResultX;
+	}
+	var resultPath = doc.evaluate(resultX, doc, nsResolver, XPathResult.ANY_TYPE, null);
+
+	//Map the URL of every result to its title. The iterator is advanced only by this
+	//loop: reading a node from it beforehand would drop the first result.
+	var node;
+	var urls = {};
+	while(node = resultPath.iterateNext()) {
+		urls[node.href] = node.textContent;
 	}
 
-	Zotero.wait();
-}
\ No newline at end of file
+	var items = Zotero.selectItems(urls);
+	//Nothing to do: return before Zotero.wait(), as no Zotero.done() would follow
+	if(!items) return true;
+
+	urls = [];
+	for (var i in items) urls.push(i);
+
+	Zotero.Utilities.processDocuments(urls, scrape, function(){Zotero.done()});
+	//Keep the translation running until the callback above calls Zotero.done()
+	Zotero.wait();
+}
+
+/**
+ * Scrape a single page
+ * @param doc The XML document describing the page
+ */
+function scrape(doc) {
+	var url = doc.location.href;
+	var nsResolver = getNsResolver(doc);
+
+	//Get all the details not scraped from the bibtex file
+	var tags = scrapeKeywords(doc);
+	var attachments = scrapeAttachments(doc, url);
+	var abs = scrapeAbstract(doc);
+	var type = getArticleType(doc, url, nsResolver);
+	var journal = getText("//meta[@name='citation_journal_title']/@content", doc, nsResolver);
+	//Get the bibtex reference for this document as a string
+	var bibtex = scrapeBibtex(url, nsResolver);
+	//scrapeBibtex() returns null when the export page holds no reference
+	if (!bibtex) {
+		Zotero.debug("No bibtex found for " + url);
+		return;
+	}
+
+	//Use the bibtex translator to parse the bibtex string
+	var translator = Zotero.loadTranslator("import");
+	translator.setTranslator("9cb70025-a888-4a29-a210-93ec52da40d4");
+	translator.setString(bibtex);
+	//Set the function to run when the bibtex string has been parsed
+	translator.setHandler("itemDone", function(obj, newItem) {
+		//Store all details not parsed from the bibtex
+		if(abs) newItem.abstractNote = abs;
+		newItem.tags = tags;
+		newItem.attachments = attachments;
+		newItem.itemType = type;
+		if (journal && journal != newItem.publicationTitle) {
+			newItem.journalAbbreviation = newItem.publicationTitle;
+			newItem.publicationTitle = journal;
+		}
+		//If the URL is just a DOI, clear it. The url and DOI fields are only
+		//touched when the bibtex record provided them.
+		if (newItem.url && newItem.url.match(/^http:\/\/doi\.acm\.org\//)) newItem.url = "";
+		if (newItem.DOI) newItem.DOI = newItem.DOI.replace(/^http:\/\/doi\.acm\.org\//, '');
+		var acmid = bibtex.match(/acmid = \{(\d+)\}/);
+		if(acmid) newItem.extra = "ACM ID: "+ acmid[1];
+		//Complete the parsing of the page
+		newItem.complete();
+	});
+
+	//Trigger the translation
+	translator.translate();
+}
+
+/**
+ * Scrape all keywords attached to this document
+ * @param doc The XML document describing the page
+ * @return an array of all keywords attached to this document
+ */
+function scrapeKeywords(doc) {
+	Zotero.debug("Scraping Keywords");
+	//Try scraping keywords from the "more tags" popup
+	var keywords = scrapeMoreTagsKeywords(doc);
+
+	if (keywords) return keywords;
+
+	keywords = [];
+
+	//Otherwise look for the keywords - check justified format
+	var keywordPath = doc.evaluate(justifiedTagX, doc, null, XPathResult.ANY_TYPE, null);
+	var keywordNode = keywordPath.iterateNext();
+	//If justified format didn't work check unjustified
+	if (!keywordNode) {
+		keywordPath = doc.evaluate(unjustifiedTagX, doc, null, XPathResult.ANY_TYPE, null);
+		keywordNode = keywordPath.iterateNext();
+	}
+	//Iterate through all the keywords
+	while(keywordNode) {
+		keywords.push(Zotero.Utilities.trimInternal(keywordNode.textContent.toLowerCase()));
+		Zotero.debug("Keyword: " + keywordNode.textContent.toLowerCase());
+		keywordNode = keywordPath.iterateNext();
+	}
+
+	return keywords;
+}
+
+/**
+ * Scrape keywords from a "more tags" popup
+ * @param doc The XML document describing the page
+ * @return an array of all the keywords attached to the page which will be used as the tags for the document, or null if the page has no "more tags" link
+ */
+function scrapeMoreTagsKeywords(doc) {
+	var keywords = [];
+
+	//Look for the link that opens the "more tags" popup
+	var morePath = doc.evaluate(moreTagsX, doc, null, XPathResult.ANY_TYPE, null);
+	var moreNode = morePath ? morePath.iterateNext() : null;
+	//If there is no "more tags" popup
+	if (!moreNode)
+		return null;
+
+	var keywordPath = doc.evaluate(moreTagX, doc, null, XPathResult.ANY_TYPE, null);
+
+	var keywordNode;
+	//Iterate through all the keywords
+	while(keywordNode = keywordPath.iterateNext()) {
+		keywords.push(Zotero.Utilities.trimInternal(keywordNode.textContent.toLowerCase()));
+		Zotero.debug("Keyword: " + keywordNode.textContent.toLowerCase());
+	}
+	return keywords;
+}
+
+/**
+ * Scrape all the relevant attachments from the page.
+ * Firstly grabs a snapshot of the ACM page then looks for any links to the full text
+ * @param doc The XML document describing the page
+ * @param url The URL of the page being scanned
+ * @return an array of all the attachments
+ */
+function scrapeAttachments(doc, url) {
+	Zotero.debug("Scraping attachments");
+	var attachments = [];
+
+	//Add the snapshot of this page
+	attachments.push({title:"ACM Snapshot", mimeType:"text/html", url:url});
+
+	//XPath for the full text links
+	var textPath = doc.evaluate('//a[@name="FullTextPdf" or @name="FullTextHtml" or @name="FullText Html"]', doc, null, XPathResult.ANY_TYPE, null);
+
+	var textNode;
+	//Iterate through all the links
+	while (textNode = textPath.iterateNext()) {
+		var textURL = textNode.href;
+
+		//If the full text is a pdf
+		if (textNode.name == "FullTextPdf") {
+			Zotero.debug("Text PDF: " + textURL);
+			attachments.push({title:"ACM Full Text PDF", mimeType:"application/pdf", url:textURL});
+		} else { //Otherwise the text is an HTML link
+			Zotero.debug("Text Page: " + textURL);
+			attachments.push({title:"ACM Full Text HTML", mimeType:"text/html", url:textURL});
+		}
+	}
+
+	return attachments;
+}
+
+/**
+ * Scrape the abstract from the page
+ * @param doc The XML document describing the page
+ * @return a string with the text of the abstract, or "" if none was found
+ */
+function scrapeAbstract(doc) {
+	Zotero.debug("Scraping abstract");
+	return getText('//div[@style="display: inline;"]', doc);
+}
+
+/**
+ * Get the text of the bibtex format reference
+ * @param url The URL of the page being scanned
+ * @param nsResolver the namespace resolver function
+ * @return the bibtex reference as a trimmed string, or null if none was found
+ */
+function scrapeBibtex(url, nsResolver) {
+	Zotero.debug("Scraping full details from bibtex");
+	//Get the ID of this document
+	var id = getId(url);
+	//The link of the bibtex popup
+	var bibtex = "http://portal.acm.org/exportformats.cfm?id=" + id + "&expformat=bibtex";
+
+	Zotero.debug("Bibtex: " + bibtex);
+
+	//Get the xml document which will be loaded into the popup box
+	var texDoc = Zotero.Utilities.retrieveDocument(bibtex);
+	//Find the node with the bibtex text in it
+	var path = texDoc.evaluate('//pre', texDoc, nsResolver, XPathResult.ANY_TYPE, null);
+	var node = path.iterateNext();
+
+	if (node == null || node.textContent == null) return null;
+
+	var ref = node.textContent;
+	Zotero.debug("\nref : " + ref);
+	ref = Zotero.Utilities.trimInternal(ref);
+	ref = Zotero.Utilities.trim(ref);
+
+	return ref;
+}
+
+/**
+ * Get the unique identifier of this document
+ * The "id" query parameter has the form "<journal>.<document>" or just "<document>".
+ * @param url The URL of the page being scanned
+ * @param journal [optional] whether to get the ID of the journal the document is in or of the document itself
+ * @return a string containing the identifier of the document or journal the document is in
+ */
+function getId(url, journal) {
+	//Look the parameter up by name so that its position in the query string and a
+	//trailing "#fragment" do not matter
+	var match = url.match(/[?&]id=([^&#]*)/);
+	var id;
+	if (match) {
+		id = match[1];
+	} else {
+		//Fall back to the text following ".cfm?id=" (8 characters)
+		id = url.substr(url.indexOf(".cfm") + 8).split(/[&#]/)[0];
+	}
+
+	var dotIndex = id.indexOf('.');
+	if (dotIndex == -1) return id;
+
+	//The journal ID precedes the dot, the document ID follows it
+	return journal ? id.substring(0, dotIndex) : id.substring(dotIndex + 1);
+}
+
+/**
+ * Find out what kind of document this is
+ * @param doc The XML document describing the page
+ * @param url The URL of the page being scanned
+ * @param nsResolver the namespace resolver function
+ * @return a string with either "multiple", "conferencePaper", "magazineArticle" or "journalArticle" in it, depending on the type of document
+ */
+function getArticleType(doc, url, nsResolver) {
+	var toc = doc.evaluate(tocX, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+	if (url.indexOf("results.cfm") != -1 || toc) {
+		Zotero.debug("Type: multiple");
+		return "multiple";
+	}
+
+	//XPath for the table cell which has either "Journal" or "Proceeding" in it
+	var text = getText('//td[@nowrap="nowrap" and @style="padding-bottom: 0px;"]', doc, nsResolver);
+
+	Zotero.debug("Type: " + text);
+	if (text.indexOf("Proceeding") != -1)
+		return "conferencePaper";
+	else if (text.indexOf("Magazine") != -1)
+		return "magazineArticle";
+	else
+		return "journalArticle";
+}
+
+/**
+ * Get the text from the first node defined by the given xPathString
+ * @param pathString the XPath indicating which node to get the text from
+ * @param doc The XML document describing the page
+ * @param nsResolver the namespace resolver function
+ * @return the text in the defined node or "" if the node was not found or if there was no text content
+ */
+function getText(pathString, doc, nsResolver) {
+	var path = doc.evaluate(pathString, doc, nsResolver, XPathResult.ANY_TYPE, null);
+	var node = path.iterateNext();
+
+	if (node == null || node.textContent == null) {
+		Zotero.debug("Unable to retrieve text for XPath: "+pathString);
+		return "";
+	}
+
+	return node.textContent;
+}
+
+/**
+ * Get a function for returning the namespace of a given document given its prefix
+ * @param doc The XML document describing the page
+ * @return a resolver mapping the prefix 'x' to the namespace of the document, or null if the document has no namespace
+ */
+function getNsResolver(doc) {
+	var namespace = doc.documentElement.namespaceURI;
+	var nsResolver = namespace ? function(prefix) {
+		if (prefix == 'x') return namespace;
+		else return null;
+	} : null;
+
+	return nsResolver;
+}