diff --git a/translators/ACM.js b/translators/ACM.js
index 598b420b2..6c003fd6b 100644
--- a/translators/ACM.js
+++ b/translators/ACM.js
@@ -1,138 +1,408 @@
{
- "translatorID":"e78d20f7-488-4023-831-dfe39679f3f",
- "translatorType":4,
- "label":"ACM",
- "creator":"Simon Kornblith and Michael Berkowitz",
- "target":"https?://[^/]*portal\\.acm\\.org[^/]*/(?:results\\.cfm|citation\\.cfm)",
- "minVersion":"1.0.0b3.r1",
- "maxVersion":"",
- "priority":100,
- "inRepository":true,
- "lastUpdated":"2009-05-05 07:15:00"
+ "translatorID":"f3f092bf-ae09-4be6-8855-a22ddd817925",
+ "label":"ACM Digital Library",
+ "creator":"Simon Kornblith, Michael Berkowitz and John McCaffery",
+ "target":"^https?://[^/]*portal\\.acm\\.org[^/]*/(?:results\\.cfm|citation\\.cfm)",
+ "minVersion":"1.0",
+ "maxVersion":"",
+ "priority":100,
+ "inRepository":"1",
+ "translatorType":4,
+ "lastUpdated":"2010-11-10 23:55:19"
}
+/**
+ * XPath selecting the links to the individual items on a search results page
+ */
+var searchResultX = '//td[@colspan="3"]/a[@class="medium-text" and @target="_self"]';
+/**
+ * XPath selecting the links to the individual items in a journal table of contents
+ */
+var tocResultX = '//td[@colspan="1"]/span[@style]/a[contains(@href,"citation.cfm")]';
+
+/**
+ * XPath selecting the tag links when they are wrapped in paragraphs (justified format)
+ */
+var justifiedTagX = '//div[@id="divtags"]/p/a';
+/**
+ * XPath selecting the tag links when they sit directly in the tags div (un-justified format)
+ */
+var unjustifiedTagX = '//div[@id="divtags"]/a';
+/**
+ * XPath selecting the "more tags" link, recognised by its javascript href for 'thetags'
+ */
+var moreTagsX = "//a[@href=\"javascript:ColdFusion.Window.show('thetags')\"]";
+/**
+ * XPath selecting the tag elements in the "more tags" popup
+ */
+var moreTagX = '//a/span[@class="small-text"]';
+/**
+ * XPath selecting the title heading element - not strictly necessary, mostly a debugging aid
+ */
+var titleX = '//div[@class="large-text"]/h1[@class="mediumb-text"]/strong';
+/**
+ * XPath selecting the "Table of Contents" headline of a journal issue page
+ */
+var tocX = "//div[@id='citationdetails']//h5[@class='medium-text' and contains(.,'Table of Contents')]";
+
+
+
+/**
+ * Scan to see what type of page this is
+ * @param doc The XML document describing the page
+ * @param url The URL of the page being scanned
+ * @return "multiple" for results.cfm URLs, the result of getArticleType() for citation.cfm URLs, nothing for any other URL
+ */
function detectWeb(doc, url) {
+ var nsResolver = getNsResolver(doc);
+ var title = getText(titleX, doc, nsResolver); // only used by the debug line below
+ Zotero.debug("Title: " + title);
+ 
 if(url.indexOf("/results.cfm") != -1) {
- var items = Zotero.Utilities.getItemArray(doc, doc, '^https?://[^/]+/citation.cfm\\?[^#]+$');
- // hack to return multiple if there are items
- for(var i in items) {
- return "multiple";
- }
- } else {
- var onClick = doc.evaluate('//a[substring(text(), 5, 7) = "EndNote"]', doc, null, XPathResult.ANY_TYPE,
- null).iterateNext().getAttribute("onClick");
- if(onClick.match("proceeding.article")) {
+ Zotero.debug("Multiple items detected");
+ return "multiple";
+ } else if (url.indexOf("/citation.cfm") != -1) {
+ Zotero.debug("Single item detected");
+ return getArticleType(doc, url, nsResolver); // may itself report "multiple" (journal table of contents)
+ /* Earlier draft of the type mapping, left commented out (it follows the return above):
+ var type = getArticleType(doc, url, nsResolver);
+ if (type .indexOf("conferencePaper") != -1) {
 return "conferencePaper";
- } else {
+ } else
 return "journalArticle";
- }
+ }*/
 }
}
-var urls = new Array();
-
-// this handles sequential loading, since first we need to process a document (to get the abstract), then
-// get the Refer metadata, then process the next document, etc.
-function getNext() {
- if(urls.length) {
- var url = urls.shift();
- Zotero.Utilities.processDocuments([url], function(doc) { scrape(doc); });
- } else {
- Zotero.done();
- }
-}
-
-function scrape(doc) {
- var onClick = doc.evaluate('//a[substring(text(), 5, 7) = "EndNote"]', doc, null, XPathResult.ANY_TYPE,
- null).iterateNext().getAttribute("onClick");
- var m = onClick.match(/'([^']+)'/);
-
- if (doc.evaluate('//div[@class="abstract"]/p[@class="abstract"]', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) {
- var abstract = doc.evaluate('//div[@class="abstract"]/p[@class="abstract"]', doc, null, XPathResult.ANY_TYPE, null).iterateNext();
- if (!abstract.textContent.match(/\w+/)) {
- var abstract = doc.evaluate('//div[@class="abstract"]/p[2]', doc, null, XPathResult.ANY_TYPE, null).iterateNext();
- }
- if(abstract) abstract = Zotero.Utilities.trimInternal(abstract.textContent);
- }
- var snapshot = doc.location.href;
- var attachments = new Array();
- var url;
- var typeLinks = doc.evaluate('//td[@class="smaller-text"]/a[img]', doc, null,
- XPathResult.ANY_TYPE, null);
- var typeLink;
- while(typeLink = typeLinks.iterateNext()) {
- var linkText = typeLink.textContent.toLowerCase();
- linkText = linkText.replace(/(\t|\n| )/g, "");
- if(linkText == "pdf") {
- attachments.push({title:"ACM Full Text PDF", mimeType:"application/pdf", url:typeLink.href});
- url = typeLink.href;
- } else if(linkText == "html") {
- url = snapshot = typeLink.href;
- }
- }
-
- attachments.push({title:"ACM Snapshot", mimeType:"text/html", url:snapshot});
-
- var keywords = new Array();
- var keywordLinks = doc.evaluate('//p[@class="keywords"]/a', doc, null,
- XPathResult.ANY_TYPE, null);
- var keywordLink;
- while(keywordLink = keywordLinks.iterateNext()) {
- keywords.push(Zotero.Utilities.trimInternal(keywordLink.textContent.toLowerCase()));
- }
- var doi = "";
- var doiElmt = doc.evaluate('/html/body/div/table/tbody/tr[4]/td/table/tbody/tr/td/table/tbody/tr[3]/td[2][@class="small-text"]/a', doc, null, XPathResult.ANY_TYPE, null).iterateNext()
- if (doiElmt){
- var match = doiElmt.textContent.match(/org\/(.*)/);
- if (match){
- doi = match[1];
- }
- }
-
- Zotero.Utilities.HTTP.doGet("http://portal.acm.org/"+m[1], function(text) {
- // split() may no longer be necessary
- var m = text.split(/<\/?pre[^>]*>/ig);
- if (m[1]) {
- var text = m[1];
- }
- // unescape HTML for extended characters
- function unescapeHTML(str, p1){
- return Zotero.Utilities.unescapeHTML(""+p1);
- }
- text = text.replace(/\\&\\#([^;]+;)/g, unescapeHTML);
- // load Refer translator
- var translator = Zotero.loadTranslator("import");
- translator.setTranslator("881f60f2-0802-411a-9228-ce5f47b64c7d");
- translator.setString(text);
- translator.setHandler("itemDone", function(obj, item) {
- if(abstract) item.abstractNote = abstract;
- item.attachments = attachments;
- item.tags = keywords;
- item.DOI = doi;
- item.url = snapshot;
- item.complete();
- });
- translator.translate();
-
- getNext();
- });
-}
-
+/**
+ * Parse the page: either a listing (search results / journal table of contents) or a single item
+ * @param doc The XML document describing the page
+ * @param url The URL of the page being scanned
+ */
function doWeb(doc, url) {
- if(url.indexOf("/results.cfm") != -1) {
- var items = Zotero.Utilities.getItemArray(doc, doc, '^https?://[^/]+/citation.cfm\\?[^#]+$');
-
- items = Zotero.selectItems(items);
- if(!items) return true;
-
- for(var url in items) {
- urls.push(url);
- }
-
- getNext();
- } else {
- scrape(doc);
+	var nsResolver = getNsResolver(doc);
+
+	//A single item page is scraped right here, so there is nothing to wait for
+	if (getArticleType(doc, url, nsResolver) != "multiple") {
+		scrape(doc);
+		return;
+	}
+
+	//getArticleType() only reports "multiple" for results.cfm URLs or when the TOC
+	//headline was found, so a listing that is not a search page is a journal TOC
+	var type = url.indexOf("results.cfm") != -1 ? "search" : "toc";
+	//scrapeMulti() returns true when nothing was selected; only wait for
+	//Zotero.done() when documents were actually handed over for processing
+	if (scrapeMulti(doc, url, nsResolver, type) !== true) Zotero.wait();
+}
+
+/**
+ * Scrape search results and journal tables of contents
+ * @param doc The XML document describing the page
+ * @param url The URL of the page being scanned
+ * @param nsResolver the namespace resolver function (the XPath lookups here pass null instead)
+ * @param type Type of result-- "search" or "toc"; any other value uses the search XPath
+ * @return true when the user selected nothing, otherwise nothing
+ */
+function scrapeMulti(doc, url, nsResolver, type) {
+	var resultX;
+	if (type == "toc") {
+		Zotero.debug("Scraping journal TOC");
+		resultX = tocResultX;
+	} else {
+		if (type == "search") Zotero.debug("Scraping search");
+		resultX = searchResultX;
+	}
+	var resultPath = doc.evaluate(resultX, doc, null, XPathResult.ANY_TYPE, null);
+
+	//Collect every result link, keyed by its URL. The iterator must not be advanced
+	//anywhere else (e.g. for debug output), or the first result would be skipped.
+	var node;
+	var urls = {};
+	while(node = resultPath.iterateNext()) {
+		urls[node.href] = node.textContent;
 }
- Zotero.wait();
-}
\ No newline at end of file
+	//Let the user pick which of the results to save
+	var items = Zotero.selectItems(urls);
+	if(!items) return true;
+
+	urls = [];
+	for (var i in items) urls.push(i);
+
+	//scrape() handles each selected page; the last callback signals completion
+	Zotero.Utilities.processDocuments(urls, scrape, function(){Zotero.done()});
+}
+
+/**
+ * Scrape a single page
+ * @param doc The XML document describing the page
+ */
+function scrape(doc) {
+	var url = doc.location.href;
+	var nsResolver = getNsResolver(doc);
+
+	//Get all the details not scraped from the bibtex file
+	var tags = scrapeKeywords(doc);
+	var attachments = scrapeAttachments(doc, url);
+	var abs = scrapeAbstract(doc);
+	var type = getArticleType(doc, url, nsResolver);
+	var journal = getText("//meta[@name='citation_journal_title']/@content", doc, nsResolver);
+	//Get the bibtex reference for this document as a string
+	var bibtex = scrapeBibtex(url, nsResolver);
+	//scrapeBibtex() returns null when it finds no bibtex text; nothing can be imported then
+	if (!bibtex) {
+		Zotero.debug("No bibtex reference found for " + url);
+		return;
+	}
+
+	//Use the bibtex translator to parse the bibtex string
+	var translator = Zotero.loadTranslator("import");
+	translator.setTranslator("9cb70025-a888-4a29-a210-93ec52da40d4");
+	translator.setString(bibtex);
+	//Set the function to run when the bibtex string has been parsed
+	translator.setHandler("itemDone", function(obj, newItem) {
+		//Store all details not parsed from the bibtex
+		if(abs) newItem.abstractNote = abs;
+		newItem.tags = tags;
+		newItem.attachments = attachments;
+		newItem.itemType = type;
+		if (journal && journal != newItem.publicationTitle) {
+			newItem.journalAbbreviation = newItem.publicationTitle;
+			newItem.publicationTitle = journal;
+		}
+		//url and DOI may be missing from the parsed entry, so guard both. A URL that is just a DOI is cleared.
+		if (newItem.url && newItem.url.match(/^http:\/\/doi\.acm\.org\//)) newItem.url = "";
+		if (newItem.DOI) newItem.DOI = newItem.DOI.replace(/^http:\/\/doi\.acm\.org\//, '');
+		var acmid = bibtex.match(/acmid = {(\d+)}/);
+		if(acmid) newItem.extra = "ACM ID: "+ acmid[1];
+		newItem.complete();
+	});
+	//Trigger the translation
+	translator.translate();
+}
+
+/**
+ * Collect the keywords (tags) attached to this document
+ * @param doc The XML document describing the page
+ * @return an array with the lower-cased keywords; empty when no tag nodes were found
+ */
+function scrapeKeywords(doc) {
+	Zotero.debug("Scraping Keywords");
+
+	//Whatever the "more tags" popup scraper returns is used as-is, unless it
+	//returned null because the page has no "more tags" link
+	var popupTags = scrapeMoreTagsKeywords(doc);
+	if (popupTags) return popupTags;
+
+	//Otherwise read the inline list: justified format first, then un-justified
+	var iterator = doc.evaluate(justifiedTagX, doc, null, XPathResult.ANY_TYPE, null);
+	var tagNode = iterator.iterateNext();
+	if (!tagNode) {
+		iterator = doc.evaluate(unjustifiedTagX, doc, null, XPathResult.ANY_TYPE, null);
+		tagNode = iterator.iterateNext();
+	}
+
+	//Walk the nodes of whichever iterator is current at this point
+	var found = [];
+	for (; tagNode; tagNode = iterator.iterateNext()) {
+		var lowered = tagNode.textContent.toLowerCase();
+		found.push(Zotero.Utilities.trimInternal(lowered));
+		Zotero.debug("Keyword: " + lowered);
+	}
+
+	return found;
+}
+
+/**
+ * Read the keywords listed in the "more tags" popup, if the page has one
+ * @param doc The XML document describing the page
+ * @return an array of lower-cased keywords, or null when there is no "more tags" link
+ */
+function scrapeMoreTagsKeywords(doc) {
+	//Without the "more tags" link there is nothing for this function to read
+	var linkIterator = doc.evaluate(moreTagsX, doc, null, XPathResult.ANY_TYPE, null);
+	if (!linkIterator || !linkIterator.iterateNext()) {
+		return null;
+	}
+
+	//Gather the text of every node matched by moreTagX
+	var tagIterator = doc.evaluate(moreTagX, doc, null, XPathResult.ANY_TYPE, null);
+	var collected = [];
+	var tagNode = tagIterator.iterateNext();
+	while (tagNode) {
+		var lowered = tagNode.textContent.toLowerCase();
+		collected.push(Zotero.Utilities.trimInternal(lowered));
+		Zotero.debug("Keyword: " + lowered);
+		tagNode = tagIterator.iterateNext();
+	}
+
+	return collected;
+}
+
+/**
+ * Build the attachment list for the page.
+ * The list starts with a snapshot of the ACM page itself, followed by one entry
+ * for every full text link found on the page.
+ * @param doc The XML document describing the page
+ * @param url The URL of the page being scanned
+ * @return an array of all the attachments
+ */
+function scrapeAttachments(doc, url) {
+	Zotero.debug("Scraping attachments");
+
+	//The snapshot of this page always comes first
+	var found = [{title:"ACM Snapshot", mimeType:"text/html", url:url}];
+
+	//Full text links are the anchors whose name attribute is one of the
+	//three values listed in this XPath
+	var linkX = '//a[@name="FullTextPdf" or @name="FullTextHtml" or @name="FullText Html"]';
+	var links = doc.evaluate(linkX, doc, null, XPathResult.ANY_TYPE, null);
+
+	//Add one attachment per matched link
+	for (var link = links.iterateNext(); link; link = links.iterateNext()) {
+		var target = link.href;
+		//Only the PDF anchor name is checked; every other match is filed as HTML
+		var isPdf = link.name == "FullTextPdf";
+		Zotero.debug((isPdf ? "Text PDF: " : "Text Page: ") + target);
+		found.push({
+			title: isPdf ? "ACM Full Text PDF" : "ACM Full Text HTML",
+			mimeType: isPdf ? "application/pdf" : "text/html",
+			url: target
+		});
+	}
+
+	return found;
+}
+
+/**
+ * Scrape the abstract from the page
+ * @param doc The XML document describing the page
+ * @return the text of the first div styled "display: inline;", or "" when getText() finds none
+ */
+function scrapeAbstract(doc) {
+	Zotero.debug("Scraping abstract");
+	//The lookup is delegated to getText(); no namespace resolver is handed
+	//over, so getText() passes undefined on to the XPath evaluation
+	return getText('//div[@style="display: inline;"]', doc);
+}
+
+/**
+ * Get the text of the bibtex format reference
+ * The reference is read from the first <pre> node of the exportformats.cfm page
+ * requested for the ID of this document.
+ * @param url The URL of the page being scanned
+ * @param nsResolver the namespace resolver function
+ * @return the bibtex reference as a trimmed string, or null when no <pre> text was found
+ */
+function scrapeBibtex(url, nsResolver) {
+	Zotero.debug("Scraping full details from bibtex");
+
+	//Address of the bibtex export for the ID of this document
+	var exportUrl = "http://portal.acm.org/exportformats.cfm?id=" + getId(url) + "&expformat=bibtex";
+	Zotero.debug("Bibtex: " + exportUrl);
+
+	//Load the export page and look for the node holding the bibtex text
+	var texDoc = Zotero.Utilities.retrieveDocument(exportUrl);
+	var preIterator = texDoc.evaluate('//pre', texDoc, nsResolver, XPathResult.ANY_TYPE, null);
+	var pre = preIterator.iterateNext();
+
+	//Nothing to return without a <pre> node that carries text
+	if (pre == null || pre.textContent == null) {
+		return null;
+	}
+
+	var raw = pre.textContent;
+	Zotero.debug("\nref : " + raw);
+	//Clean up with trimInternal() first, then trim()
+	var cleaned = Zotero.Utilities.trimInternal(raw);
+	return Zotero.Utilities.trim(cleaned);
+}
+
+/**
+ * Get the unique identifier of this document
+ * The identifier is the value of the "id" query parameter, e.g. "1255175.1255224";
+ * the part before the dot is returned as the journal ID, the part after it as the document ID.
+ * @param url The URL of the page being scanned
+ * @param journal [optional]whether to get the ID of the journal the document is in or of the document itself
+ * @return a string containing the identifier of the document or journal the document is in
+ */
+function getId(url, journal) {
+	//Look the parameter up by name so it is found wherever it sits in the query
+	//string, and stop at "&" or "#" so no other parameter or fragment leaks in
+	var match = /[?&]id=([^&#]*)/.exec(url);
+	var id;
+	if (match) {
+		id = match[1];
+	} else {
+		//No "id" parameter: fall back to the text 8 characters after ".cfm", as before
+		id = url.substr(url.indexOf(".cfm") + 8).replace(/[&#].*$/, "");
+	}
+
+	var dotIndex = id.indexOf('.');
+	if (dotIndex == -1) {
+		return id;
+	}
+	//"<journal>.<document>": keep the half that was asked for
+	return journal ? id.substring(0, dotIndex) : id.substring(dotIndex + 1);
+}
+
+/**
+ * Find out what kind of document this is
+ * @param doc The XML document describing the page
+ * @param url The URL of the page being scanned
+ * @param nsResolver the namespace resolver function
+ * @return one of "multiple", "conferencePaper", "magazineArticle" or "journalArticle"
+ */
+function getArticleType(doc, url, nsResolver) {
+	//Search result URLs and pages with a "Table of Contents" headline list several items
+	var tocHeadline = doc.evaluate(tocX, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+	var isListing = tocHeadline || url.indexOf("results.cfm") != -1;
+	if (isListing) {
+		Zotero.debug("Type: multiple");
+		return "multiple";
+	}
+
+	//Text of the table cell that is searched for "Proceeding" or "Magazine"
+	var label = getText('//td[@nowrap="nowrap" and @style="padding-bottom: 0px;"]', doc, nsResolver);
+	Zotero.debug("Type: " + label);
+
+	//Journal article is the fallback when neither keyword appears in the label
+	var type = "journalArticle";
+	if (label.indexOf("Proceeding") != -1) type = "conferencePaper";
+	else if (label.indexOf("Magazine") != -1) type = "magazineArticle";
+	return type;
+}
+
+/**
+ * Get the text from the first node defined by the given xPathString
+ * @param pathString the XPath indicating which node to get the text from
+ * @param doc The XML document describing the page
+ * @param nsResolver the namespace resolver function
+ * @return the text in the defined node, or "" if the node was not found or if there was no text content
+ */
+function getText(pathString, doc, nsResolver) {
+	var first = doc.evaluate(pathString, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+	//"!= null" rejects undefined as well, so one comparison covers both cases
+	var hasText = first != null && first.textContent != null;
+	if (hasText) {
+		return first.textContent;
+	}
+
+	Zotero.debug("Unable to retrieve text for XPath: "+pathString);
+	return "";
+}
+
+/**
+ * Get a function for returning the namespace of a given document given its prefix
+ * @param doc The XML document describing the page
+ * @return a resolver mapping the prefix "x" to the document's namespace, or null when it has none
+ */
+function getNsResolver(doc) {
+	var namespace = doc.documentElement.namespaceURI;
+	if (!namespace) return null;
+
+	return function(prefix) {
+		return prefix == 'x' ? namespace : null;
+	};
+}