diff --git a/chrome/chromeFiles/content/scholar/fileInterface.js b/chrome/chromeFiles/content/scholar/fileInterface.js index d6b60530c..d48e5963f 100644 --- a/chrome/chromeFiles/content/scholar/fileInterface.js +++ b/chrome/chromeFiles/content/scholar/fileInterface.js @@ -12,8 +12,6 @@ var Scholar_File_Interface = new function() { * Creates Scholar.Translate instance and shows file picker for file export */ function exportFile(items) { - Scholar.debug(items); - var translation = new Scholar.Translate("export"); var translators = translation.getTranslators(); diff --git a/chrome/chromeFiles/content/scholar/ingester/browser.js b/chrome/chromeFiles/content/scholar/ingester/browser.js index e6b849f79..1ce0cdd02 100644 --- a/chrome/chromeFiles/content/scholar/ingester/browser.js +++ b/chrome/chromeFiles/content/scholar/ingester/browser.js @@ -97,14 +97,12 @@ Scholar_Ingester_Interface.contentLoad = function(event) { var rootDoc = doc; // get the appropriate root document to check which browser we're on - Scholar.debug("getting root document"); while(rootDoc.defaultView.frameElement) { rootDoc = rootDoc.defaultView.frameElement.ownerDocument; } // Figure out what browser this contentDocument is associated with var browser; - Scholar.debug("getting browser"); for(var i=0; i/; var newItem = new Scholar.Item("journalArticle"); + newItem.attachments.push(getJSTORAttachment(viewPages[k])); for(var i in lines) { if(lines[i].substring(0,3) == "<1>") { haveStarted = true; } else if(newItemRe.test(lines[i])) { itemComplete(newItem, url); + k++; + newItem = new Scholar.Item("journalArticle"); + newItem.attachments.push(getJSTORAttachment(viewPages[k])); } else if(lines[i].substring(2, 5) == " : " && haveStarted) { var fieldCode = lines[i].substring(0, 2); var fieldContent = Scholar.Utilities.cleanString(lines[i].substring(5)) if(fieldCode == "TI") { - newItem.title = fieldContent; + if(fieldContent) { + newItem.title = fieldContent; + } else { + newItem.title = "[untitled]"; + } } else if(fieldCode == "AU") { var authors = fieldContent.split(";"); for(j in authors) { @@ -565,7 +636,7 @@ function doWeb(doc, url) { } else if(fieldCode == "PP") { newItem.pages = fieldContent; } else if(fieldCode == "EI") { - newItem.source = fieldContent; + newItem.url = fieldContent; } else if(fieldCode == "IN") { newItem.ISSN = fieldContent; } else if(fieldCode == "PB") { @@ -581,13 +652,13 @@ function doWeb(doc, url) { Scholar.done(); }); - }, function() {}); + }); }); Scholar.wait(); }'); -REPLACE INTO "translators" VALUES ('e85a3134-8c1a-8644-6926-584c8565f23e', '2006-06-26 16:01:00', 4, 'History Cooperative', 'Simon Kornblith', '^http://www\.historycooperative\.org/(?:journals/.+/.+/.+\.html$|cgi-bin/search.cgi)', +REPLACE INTO "translators" VALUES ('e85a3134-8c1a-8644-6926-584c8565f23e', '2006-06-26 16:01:00', 4, 'History Cooperative', 'Simon Kornblith', '^http://www\.historycooperative\.org/(?:journals/.+/.+/.+\.s?html$|cgi-bin/search.cgi)', 'function detectWeb(doc, url) { if(doc.title == "History Cooperative: Search Results") { return "multiple"; @@ -604,14 +675,14 @@ REPLACE INTO "translators" VALUES ('e85a3134-8c1a-8644-6926-584c8565f23e', '2006 function scrape(doc) { var newItem = new Scholar.Item("journalArticle"); - newItem.source = doc.location.href; + newItem.url = doc.location.href; var month, year; var metaTags = doc.getElementsByTagName("meta"); associateMeta(newItem, metaTags, "Title", "title"); - associateMeta(newItem, metaTags, "Journal", "publication"); + associateMeta(newItem, metaTags, "Journal", "publicationTitle"); associateMeta(newItem, metaTags, "Volume", "volume"); - associateMeta(newItem, metaTags, "Issue", "number"); + associateMeta(newItem, metaTags, "Issue", "issue"); var author = metaTags.namedItem("Author"); if(author) { @@ -621,14 +692,17 @@ function scrape(doc) { } } + newItem.attachments.push({document:doc, title:"History Cooperative Full Text", + downloadable:true}); + newItem.complete(); // don''t actually need date info for a journal article - /*var month = metaTags.namedItem("PublicationMonth"); + var month = metaTags.namedItem("PublicationMonth"); var year = metaTags.namedItem("PublicationYear"); if(month && year) { - odel.addStatement(uri, prefixDC + "date", month.getAttribute("content")+" "+year.getAttribute("content"), false); - }*/ + newItem.date = month.getAttribute("content")+" "+year.getAttribute("content"); + } } function doWeb(doc, url) { @@ -667,9 +741,9 @@ REPLACE INTO "translators" VALUES ('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006 if (prefix == ''x'') return namespace; else return null; } : null; - var xpath = ''//a[img[@alt="MARC Display"]]''; - var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); - if(elmts.length) { + var xpath = ''//a[img[@src="/screens/marcdisp.gif" or @alt="MARC Display" or @src="/screens/regdisp.gif" or @alt="REGULAR RECORD DISPLAY"]]''; + var elmt = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + if(elmt) { return "book"; } // Also, check for links to an item display page @@ -682,10 +756,72 @@ REPLACE INTO "translators" VALUES ('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006 return false; }', -'function doWeb(doc, url) { +'function scrape(marc, newDoc) { + var namespace = newDoc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; + + var xpath = ''//pre/text()[1]''; + var text = newDoc.evaluate(xpath, newDoc, nsResolver, + XPathResult.ANY_TYPE, null).iterateNext().nodeValue; + var newItem = new Scholar.Item(); + + var record = new marc.MARC_Record(); + + var linee = text.split("\n"); + for (var i=0; i ''008'' && tag < ''899'') { // jumps low and high tags + if (tag != ''040'') record.add_field(tag,ind1,ind2,value); + } + } + + record.translate(newItem); + newItem.complete(); +} + +function pageByPage(marc, urls) { + Scholar.Utilities.processDocuments(urls, function(newDoc) { + scrape(marc, newDoc); + }, function() { Scholar.done() }); +} + +function doWeb(doc, url) { var uri = doc.location.href; var newUri; + // load translator for MARC + var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973"); + var matchRegexp = new RegExp(''^(http://[^/]+/search/[^/]+/[^/]+/1\%2C[^/]+/)frameset(.+)$''); var m = matchRegexp.exec(uri); if(m) { @@ -696,74 +832,22 @@ REPLACE INTO "translators" VALUES ('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006 if (prefix == ''x'') return namespace; else return null; } : null; - var xpath = ''//a[img[@alt="MARC Display"]]''; - var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); - if(elmts.length) { - newUri = elmts[0].href; + var xpath = ''//a[img[@src="/screens/marcdisp.gif" or @alt="MARC Display"]]''; + var aTag = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + if(aTag) { + newUri = aTag.href; + } else { + var xpath = ''//a[img[@src="/screens/regdisp.gif" or @alt="REGULAR RECORD DISPLAY"]]''; + var aTag = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + if(aTag) { + scrape(marc, doc); + return; + } } } - // load translator for MARC - var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973"); - if(newUri) { // single page - Scholar.Utilities.loadDocument(newUri, function(newDoc) { - var namespace = newDoc.documentElement.namespaceURI; - var nsResolver = namespace ? function(prefix) { - if (prefix == ''x'') return namespace; else return null; - } : null; - - var xpath = ''//pre''; - var elmts = Scholar.Utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver); - - var text = doc.evaluate(''./text()[1]'', elmts[0], nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue; - - var newItem = new Scholar.Item(); - newItem.source = uri; - - var record = new marc.MARC_Record(); - - var linee = text.split("\n"); - for (var i=0; i ''008'' && tag < ''899'') { // jumps low and high tags - if (tag != ''040'') record.add_field(tag,ind1,ind2,value); - } - } - - record.translate(newItem); - newItem.complete(); - - Scholar.done(); - }, null); + pageByPage(marc, [newUri]); } else { // Search results page // Require link to match this var tagRegexp = new RegExp(); @@ -773,29 +857,52 @@ REPLACE INTO "translators" VALUES ('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006 var urls = new Array(); var availableItems = new Array(); - var tableRows = Scholar.Utilities.gatherElementsOnXPath(doc, doc, ''//table[@class="browseScreen"]//tr[td/input[@type="checkbox"]]'', nsResolver); + var tableRows = doc.evaluate(''//table[@class="browseScreen"]//tr[@class="browseEntry" or @class="briefCitRow" or td/input[@type="checkbox"]]'', + doc, nsResolver, XPathResult.ANY_TYPE, null); // Go through table rows - for(var i=0; i 1) { + postString += "save_func=save_marked"; + + + Scholar.Utilities.HTTP.doGet(clearUrl, function() { + Scholar.Utilities.HTTP.doPost(postUrl, postString, function() { + Scholar.Utilities.HTTP.doPost(exportUrl, "ex_format=50&ex_device=45&SUBMIT=Submit", function(text) { + var notSpace = /[^\s]/ + if(notSpace.test(text)) { + marc.Scholar.write(text); + marc.Scholar.eof(); + marc.doImport(); + + Scholar.done(); + } else { + pageByPage(marc, newUrls); + } + }); }); }); - }); + } else { + pageByPage(marc, newUrls); + } } Scholar.wait(); @@ -840,13 +964,11 @@ REPLACE INTO "translators" VALUES ('add7c71c-21f3-ee14-d188-caf9da12728b', '2006 } : null; var xpath = ''//tr[th[@class="viewmarctags"]][td[@class="viewmarctags"]]''; - var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); - if(elmts.length) { + if(doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) { return "book"; } var xpath = ''//td[@class="searchsum"]/table''; - var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); - if(elmts.length) { + if(doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) { return "multiple"; } }', @@ -857,24 +979,25 @@ REPLACE INTO "translators" VALUES ('add7c71c-21f3-ee14-d188-caf9da12728b', '2006 } : null; var xpath = ''//tr[th[@class="viewmarctags"]][td[@class="viewmarctags"]]''; - var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); - if(!elmts.length) { + var elmts = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null); + var elmt = elmts.iterateNext(); + if(!elmt) { return false; } var newItem = new Scholar.Item("book"); - newItem.source = doc.location.href; + newItem.extra = ""; - for (var i = 0; i < elmts.length; i++) { - var elmt = elmts[i]; + while(elmt) { try { var node = doc.evaluate(''./TD[1]/A[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); if(!node) { var node = doc.evaluate(''./TD[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); } + if(node) { - var field = Scholar.Utilities.superCleanString(doc.evaluate(''./TH[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue); - field = field.toLowerCase(); + var casedField = Scholar.Utilities.superCleanString(doc.evaluate(''./TH[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue); + field = casedField.toLowerCase(); var value = Scholar.Utilities.superCleanString(node.nodeValue); if(field == "publisher") { newItem.publisher = value; @@ -898,9 +1021,23 @@ REPLACE INTO "translators" VALUES ('add7c71c-21f3-ee14-d188-caf9da12728b', '2006 newItem.creators.push(Scholar.Utilities.cleanAuthor(value, "contributor", true)); } else if(field == "corporate author") { newItem.creators.push({lastName:author}); + } else if(field == "subject term" || field == "corporate subject" || field == "geographic term") { + var subjects = value.split("--"); + newItem.tags = newItem.tags.concat(subjects); + } else if(field == "personal subject") { + var subjects = value.split(", "); + newItem.tags = newItem.tags.push(value[0]+", "+value[1]); + } else if(value && field != "http") { + newItem.extra += casedField+": "+value+"\n"; } } } catch (e) {} + + elmt = elmts.iterateNext(); + } + + if(newItem.extra) { + newItem.extra = newItem.extra.substr(0, newItem.extra.length-1); } var callNumber = doc.evaluate(''//tr/td[1][@class="holdingslist"]/text()'', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); @@ -923,14 +1060,14 @@ function doWeb(doc, url) { var urls = new Array(); var availableItems = new Array(); - var tableRows = Scholar.Utilities.gatherElementsOnXPath(doc, doc, ''//td[@class="searchsum"]/table[//input[@value="Details"]]'', nsResolver); + var tableRows = doc.evaluate(''//td[@class="searchsum"]/table[//input[@value="Details"]]'', doc, nsResolver, XPathResult.ANY_TYPE, null); + var tableRow = tableRows.iterateNext(); // skip first row // Go through table rows - for(var i=1; i journalArticle if issue and volume exist + if(newItem.itemType == "magazineArticle" && (newItem.issue || newItem.volume)) { + newItem.itemType = "journalArticle"; + } + + // figure out what we can attach + var attachArray = { + ''//td[@class="textSmall"]//img[@alt="Full Text - PDF"]'':"ProQuest Full Text (PDF)", + ''//td[@class="textSmall"]//img[@alt="Text+Graphics"]'':"ProQuest Full Text (HTML with Graphics)", + ''//td[@class="textSmall"]//img[@alt="Full Text"]'':"ProQuest Full Text (HTML)", + ''//td[@class="textSmall"]//img[@alt="Abstract"]'':"ProQuest Abstract" + } + for(var xpath in attachArray) { + var item = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + if(item) { + var title = attachArray[xpath]; + Scholar.Utilities.debug(title); + + if(item.parentNode.tagName.toLowerCase() == "a") { + // item is not this page + newItem.attachments.push({url:item.parentNode.href, + title:title, mimeType:(title == "ProQuest Full Text (PDF)" ? "application/pdf" : "text/html"), + downloadable:true}); + } else { + // item is this page + newItem.attachments.push({document:doc, title:title, downloadable:true}); + } } } @@ -1099,14 +1277,16 @@ function doWeb(doc, url) { var tagRegexp = new RegExp(); tagRegexp.compile(''^http://[^/]+/pqdweb\\?((?:.*&)?did=.*&Fmt=[12]|(?:.*&)Fmt=[12].*&did=)''); - var tableRows = Scholar.Utilities.gatherElementsOnXPath(doc, doc, ''/html/body/table/tbody/tr/td/table/tbody/tr/td/table/tbody/tr[@class="rowUnMarked"]/td[3][@class="textMedium"]'', nsResolver); + var tableRows = doc.evaluate(''//tr[@class="rowUnMarked"]'', + doc, nsResolver, XPathResult.ANY_TYPE, null); // Go through table rows - for(var i=0; i 1) { var selectArray = new Array(); @@ -2439,9 +2676,13 @@ function completeCOinS(newItems) { } selectArray = Scholar.selectItems(selectArray); for(var i in selectArray) { + // add doc as attachment + newItems[i].attachments.push({document:doc}); + newItems[i].complete(); } } else if(newItems.length) { + newItems[0].attachments.push({document:doc}); newItems[0].complete(); } } @@ -2476,9 +2717,9 @@ function doWeb(doc, url) { if(needFullItems.length) { // retrieve full items asynchronously Scholar.wait(); - retrieveNextCOinS(needFullItems, newItems); + retrieveNextCOinS(needFullItems, newItems, doc); } else { - completeCOinS(newItems); + completeCOinS(newItems, doc); } }'); @@ -2520,7 +2761,8 @@ REPLACE INTO "translators" VALUES ('3e684d82-73a3-9a34-095f-19b112d88bbf', '2006 Scholar.Utilities.processDocuments(newUris, function(newDoc) { var newItem = new Scholar.Item("book"); - newItem.source = newDoc.location.href; + newItem.extra = ""; + newItem.attachments.push({title:"Google Books Information Page", document:newDoc}); var namespace = newDoc.documentElement.namespaceURI; var nsResolver = namespace ? function(prefix) { @@ -2528,10 +2770,12 @@ REPLACE INTO "translators" VALUES ('3e684d82-73a3-9a34-095f-19b112d88bbf', '2006 } : null; var xpath = ''//table[@id="bib"]/tbody/tr''; - var elmts = Scholar.Utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver); - for(var i = 0; i