From 10ba568ee85ec3b14efd19a4920455fa4b749f70 Mon Sep 17 00:00:00 2001 From: Simon Kornblith Date: Thu, 17 Aug 2006 07:56:01 +0000 Subject: [PATCH] closes #39, auto-ingest of associated files (as recognizable) closes #3, Overflow metadata dumps into "extra" field add "extra" data where such data is useful and conveniently accessible (not available for XML-based export or MARC formats yet) add links to permanent URLs download associated files from full text sources (if extensions.scholar.downloadAssociatedFiles preference is enabled) fix WorldCat translator improve InnoPAC translator (it now works on Georgetown search results pages, albeit slowly, because it must first realize the catalog is misconfigured) tag items from SIRSI and WorldCat return to putting the full lengths of books into "pages," because some citation styles require it fix COinS (broken a few revisions ago) --- .../content/scholar/fileInterface.js | 2 - .../content/scholar/ingester/browser.js | 9 +- .../content/scholar/xpcom/translate.js | 66 +- .../content/scholar/xpcom/utilities.js | 2 +- defaults/preferences/scholar.js | 3 +- scrapers.sql | 1010 +++++++++++------ 6 files changed, 704 insertions(+), 388 deletions(-) diff --git a/chrome/chromeFiles/content/scholar/fileInterface.js b/chrome/chromeFiles/content/scholar/fileInterface.js index d6b60530c..d48e5963f 100644 --- a/chrome/chromeFiles/content/scholar/fileInterface.js +++ b/chrome/chromeFiles/content/scholar/fileInterface.js @@ -12,8 +12,6 @@ var Scholar_File_Interface = new function() { * Creates Scholar.Translate instance and shows file picker for file export */ function exportFile(items) { - Scholar.debug(items); - var translation = new Scholar.Translate("export"); var translators = translation.getTranslators(); diff --git a/chrome/chromeFiles/content/scholar/ingester/browser.js b/chrome/chromeFiles/content/scholar/ingester/browser.js index e6b849f79..1ce0cdd02 100644 --- a/chrome/chromeFiles/content/scholar/ingester/browser.js +++ b/chrome/chromeFiles/content/scholar/ingester/browser.js @@ -97,14 +97,12 @@ Scholar_Ingester_Interface.contentLoad = function(event) { var rootDoc = doc; // get the appropriate root document to check which browser we're on - Scholar.debug("getting root document"); while(rootDoc.defaultView.frameElement) { rootDoc = rootDoc.defaultView.frameElement.ownerDocument; } // Figure out what browser this contentDocument is associated with var browser; - Scholar.debug("getting browser"); for(var i=0; i/; var newItem = new Scholar.Item("journalArticle"); + newItem.attachments.push(getJSTORAttachment(viewPages[k])); for(var i in lines) { if(lines[i].substring(0,3) == "<1>") { haveStarted = true; } else if(newItemRe.test(lines[i])) { itemComplete(newItem, url); + k++; + newItem = new Scholar.Item("journalArticle"); + newItem.attachments.push(getJSTORAttachment(viewPages[k])); } else if(lines[i].substring(2, 5) == " : " && haveStarted) { var fieldCode = lines[i].substring(0, 2); var fieldContent = Scholar.Utilities.cleanString(lines[i].substring(5)) if(fieldCode == "TI") { - newItem.title = fieldContent; + if(fieldContent) { + newItem.title = fieldContent; + } else { + newItem.title = "[untitled]"; + } } else if(fieldCode == "AU") { var authors = fieldContent.split(";"); for(j in authors) { @@ -565,7 +636,7 @@ function doWeb(doc, url) { } else if(fieldCode == "PP") { newItem.pages = fieldContent; } else if(fieldCode == "EI") { - newItem.source = fieldContent; + newItem.url = fieldContent; } else if(fieldCode == "IN") { newItem.ISSN = fieldContent; } else if(fieldCode == "PB") { @@ -581,13 +652,13 @@ function doWeb(doc, url) { Scholar.done(); }); - }, function() {}); + }); }); Scholar.wait(); }'); -REPLACE INTO "translators" VALUES ('e85a3134-8c1a-8644-6926-584c8565f23e', '2006-06-26 16:01:00', 4, 'History Cooperative', 'Simon Kornblith', '^http://www\.historycooperative\.org/(?:journals/.+/.+/.+\.html$|cgi-bin/search.cgi)', +REPLACE INTO "translators" VALUES ('e85a3134-8c1a-8644-6926-584c8565f23e', '2006-06-26 16:01:00', 4, 'History Cooperative', 'Simon Kornblith', '^http://www\.historycooperative\.org/(?:journals/.+/.+/.+\.s?html$|cgi-bin/search.cgi)', 'function detectWeb(doc, url) { if(doc.title == "History Cooperative: Search Results") { return "multiple"; @@ -604,14 +675,14 @@ REPLACE INTO "translators" VALUES ('e85a3134-8c1a-8644-6926-584c8565f23e', '2006 function scrape(doc) { var newItem = new Scholar.Item("journalArticle"); - newItem.source = doc.location.href; + newItem.url = doc.location.href; var month, year; var metaTags = doc.getElementsByTagName("meta"); associateMeta(newItem, metaTags, "Title", "title"); - associateMeta(newItem, metaTags, "Journal", "publication"); + associateMeta(newItem, metaTags, "Journal", "publicationTitle"); associateMeta(newItem, metaTags, "Volume", "volume"); - associateMeta(newItem, metaTags, "Issue", "number"); + associateMeta(newItem, metaTags, "Issue", "issue"); var author = metaTags.namedItem("Author"); if(author) { @@ -621,14 +692,17 @@ function scrape(doc) { } } + newItem.attachments.push({document:doc, title:"History Cooperative Full Text", + downloadable:true}); + newItem.complete(); // don''t actually need date info for a journal article - /*var month = metaTags.namedItem("PublicationMonth"); + var month = metaTags.namedItem("PublicationMonth"); var year = metaTags.namedItem("PublicationYear"); if(month && year) { - odel.addStatement(uri, prefixDC + "date", month.getAttribute("content")+" "+year.getAttribute("content"), false); - }*/ + newItem.date = month.getAttribute("content")+" "+year.getAttribute("content"); + } } function doWeb(doc, url) { @@ -667,9 +741,9 @@ REPLACE INTO "translators" VALUES ('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006 if (prefix == ''x'') return namespace; else return null; } : null; - var xpath = ''//a[img[@alt="MARC Display"]]''; - var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); - if(elmts.length) { + var xpath = ''//a[img[@src="/screens/marcdisp.gif" or @alt="MARC Display" or @src="/screens/regdisp.gif" or @alt="REGULAR RECORD DISPLAY"]]''; + var elmt = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + if(elmt) { return "book"; } // Also, check for links to an item display page @@ -682,10 +756,72 @@ REPLACE INTO "translators" VALUES ('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006 return false; }', -'function doWeb(doc, url) { +'function scrape(marc, newDoc) { + var namespace = newDoc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; + + var xpath = ''//pre/text()[1]''; + var text = newDoc.evaluate(xpath, newDoc, nsResolver, + XPathResult.ANY_TYPE, null).iterateNext().nodeValue; + var newItem = new Scholar.Item(); + + var record = new marc.MARC_Record(); + + var linee = text.split("\n"); + for (var i=0; i ''008'' && tag < ''899'') { // jumps low and high tags + if (tag != ''040'') record.add_field(tag,ind1,ind2,value); + } + } + + record.translate(newItem); + newItem.complete(); +} + +function pageByPage(marc, urls) { + Scholar.Utilities.processDocuments(urls, function(newDoc) { + scrape(marc, newDoc); + }, function() { Scholar.done() }); +} + +function doWeb(doc, url) { var uri = doc.location.href; var newUri; + // load translator for MARC + var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973"); + var matchRegexp = new RegExp(''^(http://[^/]+/search/[^/]+/[^/]+/1\%2C[^/]+/)frameset(.+)$''); var m = matchRegexp.exec(uri); if(m) { @@ -696,74 +832,22 @@ REPLACE INTO "translators" VALUES ('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006 if (prefix == ''x'') return namespace; else return null; } : null; - var xpath = ''//a[img[@alt="MARC Display"]]''; - var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); - if(elmts.length) { - newUri = elmts[0].href; + var xpath = ''//a[img[@src="/screens/marcdisp.gif" or @alt="MARC Display"]]''; + var aTag = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + if(aTag) { + newUri = aTag.href; + } else { + var xpath = ''//a[img[@src="/screens/regdisp.gif" or @alt="REGULAR RECORD DISPLAY"]]''; + var aTag = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + if(aTag) { + scrape(marc, doc); + return; + } } } - // load translator for MARC - var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973"); - if(newUri) { // single page - Scholar.Utilities.loadDocument(newUri, function(newDoc) { - var namespace = newDoc.documentElement.namespaceURI; - var nsResolver = namespace ? function(prefix) { - if (prefix == ''x'') return namespace; else return null; - } : null; - - var xpath = ''//pre''; - var elmts = Scholar.Utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver); - - var text = doc.evaluate(''./text()[1]'', elmts[0], nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue; - - var newItem = new Scholar.Item(); - newItem.source = uri; - - var record = new marc.MARC_Record(); - - var linee = text.split("\n"); - for (var i=0; i ''008'' && tag < ''899'') { // jumps low and high tags - if (tag != ''040'') record.add_field(tag,ind1,ind2,value); - } - } - - record.translate(newItem); - newItem.complete(); - - Scholar.done(); - }, null); + pageByPage(marc, [newUri]); } else { // Search results page // Require link to match this var tagRegexp = new RegExp(); @@ -773,29 +857,52 @@ REPLACE INTO "translators" VALUES ('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006 var urls = new Array(); var availableItems = new Array(); - var tableRows = Scholar.Utilities.gatherElementsOnXPath(doc, doc, ''//table[@class="browseScreen"]//tr[td/input[@type="checkbox"]]'', nsResolver); + var tableRows = doc.evaluate(''//table[@class="browseScreen"]//tr[@class="browseEntry" or @class="briefCitRow" or td/input[@type="checkbox"]]'', + doc, nsResolver, XPathResult.ANY_TYPE, null); // Go through table rows - for(var i=0; i 1) { + postString += "save_func=save_marked"; + + + Scholar.Utilities.HTTP.doGet(clearUrl, function() { + Scholar.Utilities.HTTP.doPost(postUrl, postString, function() { + Scholar.Utilities.HTTP.doPost(exportUrl, "ex_format=50&ex_device=45&SUBMIT=Submit", function(text) { + var notSpace = /[^\s]/ + if(notSpace.test(text)) { + marc.Scholar.write(text); + marc.Scholar.eof(); + marc.doImport(); + + Scholar.done(); + } else { + pageByPage(marc, newUrls); + } + }); }); }); - }); + } else { + pageByPage(marc, newUrls); + } } Scholar.wait(); @@ -840,13 +964,11 @@ REPLACE INTO "translators" VALUES ('add7c71c-21f3-ee14-d188-caf9da12728b', '2006 } : null; var xpath = ''//tr[th[@class="viewmarctags"]][td[@class="viewmarctags"]]''; - var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); - if(elmts.length) { + if(doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) { return "book"; } var xpath = ''//td[@class="searchsum"]/table''; - var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); - if(elmts.length) { + if(doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) { return "multiple"; } }', @@ -857,24 +979,25 @@ REPLACE INTO "translators" VALUES ('add7c71c-21f3-ee14-d188-caf9da12728b', '2006 } : null; var xpath = ''//tr[th[@class="viewmarctags"]][td[@class="viewmarctags"]]''; - var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); - if(!elmts.length) { + var elmts = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null); + var elmt = elmts.iterateNext(); + if(!elmt) { return false; } var newItem = new Scholar.Item("book"); - newItem.source = doc.location.href; + newItem.extra = ""; - for (var i = 0; i < elmts.length; i++) { - var elmt = elmts[i]; + while(elmt) { try { var node = doc.evaluate(''./TD[1]/A[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); if(!node) { var node = doc.evaluate(''./TD[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); } + if(node) { - var field = Scholar.Utilities.superCleanString(doc.evaluate(''./TH[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue); - field = field.toLowerCase(); + var casedField = Scholar.Utilities.superCleanString(doc.evaluate(''./TH[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue); + field = casedField.toLowerCase(); var value = Scholar.Utilities.superCleanString(node.nodeValue); if(field == "publisher") { newItem.publisher = value; @@ -898,9 +1021,23 @@ REPLACE INTO "translators" VALUES ('add7c71c-21f3-ee14-d188-caf9da12728b', '2006 newItem.creators.push(Scholar.Utilities.cleanAuthor(value, "contributor", true)); } else if(field == "corporate author") { newItem.creators.push({lastName:author}); + } else if(field == "subject term" || field == "corporate subject" || field == "geographic term") { + var subjects = value.split("--"); + newItem.tags = newItem.tags.concat(subjects); + } else if(field == "personal subject") { + var subjects = value.split(", "); + newItem.tags = newItem.tags.push(value[0]+", "+value[1]); + } else if(value && field != "http") { + newItem.extra += casedField+": "+value+"\n"; } } } catch (e) {} + + elmt = elmts.iterateNext(); + } + + if(newItem.extra) { + newItem.extra = newItem.extra.substr(0, newItem.extra.length-1); } var callNumber = doc.evaluate(''//tr/td[1][@class="holdingslist"]/text()'', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); @@ -923,14 +1060,14 @@ function doWeb(doc, url) { var urls = new Array(); var availableItems = new Array(); - var tableRows = Scholar.Utilities.gatherElementsOnXPath(doc, doc, ''//td[@class="searchsum"]/table[//input[@value="Details"]]'', nsResolver); + var tableRows = doc.evaluate(''//td[@class="searchsum"]/table[//input[@value="Details"]]'', doc, nsResolver, XPathResult.ANY_TYPE, null); + var tableRow = tableRows.iterateNext(); // skip first row // Go through table rows - for(var i=1; i journalArticle if issue and volume exist + if(newItem.itemType == "magazineArticle" && (newItem.issue || newItem.volume)) { + newItem.itemType = "journalArticle"; + } + + // figure out what we can attach + var attachArray = { + ''//td[@class="textSmall"]//img[@alt="Full Text - PDF"]'':"ProQuest Full Text (PDF)", + ''//td[@class="textSmall"]//img[@alt="Text+Graphics"]'':"ProQuest Full Text (HTML with Graphics)", + ''//td[@class="textSmall"]//img[@alt="Full Text"]'':"ProQuest Full Text (HTML)", + ''//td[@class="textSmall"]//img[@alt="Abstract"]'':"ProQuest Abstract" + } + for(var xpath in attachArray) { + var item = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + if(item) { + var title = attachArray[xpath]; + Scholar.Utilities.debug(title); + + if(item.parentNode.tagName.toLowerCase() == "a") { + // item is not this page + newItem.attachments.push({url:item.parentNode.href, + title:title, mimeType:(title == "ProQuest Full Text (PDF)" ? "application/pdf" : "text/html"), + downloadable:true}); + } else { + // item is this page + newItem.attachments.push({document:doc, title:title, downloadable:true}); + } } } @@ -1099,14 +1277,16 @@ function doWeb(doc, url) { var tagRegexp = new RegExp(); tagRegexp.compile(''^http://[^/]+/pqdweb\\?((?:.*&)?did=.*&Fmt=[12]|(?:.*&)Fmt=[12].*&did=)''); - var tableRows = Scholar.Utilities.gatherElementsOnXPath(doc, doc, ''/html/body/table/tbody/tr/td/table/tbody/tr/td/table/tbody/tr[@class="rowUnMarked"]/td[3][@class="textMedium"]'', nsResolver); + var tableRows = doc.evaluate(''//tr[@class="rowUnMarked"]'', + doc, nsResolver, XPathResult.ANY_TYPE, null); // Go through table rows - for(var i=0; i 1) { var selectArray = new Array(); @@ -2439,9 +2676,13 @@ function completeCOinS(newItems) { } selectArray = Scholar.selectItems(selectArray); for(var i in selectArray) { + // add doc as attachment + newItems[i].attachments.push({document:doc}); + newItems[i].complete(); } } else if(newItems.length) { + newItems[0].attachments.push({document:doc}); newItems[0].complete(); } } @@ -2476,9 +2717,9 @@ function doWeb(doc, url) { if(needFullItems.length) { // retrieve full items asynchronously Scholar.wait(); - retrieveNextCOinS(needFullItems, newItems); + retrieveNextCOinS(needFullItems, newItems, doc); } else { - completeCOinS(newItems); + completeCOinS(newItems, doc); } }'); @@ -2520,7 +2761,8 @@ REPLACE INTO "translators" VALUES ('3e684d82-73a3-9a34-095f-19b112d88bbf', '2006 Scholar.Utilities.processDocuments(newUris, function(newDoc) { var newItem = new Scholar.Item("book"); - newItem.source = newDoc.location.href; + newItem.extra = ""; + newItem.attachments.push({title:"Google Books Information Page", document:newDoc}); var namespace = newDoc.documentElement.namespaceURI; var nsResolver = namespace ? function(prefix) { @@ -2528,10 +2770,12 @@ REPLACE INTO "translators" VALUES ('3e684d82-73a3-9a34-095f-19b112d88bbf', '2006 } : null; var xpath = ''//table[@id="bib"]/tbody/tr''; - var elmts = Scholar.Utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver); - for(var i = 0; i