From 0e63958f96d2d3747b53334ffa0da7bf81d17891 Mon Sep 17 00:00:00 2001
From: Simon Kornblith
Date: Thu, 24 Aug 2006 18:00:48 +0000
Subject: [PATCH] - make proquest work better behind proxies - improved frame support

---
 .../content/scholar/ingester/browser.js | 19 +++++++-
 .../content/scholar/xpcom/utilities.js | 4 +-
 scrapers.sql | 43 ++++++++++++-------
 3 files changed, 46 insertions(+), 20 deletions(-)

diff --git a/chrome/chromeFiles/content/scholar/ingester/browser.js b/chrome/chromeFiles/content/scholar/ingester/browser.js
index cb702defd..d1d359a2a 100644
--- a/chrome/chromeFiles/content/scholar/ingester/browser.js
+++ b/chrome/chromeFiles/content/scholar/ingester/browser.js
@@ -88,6 +88,17 @@ Scholar_Ingester_Interface.scrapeThisPage = function(saveLocation) {
 	}
 }
 
+// Returns true if searchDoc is rootDoc itself or is loaded in one of its
+// (possibly nested) frames; rootDoc may be a document or a frame window
+Scholar_Ingester_Interface.searchFrames = function(rootDoc, searchDoc) {
+	if(rootDoc == searchDoc || rootDoc.document == searchDoc) return true;
+	var frames = (rootDoc.defaultView ? rootDoc.defaultView : rootDoc).frames;
+	for(var i = 0; frames && i < frames.length; i++) {
+		if(Scholar_Ingester_Interface.searchFrames(frames[i], searchDoc)) return true;
+	}
+	return false;
+}
+
 /*
  * An event handler called when a new document is loaded.
Creates a new document * object, and updates the status of the capture icon @@ -118,9 +129,13 @@ Scholar_Ingester_Interface.contentLoad = function(event) { var data = Scholar_Ingester_Interface._getData(browser); // if there's already a scrapable page in the browser window, and it's - // still there, return + // still there, ensure it is actually part of the page, then return if(data.translators && data.translators.length && data.document.location) { - return; + if(Scholar_Ingester_Interface.searchFrames(rootDoc, data.document)) { + return; + } else { + data.document = null; + } } // get translators diff --git a/chrome/chromeFiles/content/scholar/xpcom/utilities.js b/chrome/chromeFiles/content/scholar/xpcom/utilities.js index 00bc4542c..ba868aa67 100644 --- a/chrome/chromeFiles/content/scholar/xpcom/utilities.js +++ b/chrome/chromeFiles/content/scholar/xpcom/utilities.js @@ -259,12 +259,12 @@ Scholar.Utilities.Ingester._protocolRe = new RegExp(); Scholar.Utilities.Ingester._protocolRe.compile("^(?:(?:http|https|ftp):|[^:]*/)", "i"); Scholar.Utilities.Ingester.prototype.processDocuments = function(urls, processor, done, exception) { if(this.translate.locationIsProxied) { - for(i in urls) { + for(var i in urls) { if(this.translate.locationIsProxied) { urls[i] = Scholar.Ingester.ProxyMonitor.properToProxy(urls[i]); } // check for a protocol colon - if(!Scholar.Utilities.Ingester._protocolRe.test(uris[i])) { + if(!Scholar.Utilities.Ingester._protocolRe.test(urls[i])) { throw("invalid URL in processDocuments"); } } diff --git a/scrapers.sql b/scrapers.sql index 4d5003f2f..6d7a6085b 100644 --- a/scrapers.sql +++ b/scrapers.sql @@ -1,4 +1,4 @@ --- 54 +-- 55 -- Set the following timestamp to the most recent scraper update date REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-08-15 15:42:00')); @@ -1100,15 +1100,25 @@ function doWeb(doc, url) { } }'); -REPLACE INTO "translators" VALUES ('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '2006-06-26 16:01:00', 4, 
'ProQuest', 'Simon Kornblith', '^http://proquest\.umi\.com/pqdweb\?((?:.*\&)?did=.*&Fmt=[0-9]|(?:.*\&)Fmt=[0-9].*&did=|(?:.*\&)searchInterface=)',
+REPLACE INTO "translators" VALUES ('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '2006-06-26 16:01:00', 4, 'ProQuest', 'Simon Kornblith', '^http://[^/]+/pqdweb\?((?:.*\&)?did=.*&Fmt=[0-9]|(?:.*\&)Fmt=[0-9].*&did=|(?:.*\&)searchInterface=)',
 'function detectWeb(doc, url) {
-	if(doc.title == "Results") {
-		return "multiple";
-	} else {
-		return "magazineArticle";
+	var namespace = doc.documentElement.namespaceURI;
+	var nsResolver = namespace ? function(prefix) {
+		if (prefix == ''x'') return namespace; else return null;
+	} : null;
+	// the URL pattern is host-independent, so only claim pages with an img whose src ends with a ProQuest logo path
+	if(doc.evaluate(''//img[substring(@src, string-length(@src)-31) = "/images/common/logo_proquest.gif" or substring(@src, string-length(@src)-37) = "/images/common/logo_proquest_small.gif"]'',
+	                doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) {
+		if(doc.title == "Results") {
+			return "multiple";
+		} else {
+			return "magazineArticle";
+		}
 	}
 }',
 'function scrape(doc) {
+	Scholar.Utilities.debug(doc.getElementsByTagName("body")[0].innerHTML);
+	
 	var namespace = doc.documentElement.namespaceURI;
 	var nsResolver = namespace ? 
function(prefix) { if (prefix == ''x'') return namespace; else return null; @@ -1128,7 +1138,7 @@ REPLACE INTO "translators" VALUES ('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '2006 // there are sometimes additional tags representing higlighting var author = Scholar.Utilities.getNodeString(doc, elmt, ''.//text()'', nsResolver); if(author) { - newItem.creators.push(Scholar.Utilities.cleanAuthor(author, "author", true)); + newItem.creators.push(Scholar.Utilities.cleanAuthor(author, "author")); } } @@ -1275,7 +1285,7 @@ function doWeb(doc, url) { // Require link to match this var tagRegexp = new RegExp(); - tagRegexp.compile(''^http://[^/]+/pqdweb\\?((?:.*&)?did=.*&Fmt=[12]|(?:.*&)Fmt=[12].*&did=)''); + tagRegexp.compile(''^http://[^/]+/pqdweb\\?((?:.*&)?did=.*&Fmt=[12][^0-9]|(?:.*&)Fmt=[12][^0-9].*&did=)''); var tableRows = doc.evaluate(''//tr[@class="rowUnMarked"]'', doc, nsResolver, XPathResult.ANY_TYPE, null); @@ -1301,22 +1311,23 @@ function doWeb(doc, url) { return true; } - var uris = new Array(); + var urls = new Array(); for(var i in items) { - uris.push(i); + urls.push(i); } - Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) }, + Scholar.Utilities.processDocuments(urls, function(doc) { scrape(doc) }, function() { Scholar.done(); }, null); Scholar.wait(); } else { - var fmtCheck = /(?:\&|\?)Fmt=([0-9]+)/i - var m = fmtCheck.exec(doc.location.href); - if(m && (m[1] == "1" || m[1] == "2" || m[1] == "3")) { + if(doc.evaluate(''/html/body/span[@class="textMedium"]/table/tbody/tr/td[@class="headerBlack"]/strong//text()'', + doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) { scrape(doc); - } else if(m) { - Scholar.Utilities.loadDocument(doc.location.href.replace("Fmt="+m[1], "Fmt=1"), function(doc) { scrape(doc); Scholar.done(); }, null); + } else { + var newURL = doc.location.href.replace(/RQT=[0-9]+/i, "RQT=309"); + newURL = newURL.replace(/Fmt=[0-9]+/i, "Fmt=1"); + Scholar.Utilities.loadDocument(newURL, function(doc) { scrape(doc); 
Scholar.done(); }, null); Scholar.wait(); } }