From 0e63958f96d2d3747b53334ffa0da7bf81d17891 Mon Sep 17 00:00:00 2001
From: Simon Kornblith <simon@simonster.com>
Date: Thu, 24 Aug 2006 18:00:48 +0000
Subject: [PATCH] - make proquest work better behind proxies - improved frame
 support

---
 .../content/scholar/ingester/browser.js       | 19 +++++++-
 .../content/scholar/xpcom/utilities.js        |  4 +-
 scrapers.sql                                  | 43 ++++++++++++-------
 3 files changed, 46 insertions(+), 20 deletions(-)

diff --git a/chrome/chromeFiles/content/scholar/ingester/browser.js b/chrome/chromeFiles/content/scholar/ingester/browser.js
index cb702defd..d1d359a2a 100644
--- a/chrome/chromeFiles/content/scholar/ingester/browser.js
+++ b/chrome/chromeFiles/content/scholar/ingester/browser.js
@@ -88,6 +88,17 @@ Scholar_Ingester_Interface.scrapeThisPage = function(saveLocation) {
 	}
 }
 
+Scholar_Ingester_Interface.searchFrames = function(rootDoc, searchDoc) {	// true if searchDoc belongs to a frame nested under rootDoc
+	for each(var frame in rootDoc.frames) {	// the recursion below goes through the namespace object this function is assigned to
+		if(frame.document == searchDoc ||
+		   (frame.document.frames && Scholar_Ingester_Interface.searchFrames(frame, searchDoc))) {
+			return true;	// found directly or somewhere deeper in the frame tree
+		}
+	}
+	
+	return false;
+}
+
 /*
  * An event handler called when a new document is loaded. Creates a new document
  * object, and updates the status of the capture icon
@@ -118,9 +129,13 @@ Scholar_Ingester_Interface.contentLoad = function(event) {
 		var data = Scholar_Ingester_Interface._getData(browser);
 		
 		// if there's already a scrapable page in the browser window, and it's
-		// still there, return
+		// still there, ensure it is actually part of the page, then return
 		if(data.translators && data.translators.length && data.document.location) {
-			return;
+			if(Scholar_Ingester_Interface.searchFrames(rootDoc, data.document)) {
+				return;
+			} else {
+				data.document = null;
+			}
 		}
 		
 		// get translators
diff --git a/chrome/chromeFiles/content/scholar/xpcom/utilities.js b/chrome/chromeFiles/content/scholar/xpcom/utilities.js
index 00bc4542c..ba868aa67 100644
--- a/chrome/chromeFiles/content/scholar/xpcom/utilities.js
+++ b/chrome/chromeFiles/content/scholar/xpcom/utilities.js
@@ -259,12 +259,12 @@ Scholar.Utilities.Ingester._protocolRe = new RegExp();
 Scholar.Utilities.Ingester._protocolRe.compile("^(?:(?:http|https|ftp):|[^:]*/)", "i");
 Scholar.Utilities.Ingester.prototype.processDocuments = function(urls, processor, done, exception) {
 	if(this.translate.locationIsProxied) {
-		for(i in urls) {
+		for(var i in urls) {
 			if(this.translate.locationIsProxied) {
 				urls[i] = Scholar.Ingester.ProxyMonitor.properToProxy(urls[i]);
 			}
 			// check for a protocol colon
-			if(!Scholar.Utilities.Ingester._protocolRe.test(uris[i])) {
+			if(!Scholar.Utilities.Ingester._protocolRe.test(urls[i])) {
 				throw("invalid URL in processDocuments");
 			}
 		}
diff --git a/scrapers.sql b/scrapers.sql
index 4d5003f2f..6d7a6085b 100644
--- a/scrapers.sql
+++ b/scrapers.sql
@@ -1,4 +1,4 @@
--- 54
+-- 55
 
 -- Set the following timestamp to the most recent scraper update date
 REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-08-15 15:42:00'));
@@ -1100,15 +1100,25 @@ function doWeb(doc, url) {
 	}
 }');
 
-REPLACE INTO "translators" VALUES ('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '2006-06-26 16:01:00', 4, 'ProQuest', 'Simon Kornblith', '^http://proquest\.umi\.com/pqdweb\?((?:.*\&)?did=.*&Fmt=[0-9]|(?:.*\&)Fmt=[0-9].*&did=|(?:.*\&)searchInterface=)',
+REPLACE INTO "translators" VALUES ('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '2006-06-26 16:01:00', 4, 'ProQuest', 'Simon Kornblith', '^http://[^/]+/pqdweb\?((?:.*\&)?did=.*&Fmt=[0-9]|(?:.*\&)Fmt=[0-9].*&did=|(?:.*\&)searchInterface=)',
 'function detectWeb(doc, url) {
-	if(doc.title == "Results") {
-		return "multiple";
-	} else {
-		return "magazineArticle";
+	var namespace = doc.documentElement.namespaceURI;
+	var nsResolver = namespace ? function(prefix) {
+		if (prefix == ''x'') return namespace; else return null;
+	} : null;
+	
+	if(doc.evaluate(''//img[substring(@src, string-length(@src)-31) = "/images/common/logo_proquest.gif" or substring(@src, string-length(@src)-37) = "/images/common/logo_proquest_small.gif"]'',
+	                doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) {	// iterateNext() is null when no logo matches; the bare XPathResult is always truthy
+		if(doc.title == "Results") {
+			return "multiple";
+		} else {
+			return "magazineArticle";
+		}
 	}
 }',
 'function scrape(doc) {
+	Scholar.Utilities.debug(doc.getElementsByTagName("body")[0].innerHTML);
+	
 	var namespace = doc.documentElement.namespaceURI;
 	var nsResolver = namespace ? function(prefix) {
 		if (prefix == ''x'') return namespace; else return null;
@@ -1128,7 +1138,7 @@ REPLACE INTO "translators" VALUES ('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '2006
 		// there are sometimes additional tags representing higlighting
 		var author = Scholar.Utilities.getNodeString(doc, elmt, ''.//text()'', nsResolver);
 		if(author) {
-			newItem.creators.push(Scholar.Utilities.cleanAuthor(author, "author", true));
+			newItem.creators.push(Scholar.Utilities.cleanAuthor(author, "author"));
 		}
 	}
 	
@@ -1275,7 +1285,7 @@ function doWeb(doc, url) {
 		
 		// Require link to match this
 		var tagRegexp = new RegExp();
-		tagRegexp.compile(''^http://[^/]+/pqdweb\\?((?:.*&)?did=.*&Fmt=[12]|(?:.*&)Fmt=[12].*&did=)'');
+		tagRegexp.compile(''^http://[^/]+/pqdweb\\?((?:.*&)?did=.*&Fmt=[12][^0-9]|(?:.*&)Fmt=[12][^0-9].*&did=)'');
 		
 		var tableRows = doc.evaluate(''//tr[@class="rowUnMarked"]'',
 		                doc, nsResolver, XPathResult.ANY_TYPE, null);
@@ -1301,22 +1311,23 @@ function doWeb(doc, url) {
 			return true;
 		}
 		
-		var uris = new Array();
+		var urls = new Array();
 		for(var i in items) {
-			uris.push(i);
+			urls.push(i);
 		}
 		
-		Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) },
+		Scholar.Utilities.processDocuments(urls, function(doc) { scrape(doc) },
 			function() { Scholar.done(); }, null);
 		
 		Scholar.wait();
 	} else {
-		var fmtCheck = /(?:\&|\?)Fmt=([0-9]+)/i
-		var m = fmtCheck.exec(doc.location.href);
-		if(m && (m[1] == "1" || m[1] == "2" || m[1] == "3")) {
+		if(doc.evaluate(''/html/body/span[@class="textMedium"]/table/tbody/tr/td[@class="headerBlack"]/strong//text()'',
+						doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) {
 			scrape(doc);
-		} else if(m) {
-			Scholar.Utilities.loadDocument(doc.location.href.replace("Fmt="+m[1], "Fmt=1"), function(doc) { scrape(doc); Scholar.done(); }, null);
+		} else {
+			var newURL = doc.location.href.replace(/RQT=[0-9]+/i, "RQT=309");
+			newURL = newURL.replace(/Fmt=[0-9]+/i, "Fmt=1");
+			Scholar.Utilities.loadDocument(newURL, function(doc) { scrape(doc); Scholar.done(); }, null);
 			Scholar.wait();
 		}
 	}