- make proquest work better behind proxies
- improved frame support
This commit is contained in:
parent
c5ec34d6ae
commit
0e63958f96
|
@ -88,6 +88,17 @@ Scholar_Ingester_Interface.scrapeThisPage = function(saveLocation) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Scholar_Ingester_Interface.searchFrames = function(rootDoc, searchDoc) {
|
||||||
|
for each(var frame in rootDoc.frames) {
|
||||||
|
if(frame.document == searchDoc ||
|
||||||
|
(frame.document.frames && searchFrames(frame, searchDoc))) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* An event handler called when a new document is loaded. Creates a new document
|
* An event handler called when a new document is loaded. Creates a new document
|
||||||
* object, and updates the status of the capture icon
|
* object, and updates the status of the capture icon
|
||||||
|
@ -118,9 +129,13 @@ Scholar_Ingester_Interface.contentLoad = function(event) {
|
||||||
var data = Scholar_Ingester_Interface._getData(browser);
|
var data = Scholar_Ingester_Interface._getData(browser);
|
||||||
|
|
||||||
// if there's already a scrapable page in the browser window, and it's
|
// if there's already a scrapable page in the browser window, and it's
|
||||||
// still there, return
|
// still there, ensure it is actually part of the page, then return
|
||||||
if(data.translators && data.translators.length && data.document.location) {
|
if(data.translators && data.translators.length && data.document.location) {
|
||||||
return;
|
if(Scholar_Ingester_Interface.searchFrames(rootDoc, data.document)) {
|
||||||
|
return;
|
||||||
|
} else {
|
||||||
|
data.document = null;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// get translators
|
// get translators
|
||||||
|
|
|
@ -259,12 +259,12 @@ Scholar.Utilities.Ingester._protocolRe = new RegExp();
|
||||||
Scholar.Utilities.Ingester._protocolRe.compile("^(?:(?:http|https|ftp):|[^:]*/)", "i");
|
Scholar.Utilities.Ingester._protocolRe.compile("^(?:(?:http|https|ftp):|[^:]*/)", "i");
|
||||||
Scholar.Utilities.Ingester.prototype.processDocuments = function(urls, processor, done, exception) {
|
Scholar.Utilities.Ingester.prototype.processDocuments = function(urls, processor, done, exception) {
|
||||||
if(this.translate.locationIsProxied) {
|
if(this.translate.locationIsProxied) {
|
||||||
for(i in urls) {
|
for(var i in urls) {
|
||||||
if(this.translate.locationIsProxied) {
|
if(this.translate.locationIsProxied) {
|
||||||
urls[i] = Scholar.Ingester.ProxyMonitor.properToProxy(urls[i]);
|
urls[i] = Scholar.Ingester.ProxyMonitor.properToProxy(urls[i]);
|
||||||
}
|
}
|
||||||
// check for a protocol colon
|
// check for a protocol colon
|
||||||
if(!Scholar.Utilities.Ingester._protocolRe.test(uris[i])) {
|
if(!Scholar.Utilities.Ingester._protocolRe.test(urls[i])) {
|
||||||
throw("invalid URL in processDocuments");
|
throw("invalid URL in processDocuments");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
43
scrapers.sql
43
scrapers.sql
|
@ -1,4 +1,4 @@
|
||||||
-- 54
|
-- 55
|
||||||
|
|
||||||
-- Set the following timestamp to the most recent scraper update date
|
-- Set the following timestamp to the most recent scraper update date
|
||||||
REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-08-15 15:42:00'));
|
REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-08-15 15:42:00'));
|
||||||
|
@ -1100,15 +1100,25 @@ function doWeb(doc, url) {
|
||||||
}
|
}
|
||||||
}');
|
}');
|
||||||
|
|
||||||
REPLACE INTO "translators" VALUES ('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '2006-06-26 16:01:00', 4, 'ProQuest', 'Simon Kornblith', '^http://proquest\.umi\.com/pqdweb\?((?:.*\&)?did=.*&Fmt=[0-9]|(?:.*\&)Fmt=[0-9].*&did=|(?:.*\&)searchInterface=)',
|
REPLACE INTO "translators" VALUES ('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '2006-06-26 16:01:00', 4, 'ProQuest', 'Simon Kornblith', '^http://[^/]+/pqdweb\?((?:.*\&)?did=.*&Fmt=[0-9]|(?:.*\&)Fmt=[0-9].*&did=|(?:.*\&)searchInterface=)',
|
||||||
'function detectWeb(doc, url) {
|
'function detectWeb(doc, url) {
|
||||||
if(doc.title == "Results") {
|
var namespace = doc.documentElement.namespaceURI;
|
||||||
return "multiple";
|
var nsResolver = namespace ? function(prefix) {
|
||||||
} else {
|
if (prefix == ''x'') return namespace; else return null;
|
||||||
return "magazineArticle";
|
} : null;
|
||||||
|
|
||||||
|
if(doc.evaluate(''//img[substring(@src, string-length(@src)-32) = "/images/common/logo_proquest.gif" or substring(@src, string-length(@src)-38) = "/images/common/logo_proquest_small.gif"]'',
|
||||||
|
doc, nsResolver, XPathResult.ANY_TYPE, null)) {
|
||||||
|
if(doc.title == "Results") {
|
||||||
|
return "multiple";
|
||||||
|
} else {
|
||||||
|
return "magazineArticle";
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}',
|
}',
|
||||||
'function scrape(doc) {
|
'function scrape(doc) {
|
||||||
|
Scholar.Utilities.debug(doc.getElementsByTagName("body")[0].innerHTML);
|
||||||
|
|
||||||
var namespace = doc.documentElement.namespaceURI;
|
var namespace = doc.documentElement.namespaceURI;
|
||||||
var nsResolver = namespace ? function(prefix) {
|
var nsResolver = namespace ? function(prefix) {
|
||||||
if (prefix == ''x'') return namespace; else return null;
|
if (prefix == ''x'') return namespace; else return null;
|
||||||
|
@ -1128,7 +1138,7 @@ REPLACE INTO "translators" VALUES ('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '2006
|
||||||
// there are sometimes additional tags representing highlighting
|
// there are sometimes additional tags representing highlighting
|
||||||
var author = Scholar.Utilities.getNodeString(doc, elmt, ''.//text()'', nsResolver);
|
var author = Scholar.Utilities.getNodeString(doc, elmt, ''.//text()'', nsResolver);
|
||||||
if(author) {
|
if(author) {
|
||||||
newItem.creators.push(Scholar.Utilities.cleanAuthor(author, "author", true));
|
newItem.creators.push(Scholar.Utilities.cleanAuthor(author, "author"));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1275,7 +1285,7 @@ function doWeb(doc, url) {
|
||||||
|
|
||||||
// Require link to match this
|
// Require link to match this
|
||||||
var tagRegexp = new RegExp();
|
var tagRegexp = new RegExp();
|
||||||
tagRegexp.compile(''^http://[^/]+/pqdweb\\?((?:.*&)?did=.*&Fmt=[12]|(?:.*&)Fmt=[12].*&did=)'');
|
tagRegexp.compile(''^http://[^/]+/pqdweb\\?((?:.*&)?did=.*&Fmt=[12][^0-9]|(?:.*&)Fmt=[12][^0-9].*&did=)'');
|
||||||
|
|
||||||
var tableRows = doc.evaluate(''//tr[@class="rowUnMarked"]'',
|
var tableRows = doc.evaluate(''//tr[@class="rowUnMarked"]'',
|
||||||
doc, nsResolver, XPathResult.ANY_TYPE, null);
|
doc, nsResolver, XPathResult.ANY_TYPE, null);
|
||||||
|
@ -1301,22 +1311,23 @@ function doWeb(doc, url) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
var uris = new Array();
|
var urls = new Array();
|
||||||
for(var i in items) {
|
for(var i in items) {
|
||||||
uris.push(i);
|
urls.push(i);
|
||||||
}
|
}
|
||||||
|
|
||||||
Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) },
|
Scholar.Utilities.processDocuments(urls, function(doc) { scrape(doc) },
|
||||||
function() { Scholar.done(); }, null);
|
function() { Scholar.done(); }, null);
|
||||||
|
|
||||||
Scholar.wait();
|
Scholar.wait();
|
||||||
} else {
|
} else {
|
||||||
var fmtCheck = /(?:\&|\?)Fmt=([0-9]+)/i
|
if(doc.evaluate(''/html/body/span[@class="textMedium"]/table/tbody/tr/td[@class="headerBlack"]/strong//text()'',
|
||||||
var m = fmtCheck.exec(doc.location.href);
|
doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) {
|
||||||
if(m && (m[1] == "1" || m[1] == "2" || m[1] == "3")) {
|
|
||||||
scrape(doc);
|
scrape(doc);
|
||||||
} else if(m) {
|
} else {
|
||||||
Scholar.Utilities.loadDocument(doc.location.href.replace("Fmt="+m[1], "Fmt=1"), function(doc) { scrape(doc); Scholar.done(); }, null);
|
var newURL = doc.location.href.replace(/RQT=[0-9]+/i, "RQT=309");
|
||||||
|
newURL = newURL.replace(/Fmt=[0-9]+/i, "Fmt=1");
|
||||||
|
Scholar.Utilities.loadDocument(newURL, function(doc) { scrape(doc); Scholar.done(); }, null);
|
||||||
Scholar.wait();
|
Scholar.wait();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user