zotero/translators/Google Scholar.js
2011-08-11 07:01:19 +00:00

541 lines
14 KiB
JavaScript

{
"translatorID": "57a00950-f0d1-4b41-b6ba-44ff0fc30289",
"label": "Google Scholar",
"creator": "Simon Kornblith, Frank Bennett",
"target": "^https?://scholar\\.google\\.(?:com|com?\\.[a-z]{2}|[a-z]{2}|co\\.[a-z]{2})/scholar(?:_case)*",
"minVersion": "2.1.9",
"maxVersion": "",
"priority": 100,
"inRepository": true,
"translatorType": 4,
"browserSupport": "gcs",
"lastUpdated": "2011-07-04 13:18:22"
}
/*
* Test pages
*
* Searches of Google Scholar with the following terms should yield a folder
* icon that works. Check that unlinked ([CITATION]) items that provide
* no BibTeX data (there is currently one under "Marbury v. Madison",
* and "clifford" seems to be a good source of garbage) are
* dropped from the listings:
*
* marbury v madison
* kelo
* smith
* view of the cathedral
* clifford
*
* "How cited" pages should NOT yield a page or folder icon. The
* URLs for these pages currently look like this:
*
* http://scholar.google.co.jp/scholar_case?about=1101424605047973909&q=kelo&hl=en&as_sdt=2002
*
* Case pages should present a document icon that works:
*
* http://scholar.google.co.jp/scholar_case?case=18273389148555376997&hl=en&as_sdt=2002&kqfp=13204897074208725174&kql=186&kqpfp=16170611681001262513#kq
*/
/*
* ###############################
* ### detectWeb() and doWeb() ###
* ###############################
*/
// Next value to hand out as an itemID. ItemFactory.prototype.saveItem stamps
// each item it builds from a volume/reporter/page citation with the current
// value (as a string), increments it, and then uses those IDs to fill the
// seeAlso arrays of the items it created together.
var bogusItemID = 1;
/*
 * Decide which icon to offer for the current page.
 *
 * Returns "multiple" for any URL that is not a scholar_case URL,
 * false for scholar_case URLs carrying an about= parameter, and
 * "case" for every other scholar_case URL.
 */
var detectWeb = function (doc, url) {
	// Everything outside scholar_case is treated as a result listing
	if (!url.match(/scholar_case/)) {
		return "multiple";
	}
	// scholar_case URLs with about= get no icon at all
	return url.match(/about=/) ? false : "case";
};
/*
 * Entry point: dispatch to the case scraper or the listing scraper.
 *
 * For scholar_case URLs the current document is scraped directly.
 * Otherwise, if the page already contains scholar.bib links it is scraped
 * as a listing; if not, the preferences URL (hl=en, scis=yes, scisf=4) is
 * loaded, the scisig value found there is appended, and the document loaded
 * from that final URL is scraped as a listing. Zotero.wait() is called for
 * both listing paths.
 */
function doWeb(doc, url) {
	var resolver = doc.createNSResolver(doc.documentElement);

	if (url.match(/scholar_case/)) {
		scrapeCase(doc, url);
		return;
	}

	var bibtexAnchor = doc.evaluate('//a[contains(@href, "scholar.bib")]',
		doc, resolver, XPathResult.ANY_TYPE, null).iterateNext();

	if (bibtexAnchor) {
		scrapeListing(doc);
	} else {
		// Drop any existing hl= parameter, then point the URL at the preferences page
		var prefsUrl = url.replace(/hl\=[^&]*&?/, "");
		prefsUrl = prefsUrl.replace("scholar?", "scholar_setprefs?hl=en&scis=yes&scisf=4&submit=Save+Preferences&");
		var noop = function () {};
		Zotero.Utilities.processDocuments(prefsUrl, function (prefsDoc) {
			var sigInput = prefsDoc.evaluate('//input[@name="scisig"]',
				prefsDoc, resolver, XPathResult.ANY_TYPE, null).iterateNext();
			prefsUrl = prefsUrl + "&scisig=" + sigInput.value;
			Zotero.Utilities.processDocuments(prefsUrl, function (listingDoc) {
				scrapeListing(listingDoc);
			}, noop);
		}, noop);
	}
	Zotero.wait();
}
/*
* #########################
* ### Scraper Functions ###
* #########################
*/
/*
 * Scrape a search-result listing.
 *
 * Walks the result titles (h3 under div.gs_r) in step with the citelet
 * spans (span.gs_a) and the scholar.bib links, builds an ItemFactory for
 * each entry, keeps the ones whose hasUsefulData() is truthy, and offers
 * them through Zotero.selectItems. The factories for the selected keys are
 * handed to processFactories. Always returns true.
 */
var scrapeListing = function (doc) {
	var resolver = doc.createNSResolver(doc.documentElement);
	var query = function (expression, contextNode) {
		return doc.evaluate(expression, contextNode, resolver, XPathResult.ANY_TYPE, null);
	};

	// Three parallel node lists, consumed one entry per title
	var titleNodes = query('//div[@class="gs_r"]//h3', doc);
	var citeletNodes = query('//span[@class="gs_a"]', doc);
	var bibtexNodes = query('//a[contains(@href, "scholar.bib")]', doc);

	var labels = [];
	var factories = [];
	var titleNode;

	for (titleNode = titleNodes.iterateNext(); titleNode; titleNode = titleNodes.iterateNext()) {
		var titleText = titleNode.textContent;
		var citeletText = citeletNodes.iterateNext().textContent;
		var bibtexHref = bibtexNodes.iterateNext().href;

		// The first link inside the title, if any, becomes the attachment link
		var linkNode = query('.//a', titleNode).iterateNext();
		var links = linkNode ? [linkNode.href] : [];

		var candidate = new ItemFactory(citeletText, links, titleText, bibtexHref);
		if (candidate.hasUsefulData()) {
			// Label shown in the selection list
			var label = titleText;
			if (candidate.hyphenSplit.length) {
				label = titleText + " (" + candidate.trailingInfo + ")";
			}
			labels.push(label);
			factories.push(candidate);
		}
	}

	Zotero.selectItems(labels, function (selected) {
		if (!selected) {
			return false;
		}
		var chosen = [];
		for (var index in selected) {
			chosen.push(factories[index]);
		}
		processFactories(chosen);
	});
	return true;
};
/*
 * Work through the given factories one at a time (the array is consumed
 * with shift()), calling Zotero.done() once it is empty.
 *
 * For each factory the court and volume/reporter/page data are parsed from
 * the citelet. If a reporter was found the item is saved immediately.
 * Otherwise the BibTeX data is requested: a truthy result is run through
 * the BibTeX import translator, a falsy one leads to a docket-number
 * lookup followed by saveItem().
 */
function processFactories(factories) {
	if (!factories.length) {
		Zotero.done();
		return;
	}

	var current = factories.shift();
	var proceed = function () {
		processFactories(factories);
	};
	var saveAndProceed = function () {
		current.saveItem();
		proceed();
	};

	current.getCourt();
	current.getVolRepPag();

	if (current.hasReporter()) {
		// Reporter found in the citelet: no need to fetch the BibTeX object
		saveAndProceed();
		return;
	}

	// BibTeX is the only supplementary translator used here
	var pageAttachments;
	var importer = Zotero.loadTranslator("import");
	importer.setTranslator("9cb70025-a888-4a29-a210-93ec52da40d4");
	importer.setHandler("itemDone", function (obj, item) {
		item.attachments = pageAttachments;
		item.complete();
	});

	current.getBibtexData(function (bibtex) {
		if (bibtex) {
			// Usable BibTeX: let the import translator build the item
			pageAttachments = current.getAttachments("Page");
			importer.setString(bibtex);
			importer.setHandler("done", proceed);
			importer.translate();
			return;
		}
		// No BibTeX: fall back to citelet metadata, looking up the
		// docket number first when no reporter is available.
		if (current.hasReporter()) {
			saveAndProceed();
		} else {
			current.getDocketNumber(null, saveAndProceed);
		}
	});
}
/*
 * Scrape a single case page.
 *
 * The citelet is read from the element with id="gsl_reference"; it looks
 * roughly like this:
 *
 *   Powell v. McCormack, 395 US 486 - Supreme Court 1969
 *
 * An ItemFactory is built from that text with the page URL as its only
 * attachment link, the citelet is parsed step by step (date, court,
 * volume/reporter/page, title) and the item is saved. When no reporter is
 * found, the docket number is looked up in the current document. Nothing
 * happens when the reference element is missing.
 *
 * The original also constructed a Zotero.Item here that was never used;
 * that dead local has been removed (saveItem creates the items itself).
 */
var scrapeCase = function (doc, url) {
	var nsResolver = doc.createNSResolver(doc.documentElement);
	var refFrag = doc.evaluate('//div[@id="gsl_reference"]', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
	if (!refFrag) {
		return;
	}
	var factory = new ItemFactory(refFrag.textContent, [url]);
	factory.repairCitelet();
	factory.getDate();
	factory.getCourt();
	factory.getVolRepPag();
	if (!factory.hasReporter()) {
		// Look for docket number in the current document
		factory.getDocketNumber(doc);
	}
	factory.getTitle();
	factory.saveItem();
};
/*
* ####################
* ### Item Factory ###
* ####################
*/
/*
 * Holds the raw strings for one result (citelet, title, BibTeX link,
 * attachment links) together with the values parsed out of them.
 *
 *   v   - scalar values; the truthy ones are copied onto the item by
 *         saveItemCommonVars()
 *   vv  - list values; volRepPag collects volume/reporter/pages objects
 */
var ItemFactory = function (citeletString, attachmentLinks, titleString, bibtexLink) {
	// Scalar values; date starts as undefined, everything else unset is false
	this.v = {
		title: titleString,
		number: false,
		court: false,
		extra: false,
		date: undefined,
		jurisdiction: false,
		docketNumber: false
	};
	// List values
	this.vv = { volRepPag: [] };
	// Links handed in by the caller
	this.attachmentLinks = attachmentLinks;
	// Working strings
	this.citelet = citeletString;
	this.bibtexLink = bibtexLink;
	this.bibtexData = undefined;
	this.trailingInfo = false;
	// Citelet fragments, filled in by the parsing steps
	this.hyphenSplit = false;
	this.commaSplit = false;
};
/*
 * If the citelet has no " - " separator, turn the first ", Word:" sequence
 * (comma, whitespace, capitalised word followed by a colon) into
 * " - Word:" so that the later hyphen split finds a separator.
 */
ItemFactory.prototype.repairCitelet = function () {
	var separatorAt = this.citelet.search(/\s+-\s+/);
	if (separatorAt !== -1) {
		return;
	}
	this.citelet = this.citelet.replace(/,\s+([A-Z][a-z]+:)/, " - $1");
};
/*
 * Re-case titles that contain an all-caps run of four or more letters:
 * the title is lower-cased, passed through
 * Zotero.Utilities.capitalizeTitle, and the first stand-alone "V" is
 * turned back into "v".
 */
ItemFactory.prototype.repairTitle = function () {
	var shoutingWord = /(?:[^a-z]|^)[A-Z]{4,}(?:[^a-z]|$)/;
	if (!this.v.title.match(shoutingWord)) {
		return;
	}
	var recased = Zotero.Utilities.capitalizeTitle(this.v.title.toLowerCase());
	this.v.title = recased.replace(/([^0-9a-z])V([^0-9a-z])/, "$1v$2");
};
/*
 * True when the citelet yields a date or, failing that, when its first
 * fragment looks like it contains author initials; false otherwise.
 * hasInitials() is only consulted when getDate() returns a falsy value.
 */
ItemFactory.prototype.hasUsefulData = function () {
	return !!(this.getDate() || this.hasInitials());
};
/*
 * True when the hyphen split is non-empty and its first fragment contains
 * a capital letter immediately followed by a space; false otherwise.
 */
ItemFactory.prototype.hasInitials = function () {
	var fragments = this.hyphenSplit;
	return Boolean(fragments.length && fragments[0].match(/[A-Z] /));
};
/*
 * True when at least one volume/reporter/pages entry has been collected
 * in vv.volRepPag.
 */
ItemFactory.prototype.hasReporter = function () {
	return this.vv.volRepPag.length > 0;
};
/*
 * Citelet parsing, step (1): split the citelet on " - " and pull out the year.
 *
 * On first use the citelet is split into hyphenSplit and a copy of its last
 * fragment is kept in trailingInfo. The fragments are then searched from
 * the end for one that finishes with four digits: those digits become
 * v.date, the fragment keeps only the text captured in front of them
 * (or becomes ""), and every fragment after it is dropped. When nothing
 * matches, v.date is set to false so the search is not repeated.
 *
 * Returns v.date (a four-digit string, or false).
 */
ItemFactory.prototype.getDate = function () {
	if (!this.hyphenSplit) {
		var fragments = this.citelet.split(/\s+-\s+/);
		this.hyphenSplit = fragments;
		this.trailingInfo = fragments.slice(-1);
	}

	// A truthy date, or an explicit false, means the search already ran
	if (this.v.date || this.v.date === false) {
		return this.v.date;
	}

	this.v.date = false;
	var index = this.hyphenSplit.length;
	while (index-- > 0) {
		var found = this.hyphenSplit[index].match(/(?:(.*)\s+)*([0-9]{4})$/);
		if (!found) {
			continue;
		}
		this.v.date = found[2];
		this.hyphenSplit[index] = found[1] || "";
		this.hyphenSplit = this.hyphenSplit.slice(0, index + 1);
		break;
	}
	return this.v.date;
};
/*
 * Citelet parsing, step (2): take the court name from the last remaining
 * hyphen-split fragment (the fragment is removed from hyphenSplit).
 *
 * A trailing comma is stripped and a trailing ellipsis character is
 * replaced with "Court". A leading "Word:" prefix, if present, is
 * recorded in v.extra as "{:jurisdiction: Word}"; the remainder becomes
 * v.court with every underscore turned into a space.
 *
 * The underscore replacement uses a global regular expression. The previous
 * form, replace("_", " ", "g"), depended on a non-standard third "flags"
 * argument; engines that ignore it replace only the first underscore.
 *
 * Returns v.court.
 */
ItemFactory.prototype.getCourt = function () {
	var s, m;
	s = this.hyphenSplit.pop().replace(/,\s*$/, "").replace(/\u2026\s*$/, "Court");
	m = s.match(/(?:([a-zA-Z]+):\s*)*(.*)/);
	if (m) {
		this.v.court = m[2].replace(/_/g, " ");
		if (m[1]) {
			this.v.extra = "{:jurisdiction: " + m[1] + "}";
		}
	}
	return this.v.court;
};
/*
 * Citelet parsing, step (3): collect "volume reporter pages" citations.
 *
 * The last hyphen-split fragment is split on commas into commaSplit, which
 * is then read from the end. Each piece of the form
 * "<digits> <text> <text>" is removed from commaSplit and, unless it is
 * rejected, appended to vv.volRepPag as {volume, reporter, pages}. A piece
 * is rejected when its pages part does not end in a digit and it is either
 * not the first piece or a citation has already been accepted. Reading
 * stops at the first piece that does not have that form.
 */
ItemFactory.prototype.getVolRepPag = function () {
	if (!this.hyphenSplit.length) {
		return;
	}
	var lastFragment = this.hyphenSplit[this.hyphenSplit.length - 1];
	this.commaSplit = lastFragment.split(/\s*,\s+/);

	var accepted = false;
	var position = this.commaSplit.length;
	while (position-- > 0) {
		var parts = this.commaSplit[position].match(/^([0-9]+)\s+(.*)\s+(.*)/);
		if (!parts) {
			break;
		}
		// The piece is consumed whether or not it ends up being kept
		this.commaSplit.pop();
		var pages = parts[3].replace(/\s*$/, "");
		var endsInDigit = pages.match(/[0-9]$/);
		if (!endsInDigit && (position > 0 || accepted)) {
			continue;
		}
		accepted = true;
		this.vv.volRepPag.push({
			volume: parts[1],
			reporter: parts[2],
			pages: pages
		});
	}
};
/*
 * Citelet parsing, step (4) [optional]: rebuild the title from whatever
 * is left in commaSplit, joined with ", ". Does nothing while
 * commaSplit is still unset.
 */
ItemFactory.prototype.getTitle = function () {
	var remaining = this.commaSplit;
	if (!remaining) {
		return;
	}
	this.v.title = remaining.join(", ");
};
/*
 * Read the docket number from a case document into v.docketNumber.
 *
 * doc      - the case document; when falsy, the first attachment link is
 *            loaded with Zotero.Utilities.processDocuments and the lookup
 *            is repeated on the loaded document
 * callback - optional; invoked with no arguments once the lookup is done
 *
 * The number is taken from the first <center> element that follows a
 * <center> containing h3#gsl_case_name, with a leading "No" marker and a
 * trailing period stripped.
 *
 * Fix: the asynchronous path used to call me.getDocumentNumber, which does
 * not exist, so the fetch always ended in a TypeError.
 */
ItemFactory.prototype.getDocketNumber = function (doc, callback) {
	if (!doc) {
		// Needs doc fetch and xpath
		var me = this;
		Zotero.Utilities.processDocuments(this.attachmentLinks[0],
			function (fetchedDoc) {
				if (fetchedDoc) {
					me.getDocketNumber(fetchedDoc, callback);
				} else if (callback) {
					// Nothing to inspect; still let the caller continue
					callback();
				}
			}, function () {});
		return;
	}
	var nsResolver = doc.createNSResolver(doc.documentElement);
	var docNumFrag = doc.evaluate('//center[preceding-sibling::center//h3[@id="gsl_case_name"]]', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
	if (docNumFrag) {
		// NOTE(review): the "." after [Nn][Oo] is unescaped and so matches any
		// character, not just a period - confirm whether that is intended.
		this.v.docketNumber = docNumFrag.textContent.replace(/^\s*[Nn][Oo](?:.|\s+)\s*/, "").replace(/\.\s*$/, "");
	}
	if (callback) {
		callback();
	}
};
/*
 * Build one text/html attachment descriptor per attachment link.
 *
 * doctype - appended to "Google Scholar Linked " to form each title
 *
 * Returns a new array of {title, type, url} objects.
 */
ItemFactory.prototype.getAttachments = function (doctype) {
	var links = this.attachmentLinks;
	var total = links.length;
	var describe = function (link) {
		return {
			title: "Google Scholar Linked " + doctype,
			type: "text/html",
			url: link
		};
	};
	var result = [];
	var position = 0;
	while (position < total) {
		result.push(describe(links[position]));
		position += 1;
	}
	return result;
};
/*
 * Replace the current item's attachments with the list produced by
 * getAttachments(doctype).
 */
ItemFactory.prototype.pushAttachments = function (doctype) {
	var target = this.item;
	var built = this.getAttachments(doctype);
	target.attachments = built;
};
/*
 * Hand the BibTeX record for this entry to callback.
 *
 * callback - called with the BibTeX text, or with false when the record
 *            has an empty title ("title={{}}")
 *
 * The record is fetched from bibtexLink with Zotero.Utilities.doGet the
 * first time and the outcome is kept in this.bibtexData, so a later call
 * answers from the stored value (including a stored false).
 *
 * Fix: the doGet callback used to assign to and read from its own `this`,
 * which is not the factory, so the result was never stored on the factory
 * and the callback threw where `this` is undefined. The factory is now
 * captured in a local, as getDocketNumber does.
 */
ItemFactory.prototype.getBibtexData = function (callback) {
	var me = this;
	// A truthy value or an explicit false means the fetch already happened
	if (this.bibtexData || this.bibtexData === false) {
		callback(this.bibtexData);
		return;
	}
	Zotero.Utilities.doGet(this.bibtexLink, function (bibtexData) {
		if (!bibtexData.match(/title={{}}/)) {
			me.bibtexData = bibtexData;
		} else {
			me.bibtexData = false;
		}
		callback(me.bibtexData);
	});
};
/*
 * Create and complete the "case" item(s) for this factory. Does nothing
 * when there is no title.
 *
 * With volume/reporter/page citations, one item is created per citation:
 * each gets the citation fields and the common values, a sequential string
 * itemID taken from bogusItemID, and the itemIDs of all its siblings in
 * seeAlso. Only the item for the last citation receives the "Judgement"
 * attachments. Without citations a single item is created with the common
 * values and the attachments.
 *
 * Fix: the inner loop counters j and jlen were never declared, making them
 * implicit globals (a ReferenceError under strict mode).
 */
ItemFactory.prototype.saveItem = function () {
	var i, ilen, j, jlen, key, citation, completed_items;
	if (!this.v.title) {
		return;
	}
	this.repairTitle();

	if (!this.vv.volRepPag.length) {
		this.item = new Zotero.Item("case");
		this.saveItemCommonVars();
		this.pushAttachments("Judgement");
		this.item.complete();
		return;
	}

	completed_items = [];
	for (i = 0, ilen = this.vv.volRepPag.length; i < ilen; i += 1) {
		citation = this.vv.volRepPag[i];
		this.item = new Zotero.Item("case");
		for (key in citation) {
			if (citation[key]) {
				this.item[key] = citation[key];
			}
		}
		this.saveItemCommonVars();
		// Attachments go on the item for the last citation only
		if (i === (this.vv.volRepPag.length - 1)) {
			this.pushAttachments("Judgement");
		}
		this.item.itemID = "" + bogusItemID;
		bogusItemID += 1;
		completed_items.push(this.item);
	}
	// Cross-link every item with all of its siblings before completing it
	for (i = 0, ilen = completed_items.length; i < ilen; i += 1) {
		for (j = 0, jlen = completed_items.length; j < jlen; j += 1) {
			if (i === j) {
				continue;
			}
			completed_items[i].seeAlso.push(completed_items[j].itemID);
		}
		completed_items[i].complete();
	}
};
/*
 * Copy every truthy value held in this.v onto the current item under the
 * same key; falsy values (false, undefined, "") are skipped.
 *
 * Fix: the loop variable is now declared locally; it used to be an
 * implicit global (a ReferenceError under strict mode).
 */
ItemFactory.prototype.saveItemCommonVars = function () {
	var key;
	for (key in this.v) {
		if (this.v[key]) {
			this.item[key] = this.v[key];
		}
	}
};
/** BEGIN TEST CASES **/
var testCases = [
{
"type": "web",
"url": "http://scholar.google.com/scholar?q=marbury&hl=en&btnG=Search&as_sdt=1%2C22&as_sdtp=on",
"items": "multiple"
},
{
"type": "web",
"url": "http://scholar.google.com/scholar?hl=en&q=kelo&btnG=Search&as_sdt=0%2C22&as_ylo=&as_vis=0",
"items": "multiple"
},
{
"type": "web",
"url": "http://scholar.google.com/scholar?hl=en&q=smith&btnG=Search&as_sdt=0%2C22&as_ylo=&as_vis=0",
"items": "multiple"
},
{
"type": "web",
"url": "http://scholar.google.com/scholar?hl=en&q=view+of+the+cathedral&btnG=Search&as_sdt=0%2C22&as_ylo=&as_vis=0",
"items": "multiple"
},
{
"type": "web",
"url": "http://scholar.google.com/scholar?hl=en&q=clifford&btnG=Search&as_sdt=0%2C22&as_ylo=&as_vis=0",
"items": "multiple"
},
{
"type": "web",
"url": "http://scholar.google.com/scholar_case?case=9834052745083343188&q=marbury+v+madison&hl=en&as_sdt=2,5",
"items": [
{
"itemType": "case",
"creators": [],
"notes": [],
"tags": [],
"seeAlso": [],
"attachments": [
{
"title": "Google Scholar Linked Judgement",
"type": "text/html",
"url": false
}
],
"volume": "5",
"reporter": "US",
"pages": "137",
"title": "Marbury v. Madison",
"court": "Supreme Court",
"date": "1803",
"itemID": "1",
"libraryCatalog": "Google Scholar"
}
]
}
]
/** END TEST CASES **/