Updating Sudoc.js with Sylvain's changes

This commit is contained in:
Matt Burton 2009-04-07 15:53:36 +00:00
parent 19ff243b92
commit d4a7e5bd37

View File

@ -2,7 +2,7 @@
"translatorID":"1b9ed730-69c7-40b0-8a06-517a89a3a278", "translatorID":"1b9ed730-69c7-40b0-8a06-517a89a3a278",
"translatorType":4, "translatorType":4,
"label":"Sudoc", "label":"Sudoc",
"creator":"Sean Takats and Michael Berkowitz", "creator":"Sean Takats and Michael Berkowitz, updated by Sylvain Machefert",
"target":"^http://www\\.sudoc\\.abes\\.fr", "target":"^http://www\\.sudoc\\.abes\\.fr",
"minVersion":"1.0.0b3.r1", "minVersion":"1.0.0b3.r1",
"maxVersion":"", "maxVersion":"",
@ -11,42 +11,59 @@
"lastUpdated":"2008-05-19 17:30:00" "lastUpdated":"2008-05-19 17:30:00"
} }
/**
 * Zotero translator entry point: decide what kind of Sudoc page `doc` is.
 *
 * The page is classified from the text of the header span found at
 * /html/body/div[2]/div/span:
 *   - "Résultats" / "Results"            -> a result list ("multiple")
 *   - "Notice complète" / "title data"   -> a single record, whose item type
 *     is derived from the file name of the document-type icon found at
 *     /html/body/div[2]/div[4]/span/img.
 *
 * @param {Document} doc - The page to inspect.
 * @param {string} url - The page URL (unused; kept for the translator interface).
 * @returns {string|undefined} "multiple", a Zotero item type name, or
 *     undefined when the page is not recognised (no header span, unknown
 *     header text, or no document-type icon on a record page).
 */
function detectWeb(doc, url) {
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function(prefix) {
		if (prefix == 'x') return namespace; else return null;
	} : null;

	// Icon file name fragment -> Zotero item type. Order matters: the first
	// fragment found in the icon URL wins, exactly as in an if/else chain.
	var iconTypes = [
		['article.gif', 'journalArticle'],
		['book.gif', 'book'],
		['handwriting.gif', 'manuscript'],
		['sons.gif', 'audioRecording'],
		['sound.gif', 'audioRecording'],
		['thesis.gif', 'thesis'],
		['map.gif', 'map']
	];

	// Declared locally: assigning to an undeclared `elt` would create an
	// implicit global (and throws in strict-mode code).
	var elt;

	var multxpath = '/html/body/div[2]/div/span';
	elt = doc.evaluate(multxpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
	if (!elt) {
		return undefined;
	}

	var content = elt.textContent;
	if ((content == "Résultats") || (content == "Results")) {
		return "multiple";
	}
	if ((content != "Notice complète") && (content != "title data")) {
		return undefined;
	}

	// Single record: the document-type icon tells us the item type.
	var xpathimage = '/html/body/div[2]/div[4]/span/img';
	elt = doc.evaluate(xpathimage, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
	if (!elt) {
		return undefined;
	}

	// getAttribute returns null when the image has no src; treat that like an
	// unknown icon instead of throwing on null.indexOf.
	var type = elt.getAttribute('src') || '';
	for (var i = 0; i < iconTypes.length; i++) {
		// `!== -1` rather than `> 0`, so a bare relative src such as
		// "thesis.gif" (match at index 0) is recognised too.
		if (type.indexOf(iconTypes[i][0]) !== -1) {
			return iconTypes[i][1];
		}
	}
	// Unknown icon: fall back to the most common record type.
	return "book";
}
@ -56,105 +73,154 @@ function scrape(doc) {
var nsResolver = namespace ? function(prefix) { var nsResolver = namespace ? function(prefix) {
if (prefix == 'x') return namespace; else return null; if (prefix == 'x') return namespace; else return null;
} : null; } : null;
var rowXpath = '//tr[td[@class="preslabel"]]'; var zXpath = '/html/body/span[@class="Z3988"]';
var tableRows = doc.evaluate(rowXpath, doc, nsResolver, XPathResult.ANY_TYPE, null); var eltCoins = doc.evaluate(zXpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
var tableRow; if (eltCoins = doc.evaluate(zXpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext())
var newItem = new Zotero.Item();
// TODO add other item types using detectWeb's icon checking code
newItem.itemType = "book";
var imgXpath = '/html/body/table/tbody/tr/td[1]/p/img/@src';
var imgsrc = doc.evaluate(imgXpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue;
if (imgsrc){
if (imgsrc.indexOf("icon_per.gif") > 0){
newItem.itemType = "book";
} else if (imgsrc.indexOf("icon_books.gif") > 0){
newItem.itemType = "book";
} else if (imgsrc.indexOf("icon_thesis.gif") > 0){
newItem.itemType = "thesis";
} else if (imgsrc.indexOf("icon_art.gif") > 0){
newItem.itemType = "journalArticle";
} else {
newItem.itemType = "book";
}
} else {
newItem.itemType = "book";
}
while (tableRow = tableRows.iterateNext())
{ {
var field = doc.evaluate('./td[1]', tableRow, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent; var coins = eltCoins.getAttribute('title');
var value = doc.evaluate('./td[2]', tableRow, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent;
field = Zotero.Utilities.superCleanString(field); var newItem = new Zotero.Item();
field = field.replace(/(\(s\))?\s*:\s*$/, ""); newItem.repository = false; // do not save repository
if (field == "Titre" || field == "Title"){ if(Zotero.Utilities.parseContextObject(coins, newItem))
Zotero.debug("title = " + value); {
value = value.replace(/(\[[^\]]+\])/g,""); if (newItem.title)
newItem.title = value.split(" / ")[0]; {
} // We use the same method as in detectWeb to find
if (field.substr(0,6) == "Auteur" || field.substr(0,6) == "Author"){ // the real type of document
var authors = doc.evaluate('./td[2]/a', tableRow, nsResolver, XPathResult.ANY_TYPE, null); var xpathimage = '/html/body/div[2]/div[4]/span/img';
var author; if (elt = doc.evaluate(xpathimage, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext())
while (author = authors.iterateNext()){ {
var type = elt.getAttribute('src');
var ZoteroType = '';
if (type.indexOf('article.gif') > 0)
{
zoteroType = 'journalArticle';
}
else if (type.indexOf('book.gif') > 0)
{
zoteroType = 'book';
}
else if (type.indexOf('handwriting.gif') > 0)
{
zoteroType = 'manuscript';
}
else if (type.indexOf('sons.gif') > 0)
{
zoteroType = "audioRecording";
}
else if (type.indexOf('sound.gif') > 0)
{
zoteroType = "audioRecording";
}
else if (type.indexOf('thesis.gif') > 0)
{
zoteroType = "thesis";
}
else if (type.indexOf('map.gif') > 0)
{
zoteroType = "map";
}
else
{
zoteroType = "book";
}
newItem.itemType = zoteroType;
}
// We need to correct some information where COinS is wrong
var rowXpath = '//tr[td[@class="rec_lable"]]';
var tableRows = doc.evaluate(rowXpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
var tableRow;
while (tableRow = tableRows.iterateNext())
{
var field = doc.evaluate('./td[1]', tableRow, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent;
var value = doc.evaluate('./td[2]', tableRow, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent;
field = Zotero.Utilities.superCleanString(field);
field = field.replace(/(\(s\))?\s*:\s*$/, "");
// With COins, only one author is taken, changed.
if (field.substr(0,6) == "Auteur" || field.substr(0,6) == "Author")
{
var authors = doc.evaluate('./td[2]/div', tableRow, nsResolver, XPathResult.ANY_TYPE, null);
newItem.creators = new Array();
while (author = authors.iterateNext())
{
var authorText = author.textContent; var authorText = author.textContent;
var authorParts = authorText.split(" (");
newItem.creators.push(Zotero.Utilities.cleanAuthor(authorParts[0], "author", true)); authorFunction = authorText.split(". ")[1];
authorText = authorText.split(". ")[0];
if (authorFunction)
{
authorFunction = Zotero.Utilities.superCleanString(authorFunction);
}
var zoteroFunction = '';
// TODO : Add other authority types
if (authorFunction == 'Traduction')
{
zoteroFunction = 'Translator';
}
else
{
zoteroFunction = 'Author';
}
newItem.creators.push(Zotero.Utilities.cleanAuthor(authorText, zoteroFunction, true));
}
} }
} // The serie isn't in COinS
if (field.substr(0,4) == "Date"){ else if (field.substr(0,5) == "Serie" || field.substr(0,10) == "Collection")
newItem.date = value; {
} newItem.series = value;
if (field.substr(0,7) == "Editeur" || field.substr(0,9) == "Publisher"){
var pubParts = value.split(" : ");
newItem.place = pubParts[0];
// needs error checking below to avoid error
if (pubParts[1] ) {
pubParts = pubParts[1].split(", ");
newItem.publisher = pubParts[0];
} }
} // When there's a subtitle, only main title is used !
if (field.substr(0,4) == "ISBN" || field.substr(0,4) == "ISSN"){ else if (field == "Titre" || field == "Title")
newItem.ISBN = value.split(" (")[0]; {
} var title = '';
if (field == "Description") { var titles = doc.evaluate('./td[2]/div/span', tableRow, nsResolver, XPathResult.ANY_TYPE, null);
var m = value.match(/([0-9]+) (?:[pP])/); while (partTitle = titles.iterateNext())
if (m) { {
newItem.pages = m[1]; partTitle = partTitle.textContent;
partTitle = partTitle.replace(/(\[[^\]]+\] ?)/g,"");
title = title + partTitle;
}
// Remove the author
title = title.split(" / ")[0];
newItem.title = title;
} }
} // Language not defined in COinS
if (field.substr(0,5) == "Serie" || field.substr(0,10) == "Collection"){ else if ( (field == "Langue") || (field == "Language") )
newItem.series = value; {
} newItem.language = value;
if (field.substr(0,6) == "Sujets" || field.substr(0,8) == "Subjects"){
var subjectElmts = doc.evaluate('./td[2]/a', tableRow, nsResolver, XPathResult.ANY_TYPE, null);
var subject;
var subjects;
while (subject = subjectElmts.iterateNext()){
subjects = subject.textContent.split(" -- ");
newItem.tags = newItem.tags.concat(subjects);
} }
} else if ( (field == "Résumé") || (field == "Abstract") )
if (field == "In" || field == "Dans"){ {
var jtitle = value.replace(/(\[[^\]]+\])/g,""); if (newItem.abstractNote)
jtitle = jtitle.split(" / ")[0]; {
jtitle = jtitle.split(" - ")[0]; newItem.abstractNote = newItem.abstractNote + " " + value;
newItem.publicationTitle = jtitle; }
//get page numbers else
var m = value.match(/(?:[Pp]\. )([0-9\-]+)/); {
if (m) { newItem.abstractNote = value;
newItem.pages = m[1]; }
} }
//get ISBN or ISSN else if (field == "Notes")
m = value.match(/(?:ISSN|ISBN) ([0-9Xx\-]+)/); {
if (m) { if (newItem.abstractNote)
newItem.ISBN = m[1]; {
newItem.ISSN = m[1]; newItem.abstractNote = newItem.abstractNote + " " + value;
}
else
{
newItem.abstractNote = value;
}
} }
// publicationTitle, issue/volume }
newItem.complete();
} }
// TODO Pages, Notes, Description, Language, Annexes }
} }
newItem.complete();
} }
function doWeb(doc, url) { function doWeb(doc, url) {
@ -162,39 +228,44 @@ function doWeb(doc, url) {
var nsResolver = namespace ? function(prefix) { var nsResolver = namespace ? function(prefix) {
if (prefix == 'x') return namespace; else return null; if (prefix == 'x') return namespace; else return null;
} : null; } : null;
var multxpath = '//a[@id="InitialFocusPoint"]'; var multxpath = '/html/body/div[2]/div/span';
var elt;
if (elt = doc.evaluate(multxpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) { if (elt = doc.evaluate(multxpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) {
var newUrl = doc.evaluate('//base/@href', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue; var content = elt.textContent;
var xpath = '//tr/td[3]/a'; if ( (content == "Résultats") || (content == "Results") )
var elmts = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null); {
var elmt = elmts.iterateNext(); var newUrl = doc.evaluate('//base/@href', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue;
var links = new Array(); var xpath = '/html/body/div[2]/table/tbody/tr/td[3]/div/a';
var availableItems = new Array(); var elmts = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
var i = 0; var elmt = elmts.iterateNext();
do { var links = new Array();
var availableItems = new Array();
var i = 0;
do {
var link = doc.evaluate('./@href', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue; var link = doc.evaluate('./@href', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue;
var searchTitle = elmt.textContent; var searchTitle = elmt.textContent;
availableItems[i] = searchTitle; availableItems[i] = searchTitle;
links[i] = link; links[i] = link;
i++; i++;
} while (elmt = elmts.iterateNext()); } while (elmt = elmts.iterateNext());
var items = Zotero.selectItems(availableItems); var items = Zotero.selectItems(availableItems);
if(!items) { if(!items) {
return true; return true;
}
var uris = new Array();
for(var i in items) {
uris.push(newUrl + links[i]);
Zotero.debug(newUrl + links[i]);
}
Zotero.Utilities.processDocuments(uris, function(doc) { scrape(doc) },
function() { Zotero.done(); }, null);
Zotero.wait();
} }
var uris = new Array(); else if ( (content == "Notice complète") || (content == 'title data') )
for(var i in items) { {
uris.push(newUrl + links[i]); scrape(doc);
} }
Zotero.Utilities.processDocuments(uris, function(doc) { scrape(doc) },
function() { Zotero.done(); }, null);
Zotero.wait();
} }
else { }
scrape(doc);
}
}