From df1e58aac83199eabbc715122d2a5506e31dbcee Mon Sep 17 00:00:00 2001 From: Michael Berkowitz Date: Tue, 10 Jun 2008 18:46:01 +0000 Subject: [PATCH] -Adds Leon Krauthausen's MAB2 translator and his modifications to the Aleph translator for better German library support. --- scrapers.sql | 364 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 358 insertions(+), 6 deletions(-) diff --git a/scrapers.sql b/scrapers.sql index dc7e50161..d7f4aae71 100644 --- a/scrapers.sql +++ b/scrapers.sql @@ -22,7 +22,7 @@ -- Set the following timestamp to the most recent scraper update date -REPLACE INTO version VALUES ('repository', STRFTIME('%s', '2008-06-10 19:00:00')); +REPLACE INTO version VALUES ('repository', STRFTIME('%s', '2008-06-10 19:30:00')); REPLACE INTO translators VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '1.0.0b4.r1', '', '2008-03-21 20:00:00', '1', '100', '4', 'Amazon.com', 'Sean Takats and Michael Berkowitz', '^https?://(?:www\.)?amazon', 'function detectWeb(doc, url) { @@ -1388,6 +1388,353 @@ function doWeb (doc, url) { Zotero.wait(); }'); +REPLACE INTO translators VALUES ('91acf493-0de7-4473-8b62-89fd141e6c74', '1.0.0b3.r1', '', '2008-06-10 19:30:00', '1', '100', '1', 'MAB2', 'Simon Kornblith. Adaptions for MAB2: Leon Krauthausen (FUB)', 'mab2', +'function detectImport() { + var mab2RecordRegexp = /^[0-9]{3}[a-z ]{2}[a-z ]{3}$/ + var read = Zotero.read(8); + if(mab2RecordRegexp.test(read)) { + return true; + } +}', +'var fieldTerminator = "\x1E"; +var recordTerminator = "\x1D"; +var subfieldDelimiter = "\x1F"; + +/* +* CLEANING FUNCTIONS +*/ + +// general purpose cleaning +function clean(value) { + value = value.replace(/^[\s\.\,\/\:;]+/, ''''); + value = value.replace(/[\s\.\,\/\:;]+$/, ''''); + value = value.replace(/<<+/g, ''''); + value = value.replace(/>>+/g, ''''); + value = value.replace(/ +/g, '' ''); + + var char1 = value[0]; + var char2 = value[value.length-1]; + if((char1 == "[" && char2 == "]") || (char1 == "(" && char2 == ")")) { + // chop of extraneous characters + return value.substr(1, value.length-2); + } + + return value; +} + +function cleanTag(value) { + // Chop off Authority-IDs + value = value.slice(0, value.indexOf(''|'')); + return value; +} + +// number extraction +function pullNumber(text) { + var pullRe = /[0-9]+/; + var m = pullRe.exec(text); + if(m) { + return m[0]; + } +} + +// ISBN extraction +function pullISBN(text) { + var pullRe = /[0-9X\-]+/; + var m = pullRe.exec(text); + if(m) { + return m[0]; + } +} + +// corporate author extraction +function corpAuthor(author) { + return {lastName:author, fieldMode:true}; +} + +// regular author extraction +function author(author, type, useComma) { + return Zotero.Utilities.cleanAuthor(author, type, useComma); +} + +// MAB2 author extraction +// evaluates subfield $b and sets authType +function authorMab(author, authType, useComma) { + if(!authType) var authType=''author''; + authType = authType.replace(''[Hrsg.]'', ''editor''); + authType = authType.replace(''[Mitarb.]'', ''contributor''); + authType = authType.replace(''[Übers.]'', ''translator''); + return Zotero.Utilities.cleanAuthor(author, authType, useComma); +} +/* +* END CLEANING FUNCTIONS +*/ + +var record = function() { + this.directory = new Object(); + this.leader = ""; + this.content = ""; + + // defaults + this.indicatorLength = 2; + this.subfieldCodeLength = 2; +} + +// import a binary MAB2 record into this record +record.prototype.importBinary = function(record) { + // get directory and leader + var directory = record.substr(0, record.indexOf(fieldTerminator)); + this.leader = directory.substr(0, 24); + var directory = directory.substr(24); + + // get various data + this.indicatorLength = parseInt(this.leader[10], 10); + this.subfieldCodeLength = parseInt(this.leader[11], 10); + var baseAddress = parseInt(this.leader.substr(12, 5), 10); + + // get record data + var contentTmp = record.substr(baseAddress); + + // MARC wants one-byte characters, so when we have multi-byte UTF-8 + // sequences, add null characters so that the directory shows up right. we + // can strip the nulls later. + this.content = ""; + for(i=0; i 0x00FFFF) { + this.content += "\x00\x00\x00"; + } else if(contentTmp.charCodeAt(i) > 0x0007FF) { + this.content += "\x00\x00"; + } else if(contentTmp.charCodeAt(i) > 0x00007F) { + this.content += "\x00"; + } + } + + // read directory + for(var i=0; i this.indicatorLength) { + indicator = indicator.substr(0, this.indicatorLength); + } else if(indicator.length != this.indicatorLength) { + indicator = Zotero.Utilities.lpad(indicator, " ", this.indicatorLength); + } + + // add terminator + value = indicator+value+fieldTerminator; + + // add field to directory + if(!this.directory[field]) { + this.directory[field] = new Array(); + } + this.directory[field].push([this.content.length, value.length]); + + // add field to record + this.content += value; +} + +// get all fields with a certain field number +record.prototype.getField = function(field) { + field = parseInt(field, 10); + var fields = new Array(); + + // make sure fields exist + if(!this.directory[field]) { + return fields; + } + + // get fields + for(var i in this.directory[field]) { + var location = this.directory[field][i]; + + // add to array, replacing null characters + fields.push([this.content.substr(location[0], this.indicatorLength), + this.content.substr(location[0]+this.indicatorLength, + location[1]-this.indicatorLength-1).replace(/\x00/g, "")]); + } + + return fields; +} + +// get subfields from a field +record.prototype.getFieldSubfields = function(tag) { // returns a two-dimensional array of values + var fields = this.getField(tag); + var returnFields = new Array(); + + for(var i in fields) { + returnFields[i] = new Object(); + + var subfields = fields[i][1].split(subfieldDelimiter); + if (subfields.length == 1) { + returnFields[i]["?"] = fields[i][1]; + } else { + for(var j in subfields) { + if(subfields[j]) { + var subfieldIndex = subfields[j].substr(0, this.subfieldCodeLength-1); + if(!returnFields[i][subfieldIndex]) { + returnFields[i][subfieldIndex] = subfields[j].substr(this.subfieldCodeLength-1); + } + } + } + } + } + + return returnFields; +} + +// add field to DB +record.prototype._associateDBField = function(item, fieldNo, part, fieldName, execMe, arg1, arg2) { + var field = this.getFieldSubfields(fieldNo); + Zotero.debug(''MARC: found ''+field.length+'' matches for ''+fieldNo+part); + if(field) { + for(var i in field) { + var value = false; + for(var j=0; j 1) { + records[0] = holdOver + records[0]; + holdOver = records.pop(); // skip last record, since it''s not done + + for(var i in records) { + var newItem = new Zotero.Item(); + + // create new record + var rec = new record(); + rec.importBinary(records[i]); + rec.translate(newItem); + + newItem.complete(); + } + } else { + holdOver += text; + } + } +}'); + REPLACE INTO translators VALUES ('b662c6eb-e478-46bd- bad4-23cdfd0c9d67', '1.0.0b4.r5', '', '2008-06-10 19:00:00', '0', '100', '4', 'JurPC', 'Oliver Vivell and Michael Berkowitz', 'http://www.jurpc.de/', 'function detectWeb(doc, url) { var doctype = doc.evaluate(''//meta/@doctype'', doc, null,XPathResult.ANY_TYPE, null).iterateNext().textContent; @@ -12807,9 +13154,9 @@ REPLACE INTO translators VALUES ('5e3e6245-83da-4f55-a39b-b712df54a935', '1.0.0b Zotero.wait(); }'); -REPLACE INTO translators VALUES ('cf87eca8-041d-b954-795a-2d86348999d5', '1.0.0b3.r1', '', '2008-04-15 07:30:00', '1', '100', '4', 'Library Catalog (Aleph)', 'Simon Kornblith and Michael Berkowitz', 'https?://[^/]+/F(?:/[A-Z0-9\-]+(?:\?.*)?$|\?func=find|\?func=scan)', +REPLACE INTO translators VALUES ('cf87eca8-041d-b954-795a-2d86348999d5', '1.0.0b3.r1', '', '2008-06-10 19:30:00', '1', '100', '4', 'Library Catalog (Aleph)', 'Simon Kornblith and Michael Berkowitz', 'https?://[^/]+/F(?:/[A-Z0-9\-]+(?:\?.*)?$|\?func=find|\?func=scan|\?func=short)', 'function detectWeb(doc, url) { - var singleRe = new RegExp("^https?://[^/]+/F/[A-Z0-9\-]+\?.*(?:func=full-set-set.*\&format=[0-9]{3}|func=direct)"); + var singleRe = new RegExp("^https?://[^/]+/F/[A-Z0-9\-]+\?.*(?:func=full-set-set.*\&format=[0-9]{3}|func=direct|func=myshelf-full.*)"); if(singleRe.test(doc.location.href)) { return "book"; @@ -12823,7 +13170,8 @@ REPLACE INTO translators VALUES ('cf87eca8-041d-b954-795a-2d86348999d5', '1.0.0b } }', 'function doWeb(doc, url) { - var detailRe = new RegExp("^https?://[^/]+/F/[A-Z0-9\-]+\?.*(?:func=full-set-set.*\&format=[0-9]{3}|func=direct)"); + var detailRe = new RegExp("^https?://[^/]+/F/[A-Z0-9\-]+\?.*(?:func=full-set-set.*\&format=[0-9]{3}|func=direct|func=myshelf-full.*)"); + var mab2Opac = new RegExp("^https?://[^/]+berlin|193\.30\.112\.134|duisburg-essen/F/[A-Z0-9\-]+\?.*"); var uri = doc.location.href; var newUris = new Array(); @@ -12832,7 +13180,7 @@ REPLACE INTO translators VALUES ('cf87eca8-041d-b954-795a-2d86348999d5', '1.0.0b if (newuri == uri) newuri += "&format=001"; newUris.push(newuri); } else { - var itemRegexp = ''^https?://[^/]+/F/[A-Z0-9\-]+\?.*(?:func=full-set-set.*\&format=999|func=direct)'' + var itemRegexp = ''^https?://[^/]+/F/[A-Z0-9\-]+\?.*(?:func=full-set-set.*\&format=999|func=direct|func=myshelf-full.*)'' var items = Zotero.Utilities.getItemArray(doc, doc, itemRegexp, ''^[0-9]+$''); // ugly hack to see if we have any items @@ -12862,7 +13210,11 @@ REPLACE INTO translators VALUES ('cf87eca8-041d-b954-795a-2d86348999d5', '1.0.0b } } var translator = Zotero.loadTranslator("import"); - translator.setTranslator("a6ee60df-1ddc-4aae-bb25-45e0537be973"); + if(mab2Opac.test(uri)) { + translator.setTranslator("91acf493-0de7-4473-8b62-89fd141e6c74"); + } else { + translator.setTranslator("a6ee60df-1ddc-4aae-bb25-45e0537be973"); + } var marc = translator.getTranslatorObject(); Zotero.Utilities.processDocuments(newUris, function(newDoc) { var uri = newDoc.location.href;