From 554f74a0d26e562e23185b022542c694f2cedc7a Mon Sep 17 00:00:00 2001 From: Juan Grigera Date: Mon, 16 Jul 2012 01:28:55 -0300 Subject: [PATCH] Improvements to ISBN recognition code in recognizePDF.js --- chrome/content/zotero/recognizePDF.js | 125 +++++++++++++++++--------- 1 file changed, 84 insertions(+), 41 deletions(-) diff --git a/chrome/content/zotero/recognizePDF.js b/chrome/content/zotero/recognizePDF.js index a9f2db600..c4ef539a4 100644 --- a/chrome/content/zotero/recognizePDF.js +++ b/chrome/content/zotero/recognizePDF.js @@ -243,7 +243,7 @@ Zotero_RecognizePDF.Recognizer = function () {} * (function will be passed image as URL and must return text of CAPTCHA) */ Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, libraryID, callback, captchaCallback) { - const MAX_PAGES = 3; + const MAX_PAGES = 7; this._libraryID = libraryID; this._callback = callback; @@ -274,7 +274,7 @@ Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, libraryID, c } } catch (e) { - Zotero.debug("Error running pdfinfo", 1); + Zotero.debug("Error running pdftotext", 1); Zotero.debug(e, 1); } @@ -305,18 +305,18 @@ Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, libraryID, c inputStream.close(); cacheFile.remove(false); - // look for DOI + // look for DOI - Use only first 80 lines to avoid catching article references var allText = lines.join("\n"); Zotero.debug(allText); - var m = Zotero.Utilities.cleanDOI(allText); + var m = Zotero.Utilities.cleanDOI(allText.slice(0,80)); if(m) { this._DOI = m[0]; } - var isbn = this._findISBN(allText); - if(isbn) { - this._ISBN = isbn; - Zotero.debug("Found ISBN: " + isbn); + var isbns = this._findISBNs(allText); + if(isbns.length > 0) { + this._ISBNs = isbns; + Zotero.debug("Found ISBNs: " + isbns); } // Use only first column from multi-column lines @@ -355,17 +355,67 @@ Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, libraryID, c } } -Zotero_RecognizePDF.Recognizer.prototype._findISBN = function(x) { - if(typeof(x) != "string") { - throw "findISBN: argument must be a string"; - } - var m = x.match(/SBN[0-9X \u2014\u2013\u2012-]+/); // \xE28093\xE28094 - if(m) { - var isbn = m[0].replace(/SBN/,''); - return isbn.replace(/[ \u2014\u2013\u2012-]/g, ''); - } - else - return null; +/** + * Search ISBNs in text + * @private + * @return array with ISBNs + */ +Zotero_RecognizePDF.Recognizer.prototype._findISBNs = function(x) { + if(typeof(x) != "string") { + throw "findISBNs: argument must be a string"; + } + var isbns = []; + + // Match lines saying "isbn: " or "ISBN-10:" or similar, consider m-dashes and n-dashes as well + var pattern = /(SBN|sbn)[ \u2014\u2013\u2012-]?(10|13)?[: ]*([0-9X][0-9X \u2014\u2013\u2012-]+)/g; + var match; + + while (match = pattern.exec(x)) { + Zotero.debug("isbn0: " + match); + var isbn = match[3]; + isbn = isbn.replace(/[ \u2014\u2013\u2012-]/g, ''); + if(isbn.length==20 || isbn.length==26) { + // Handle the case of two isbns (e.g. paper+hardback) next to each other + isbns.push(isbn.slice(0,isbn.length/2), isbn.slice(isbn.length/2)); + } else if(isbn.length==23) { + // Handle the case of two isbns (10+13) next to each other + isbns.push(isbn.slice(0,10), isbn.slice(10)); + } else if(isbn.length==10 || isbn.length==13) { + isbns.push(isbn); + } + } + + // Validate ISBNs + var validIsbns = []; + for (var i =0; i < isbns.length; i++) { + if(this._isValidISBN(isbns[i])) validIsbns.push(isbns[i]); + } + Zotero.debug("validIsbns: " + validIsbns); + return validIsbns; +} + +Zotero_RecognizePDF.Recognizer.prototype._isValidISBN = function(isbn) { + if(isbn.length == 13) { + // ISBN-13 should start with 978 or 979 i.e. GS1 for book publishing industry + var prefix = isbn.slice(0,3); + if (prefix != "978" && prefix != "979") return false; + // Verify check digit + var check = 0; + for (var i = 0; i < 13; i+=2) check += isbn[i]*1; + for (i = 1; i < 12; i+=2) check += 3 * isbn[i]*1; + return (check % 10 == 0); + } else if(isbn.length == 10) { + // Verify ISBN-10 check digit + var check = 0; + for (var i = 0; i < 9; i++) check += isbn[i]*1 * (10-i); + // last number might be 'X' + if (isbn[9] == 'X' || isbn[9] == 'x') check += 10; + else check += isbn[i]*1; + Zotero.debug("ISBN-10 check digit " + (check % 11)); + return (check % 11 == 0); + } else { + return false; + } } /** @@ -384,11 +434,20 @@ Zotero_RecognizePDF.Recognizer.prototype._queryGoogle = function() { var queryString = ""; var me = this; - if(this._DOI) { - // use CrossRef to look for DOI + if(this._DOI || this._ISBNs) { var translate = new Zotero.Translate.Search(); - translate.setTranslator("11645bd1-0420-45c1-badb-53fb41eeb753"); - var item = {"itemType":"journalArticle", "DOI":this._DOI}; + var item = {}; + if(this._DOI) { + // use CrossRef to look for DOI + translate.setTranslator("11645bd1-0420-45c1-badb-53fb41eeb753"); + item = {"itemType":"journalArticle", "DOI":this._DOI}; + + } + if(this._ISBNs) { + // use Open WorldCat to look for ISBN + translate.setTranslator("c73a4a8c-3ef1-4ec8-8229-7531ee384cc4"); + item = {"itemType":"book", "ISBN":this._ISBNs[0]}; + } translate.setSearch(item); translate.setHandler("itemDone", function(translate, item) { me._callback(item); @@ -400,24 +459,8 @@ Zotero_RecognizePDF.Recognizer.prototype._queryGoogle = function() { if(!success) me._queryGoogle(); }); translate.translate(this._libraryID, false); - delete this._DOI; - } else if(this._ISBN) { - // look for ISBN - var translate = new Zotero.Translate("search"); //.Search(); - translate.setTranslator("c73a4a8c-3ef1-4ec8-8229-7531ee384cc4"); //Open WorldCat - var item = {"itemType":"book", "ISBN":this._ISBN}; - translate.setSearch(item); - translate.setHandler("itemDone", function(translate, item) { - me._callback(item); - }); - translate.setHandler("select", function(translate, items, callback) { - return me._selectItems(translate, items, callback); - }); - translate.setHandler("done", function(translate, success) { - if(!success) me._queryGoogle(); - }); - translate.translate(this._libraryID, false); - delete this._ISBN; + if(this._DOI) delete this._DOI; + if(this._ISBNs) delete this.ISBNs; } else { // take the relevant parts of some lines (exclude hyphenated word) var queryStringWords = 0;