From c22c87a0535d6c14a44687e3b859a58042672541 Mon Sep 17 00:00:00 2001 From: Dan Stillman Date: Tue, 11 Oct 2011 23:48:20 +0000 Subject: [PATCH] Require at least one last name + first initial match for title matching in duplicate detection --- chrome/content/zotero/xpcom/duplicates.js | 119 +++++++++++++++++----- 1 file changed, 96 insertions(+), 23 deletions(-) diff --git a/chrome/content/zotero/xpcom/duplicates.js b/chrome/content/zotero/xpcom/duplicates.js index 6dbf2d3b3..74020e58e 100644 --- a/chrome/content/zotero/xpcom/duplicates.js +++ b/chrome/content/zotero/xpcom/duplicates.js @@ -101,6 +101,8 @@ Zotero.Duplicates.prototype._getObjectFromID = function (id) { Zotero.Duplicates.prototype._findDuplicates = function () { + var start = Date.now(); + var self = this; this._sets = new Zotero.DisjointSetForest; @@ -110,6 +112,10 @@ Zotero.Duplicates.prototype._findDuplicates = function () { // Make sure we have a string and not an integer str = str + ""; + if (str === "") { + return ""; + } + str = Zotero.Utilities.removeDiacritics(str) .replace(/[!-/:-@[-`{-~]/g, ' ') // Convert (ASCII) punctuation to spaces .replace(/ +/, ' ') // Normalize spaces @@ -120,14 +126,23 @@ Zotero.Duplicates.prototype._findDuplicates = function () { /** * @param {Function} compareRows Comparison function, if not exact match + * @param {Boolean} reprocessMatches Compare every row against every other, + * without skipping ahead to the last match. + * This is necessary for multi-dimensional + * matches such as title + at least one creator. + * Without it, only one set of matches would be + * found per matching title, since items with + * different creators wouldn't match the first + * set and the next start row would be a + * different title. */ - function processRows(compareRows) { + function processRows(compareRows, reprocessMatches) { if (!rows) { return; } for (var i = 0, len = rows.length; i < len; i++) { - var j = i + 1, lastMatch = false, added = false; + var j = i + 1, lastMatch = false; while (j < len) { if (compareRows) { var match = compareRows(rows[i], rows[j]); @@ -143,7 +158,7 @@ Zotero.Duplicates.prototype._findDuplicates = function () { } // If no comparison function, check for exact match else { - if (rows[i].value != rows[j].value) { + if (rows[i].value !== rows[j].value) { break; } } @@ -156,31 +171,12 @@ Zotero.Duplicates.prototype._findDuplicates = function () { lastMatch = j; j++; } - if (lastMatch) { + if (!reprocessMatches && lastMatch) { i = lastMatch; } } } - // Match on normalized title - var sql = "SELECT itemID, value FROM items JOIN itemData USING (itemID) " - + "JOIN itemDataValues USING (valueID) " - + "WHERE libraryID=? AND fieldID BETWEEN 110 AND 113 " - + "AND itemTypeID NOT IN (1, 14) " - + "AND itemID NOT IN (SELECT itemID FROM deletedItems) " - + "ORDER BY value COLLATE locale"; - var rows = Zotero.DB.query(sql, [this._libraryID]); - processRows(function (a, b) { - a = normalizeString(a.value); - b = normalizeString(b.value); - - // If we stripped one of the strings completely, we can't compare them - if (a.length == 0 || b.length == 0) { - return -1; - } - return a == b ? 1 : -1; - }); - // Match books by ISBN var sql = "SELECT itemID, value FROM items JOIN itemData USING (itemID) " + "JOIN itemDataValues USING (valueID) " @@ -206,6 +202,81 @@ Zotero.Duplicates.prototype._findDuplicates = function () { var rows = Zotero.DB.query(sql, [this._libraryID, Zotero.ItemFields.getID('DOI')]); processRows(); + var creatorRowsCache = {}; + + // Match on normalized title + var sql = "SELECT itemID, value FROM items JOIN itemData USING (itemID) " + + "JOIN itemDataValues USING (valueID) " + + "WHERE libraryID=? AND fieldID BETWEEN 110 AND 113 " + + "AND itemTypeID NOT IN (1, 14) " + + "AND itemID NOT IN (SELECT itemID FROM deletedItems) " + + "ORDER BY value COLLATE locale"; + var rows = Zotero.DB.query(sql, [this._libraryID]); + processRows(function (a, b) { + var aTitle = normalizeString(a.value); + var bTitle = normalizeString(b.value); + + // If we stripped one of the strings completely, we can't compare them + if (aTitle.length == 0 || bTitle.length == 0) { + return -1; + } + + if (aTitle !== bTitle) { + return -1; + } + + // Check for at least one match on last name + first initial of first name + if (creatorRowsCache[a.itemID] != undefined) { + aCreatorRows = creatorRowsCache[a.itemID]; + } + else { + var sql = "SELECT lastName, firstName, fieldMode FROM itemCreators " + + "JOIN creators USING (creatorID) " + + "JOIN creatorData USING (creatorDataID) " + + "WHERE itemID=? ORDER BY orderIndex LIMIT 10"; + var aCreatorRows = Zotero.DB.query(sql, a.itemID); + creatorRowsCache[a.itemID] = aCreatorRows; + } + + // Check for at least one match on last name + first initial of first name + if (creatorRowsCache[b.itemID] != undefined) { + bCreatorRows = creatorRowsCache[b.itemID]; + } + else { + var sql = "SELECT lastName, firstName, fieldMode FROM itemCreators " + + "JOIN creators USING (creatorID) " + + "JOIN creatorData USING (creatorDataID) " + + "WHERE itemID=? ORDER BY orderIndex LIMIT 10"; + var bCreatorRows = Zotero.DB.query(sql, b.itemID); + creatorRowsCache[b.itemID] = bCreatorRows; + } + + // Match if no creators + if (!aCreatorRows && !bCreatorRows.length) { + return 1; + } + + if (!aCreatorRows || !bCreatorRows) { + return 0; + } + + for each(var aCreatorRow in aCreatorRows) { + var aLastName = normalizeString(aCreatorRow.lastName); + var aFirstInitial = aCreatorRow.fieldMode == 0 ? normalizeString(aCreatorRow.firstName.substr(1)) : false; + + for each(var bCreatorRow in bCreatorRows) { + var bLastName = normalizeString(bCreatorRow.lastName); + var bFirstInitial = bCreatorRow.fieldMode == 0 ? normalizeString(bCreatorRow.firstName.substr(1)) : false; + + if (aLastName === bLastName && aFirstInitial === bFirstInitial) { + return 1; + } + } + } + + return 0; + }, true); + // Match on exact fields /*var fields = ['']; for each(var field in fields) { @@ -217,6 +288,8 @@ Zotero.Duplicates.prototype._findDuplicates = function () { var rows = Zotero.DB.query(sql, [this._libraryID, Zotero.ItemFields.getID(field)]); processRows(); }*/ + + Zotero.debug("Found duplicates in " + (Date.now() - start) + " ms"); }