Require at least one last name + first initial match for title matching in duplicate detection
This commit is contained in:
parent
01ea59491a
commit
c22c87a053
|
@ -101,6 +101,8 @@ Zotero.Duplicates.prototype._getObjectFromID = function (id) {
|
||||||
|
|
||||||
|
|
||||||
Zotero.Duplicates.prototype._findDuplicates = function () {
|
Zotero.Duplicates.prototype._findDuplicates = function () {
|
||||||
|
var start = Date.now();
|
||||||
|
|
||||||
var self = this;
|
var self = this;
|
||||||
|
|
||||||
this._sets = new Zotero.DisjointSetForest;
|
this._sets = new Zotero.DisjointSetForest;
|
||||||
|
@ -110,6 +112,10 @@ Zotero.Duplicates.prototype._findDuplicates = function () {
|
||||||
// Make sure we have a string and not an integer
|
// Make sure we have a string and not an integer
|
||||||
str = str + "";
|
str = str + "";
|
||||||
|
|
||||||
|
if (str === "") {
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
|
||||||
str = Zotero.Utilities.removeDiacritics(str)
|
str = Zotero.Utilities.removeDiacritics(str)
|
||||||
.replace(/[!-/:-@[-`{-~]/g, ' ') // Convert (ASCII) punctuation to spaces
|
.replace(/[!-/:-@[-`{-~]/g, ' ') // Convert (ASCII) punctuation to spaces
|
||||||
.replace(/ +/, ' ') // Normalize spaces
|
.replace(/ +/, ' ') // Normalize spaces
|
||||||
|
@ -120,14 +126,23 @@ Zotero.Duplicates.prototype._findDuplicates = function () {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param {Function} compareRows Comparison function, if not exact match
|
* @param {Function} compareRows Comparison function, if not exact match
|
||||||
|
* @param {Boolean} reprocessMatches Compare every row against every other,
|
||||||
|
* without skipping ahead to the last match.
|
||||||
|
* This is necessary for multi-dimensional
|
||||||
|
* matches such as title + at least one creator.
|
||||||
|
* Without it, only one set of matches would be
|
||||||
|
* found per matching title, since items with
|
||||||
|
* different creators wouldn't match the first
|
||||||
|
* set and the next start row would be a
|
||||||
|
* different title.
|
||||||
*/
|
*/
|
||||||
function processRows(compareRows) {
|
function processRows(compareRows, reprocessMatches) {
|
||||||
if (!rows) {
|
if (!rows) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (var i = 0, len = rows.length; i < len; i++) {
|
for (var i = 0, len = rows.length; i < len; i++) {
|
||||||
var j = i + 1, lastMatch = false, added = false;
|
var j = i + 1, lastMatch = false;
|
||||||
while (j < len) {
|
while (j < len) {
|
||||||
if (compareRows) {
|
if (compareRows) {
|
||||||
var match = compareRows(rows[i], rows[j]);
|
var match = compareRows(rows[i], rows[j]);
|
||||||
|
@ -143,7 +158,7 @@ Zotero.Duplicates.prototype._findDuplicates = function () {
|
||||||
}
|
}
|
||||||
// If no comparison function, check for exact match
|
// If no comparison function, check for exact match
|
||||||
else {
|
else {
|
||||||
if (rows[i].value != rows[j].value) {
|
if (rows[i].value !== rows[j].value) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -156,31 +171,12 @@ Zotero.Duplicates.prototype._findDuplicates = function () {
|
||||||
lastMatch = j;
|
lastMatch = j;
|
||||||
j++;
|
j++;
|
||||||
}
|
}
|
||||||
if (lastMatch) {
|
if (!reprocessMatches && lastMatch) {
|
||||||
i = lastMatch;
|
i = lastMatch;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Match on normalized title
|
|
||||||
var sql = "SELECT itemID, value FROM items JOIN itemData USING (itemID) "
|
|
||||||
+ "JOIN itemDataValues USING (valueID) "
|
|
||||||
+ "WHERE libraryID=? AND fieldID BETWEEN 110 AND 113 "
|
|
||||||
+ "AND itemTypeID NOT IN (1, 14) "
|
|
||||||
+ "AND itemID NOT IN (SELECT itemID FROM deletedItems) "
|
|
||||||
+ "ORDER BY value COLLATE locale";
|
|
||||||
var rows = Zotero.DB.query(sql, [this._libraryID]);
|
|
||||||
processRows(function (a, b) {
|
|
||||||
a = normalizeString(a.value);
|
|
||||||
b = normalizeString(b.value);
|
|
||||||
|
|
||||||
// If we stripped one of the strings completely, we can't compare them
|
|
||||||
if (a.length == 0 || b.length == 0) {
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
return a == b ? 1 : -1;
|
|
||||||
});
|
|
||||||
|
|
||||||
// Match books by ISBN
|
// Match books by ISBN
|
||||||
var sql = "SELECT itemID, value FROM items JOIN itemData USING (itemID) "
|
var sql = "SELECT itemID, value FROM items JOIN itemData USING (itemID) "
|
||||||
+ "JOIN itemDataValues USING (valueID) "
|
+ "JOIN itemDataValues USING (valueID) "
|
||||||
|
@ -206,6 +202,81 @@ Zotero.Duplicates.prototype._findDuplicates = function () {
|
||||||
var rows = Zotero.DB.query(sql, [this._libraryID, Zotero.ItemFields.getID('DOI')]);
|
var rows = Zotero.DB.query(sql, [this._libraryID, Zotero.ItemFields.getID('DOI')]);
|
||||||
processRows();
|
processRows();
|
||||||
|
|
||||||
|
// Creator rows already fetched during this run, keyed by itemID, so that each
// item's creators are queried at most once even though every row is compared
// against many others (see reprocessMatches below).
var creatorRowsCache = {};

/**
 * Return the first ten creators (by orderIndex) of an item, querying the
 * database only the first time a given item is seen.
 *
 * @param {Number} itemID
 * @return {Object[]|Boolean} Rows with lastName, firstName and fieldMode, or
 *                            whatever Zotero.DB.query() yields for an empty
 *                            result (callers accept both a falsy value and
 *                            an empty array)
 */
function getCreatorRows(itemID) {
	if (!(itemID in creatorRowsCache)) {
		var sql = "SELECT lastName, firstName, fieldMode FROM itemCreators "
			+ "JOIN creators USING (creatorID) "
			+ "JOIN creatorData USING (creatorDataID) "
			+ "WHERE itemID=? ORDER BY orderIndex LIMIT 10";
		creatorRowsCache[itemID] = Zotero.DB.query(sql, itemID);
	}
	return creatorRowsCache[itemID];
}

/**
 * Reduce creator rows to the normalized values used for comparison.
 *
 * firstInitial is the first character of the normalized first name for
 * creators with fieldMode 0, and false for any other fieldMode, so that a
 * creator without a separate first name only matches another such creator.
 *
 * @param {Object[]} creatorRows
 * @return {Object[]} Objects with lastName and firstInitial properties
 */
function getCreatorKeys(creatorRows) {
	var keys = [];
	for (var i = 0; i < creatorRows.length; i++) {
		var row = creatorRows[i];
		keys.push({
			lastName: normalizeString(row.lastName),
			firstInitial: row.fieldMode == 0
				// charAt(0) is the initial; the previous substr(1) returned
				// everything *after* the initial instead
				? normalizeString(row.firstName || "").charAt(0)
				: false
		});
	}
	return keys;
}

// Match on normalized title
// NOTE(review): fieldIDs 110-113 are presumably the title fields, and item types
// 1 and 14 presumably notes and attachments -- confirm against the schema
var sql = "SELECT itemID, value FROM items JOIN itemData USING (itemID) "
	+ "JOIN itemDataValues USING (valueID) "
	+ "WHERE libraryID=? AND fieldID BETWEEN 110 AND 113 "
	+ "AND itemTypeID NOT IN (1, 14) "
	+ "AND itemID NOT IN (SELECT itemID FROM deletedItems) "
	+ "ORDER BY value COLLATE locale";
var rows = Zotero.DB.query(sql, [this._libraryID]);

// Return values are the same as before: 1 = duplicate, 0 = same title but no
// shared creator, -1 = titles unusable or different.
// NOTE(review): how processRows() acts on 0 versus -1 is not visible in this
// part of the file -- confirm before changing any return value.
processRows(function (a, b) {
	var aTitle = normalizeString(a.value);
	var bTitle = normalizeString(b.value);
	
	// If we stripped one of the strings completely, we can't compare them
	if (aTitle.length == 0 || bTitle.length == 0) {
		return -1;
	}
	
	if (aTitle !== bTitle) {
		return -1;
	}
	
	// Check for at least one match on last name + first initial of first name
	var aCreatorRows = getCreatorRows(a.itemID);
	var bCreatorRows = getCreatorRows(b.itemID);
	
	// Treat a falsy result and an empty array alike for both items
	var aHasCreators = !!(aCreatorRows && aCreatorRows.length);
	var bHasCreators = !!(bCreatorRows && bCreatorRows.length);
	
	// Match if no creators
	if (!aHasCreators && !bHasCreators) {
		return 1;
	}
	
	// Only one of the two items has creators
	if (!aHasCreators || !bHasCreators) {
		return 0;
	}
	
	var aKeys = getCreatorKeys(aCreatorRows);
	var bKeys = getCreatorKeys(bCreatorRows);
	
	for (var i = 0; i < aKeys.length; i++) {
		for (var j = 0; j < bKeys.length; j++) {
			if (aKeys[i].lastName === bKeys[j].lastName
					&& aKeys[i].firstInitial === bKeys[j].firstInitial) {
				return 1;
			}
		}
	}
	
	return 0;
}, true); // reprocessMatches: compare every row against every other row
|
||||||
|
|
||||||
// Match on exact fields
|
// Match on exact fields
|
||||||
/*var fields = [''];
|
/*var fields = [''];
|
||||||
for each(var field in fields) {
|
for each(var field in fields) {
|
||||||
|
@ -217,6 +288,8 @@ Zotero.Duplicates.prototype._findDuplicates = function () {
|
||||||
var rows = Zotero.DB.query(sql, [this._libraryID, Zotero.ItemFields.getID(field)]);
|
var rows = Zotero.DB.query(sql, [this._libraryID, Zotero.ItemFields.getID(field)]);
|
||||||
processRows();
|
processRows();
|
||||||
}*/
|
}*/
|
||||||
|
|
||||||
|
Zotero.debug("Found duplicates in " + (Date.now() - start) + " ms");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user