Further improvements to the documentation search.

Revise the subword matching to have three different levels for full match (= a permutation of the input), a generic match for all parts, and everything else.
2012-06-14 13:48:05 -04:00 · 2012-06-14 13:48:05 -04:00 · 6b2b784e79
commit 6b2b784e79
parent ecc43ff104
1 changed files with 84 additions and 29 deletions
--- a/collects/scribblings/main/private/search.js
+++ b/collects/scribblings/main/private/search.js
@ -249,10 +249,14 @@ function AdjustResultsNum() {
 // Terms are compared using Compare(pat,str), which returns one of several
 // constants, in increasing order of success:
 // - C_fail: no match
-// - C_words: all of the "subwords" in pat matched, where a word is an
+// - C_words1: all of the "subwords" in pat matched, where a word is an
 //   alphanumeric or a punctuation substring (for example, "foo-bar!!" has
 //   these words: "foo", "-", "bar", "!!"), and a match means a prefix of a
 //   subword in str matched one of pat's subwords
+// - C_words2: same, but all of the subwords in str were matched (so both str
+//   and pat have the same number of subwords)
+// - C_words3: same, but all matches were complete (so str is a permutation of
+//   pat)
 // - C_match: pat matched somewhere in str
 // - C_prefix: pat is a prefix of str
 // - C_exact: pat is equal to str
@ -265,11 +269,13 @@ function AdjustResultsNum() {
 // module that is exactly `foo', but it will return C_exact which means that it
 // doesn't change whether the match is considered exact or not).
 var C_fail   = 0, C_min = 0,
-    C_words  = 1,
-    C_match  = 2,
-    C_prefix = 3,
-    C_exact  = 4,
-    C_rexact = 5, C_max = 5;
+    C_words1 = 1,
+    C_words2 = 2,
+    C_words3 = 3,
+    C_match  = 4,
+    C_prefix = 5,
+    C_exact  = 6,
+    C_rexact = 7, C_max = 7;

 function Compare(pat, str) {
  var i = str.indexOf(pat);
@ -278,14 +284,6 @@ function Compare(pat, str) {
  if (pat.length == str.length) return C_rexact;
  return C_prefix;
 }
-function CompareRx(pat, str) {
-  if (!(pat instanceof RegExp)) return Compare(pat,str);
-  var r = str.match(pat);
-  if (r == null || r.length == 0) return C_fail;
-  if (r[0] == str) return C_rexact;
-  if (str.indexOf(r[0]) == 0) return C_prefix;
-  return C_match;
-}
 function MaxCompares(pat, strs) {
  var r = C_min, c;
  for (var i=0; i<strs.length; i++) {
@ -294,14 +292,76 @@ function MaxCompares(pat, strs) {
  }
  return r;
 }
-function MinComparesRx(pats, str) {
-  var r = C_max, c;
-  for (var i=0; i<pats.length; i++) {
-    c = CompareRx(pats[i],str);
-    if (c < r) { if (c <= C_min) return c; else r = c; }
-  }
-  return r;
+
+// Utilities for dealing with "subword" searches
+function SortByLength(x, y) {
+  // use this to sort with descending length and otherwise lexicographically
+  var xlen = x.length, ylen = y.length;
+  if (xlen != ylen) return ylen - xlen;
+  else if (x < y) return -1;
+  else if (x > y) return +1;
+  else return 0;
 }
+function CompileWordCompare(term) {
+  var words = term.split(/\b/).sort(SortByLength);
+  var word_num = words.length;
+  var is_word = new Array(word_num);
+  for (var i=0; i<word_num; i++) is_word[i] = (words[i].search(/\w/) >= 0);
+  return function (str) {
+    var str_words = str.split(/\b/).sort(SortByLength);
+    var str_word_num = str_words.length;
+    if (str_word_num < word_num) return C_fail; // can't work
+    var r = C_words3;
+    for (var i=0; i<word_num; i++) {
+      var word = words[i];
+      var found = -1;
+      for (var j=0; j<str_word_num; j++) {
+        var str_word = str_words[j];
+        if (str_word != null) {
+          if (word == str_word) {
+            found = j;
+            break;
+          } else {
+            var k = str_word.indexOf(word);
+            if (k == 0 || (k > 0 && !is_word[i])) {
+              found = j;
+              r = C_words2;
+              break;
+            }
+          }
+        }
+      }
+      if (found >= 0) str_words[found] = null;
+      else return C_fail;
+    }
+    // either C_words3 for a complete permutation, or C_words2 when all of the
+    // subwords in str were matched
+    if (word_num == str_word_num) return r;
+    // otherwise return C_words1 regardless of whether all of the words in term
+    // matched exactly words in str
+    else return C_words1;
+  }
+}
+// Tests:
+// console.log("testing...");
+// function t(x,y,e) {
+//   var r = CompileWordCompare(x)(y);
+//   if (e != r) {
+//     console.log('CompileWordCompare("'+x+'")("'+y+'")'+
+//                 " => "+r+" but expected "+e);
+//   }
+// }
+// t("x-y","x-y",C_words3);
+// t("x-y","y-x",C_words3);
+// t("x-y","y--x",C_words2);
+// t("x-y","y=-x",C_words2);
+// t("xx-yy","x-y",C_fail);
+// t("xx-yy","xxx-yyy",C_words2);
+// t("x-y-x","xxx-yyy",C_fail);
+// t("x-y-x","xxx-yyy-xxx",C_words2);
+// t("x-y-x","yyy-xxx-xxx",C_words2);
+// t("x-y-xx","xx-y-xxx",C_words2);
+// console.log("done");

 function NormalizeSpaces(str) {
  // use single spaces first, then trim edge spaces
@ -371,18 +431,13 @@ function CompileTerm(term) {
    }
  /* a case for "Q" is not needed -- same as the default case below */
  default:
-    var words = term.split(/\b/);
-    for (var i=0; i<words.length; i++)
-      if (words[i].search(/^\w/) >= 0) words[i] = new RegExp("\\b"+words[i]);
-    // (note: seems like removing duplicates is not important since search
-    // strings will not have them, except for dashes in things like x-y-z)
+    var compare_words = CompileWordCompare(term);
    return function(x) {
      var r = Compare(term,x[0]);
      // only bindings can be used for rexact matches
      if (r >= C_rexact) return (x[3] ? r : C_exact);
-      if (r > C_words) return r;
-      if (MinComparesRx(words,x[0]) <= C_fail) return r;
-      return C_words;
+      if (r > C_words3) return r;
+      else return compare_words(x[0]);
    }
  }
 }