fix: reflect source kana usage when displaying matches

Fixes #978. Specifically, when looking up words like バカ売れ and ガタガタ, we would prioritize the hiragana version, sometimes even hiding the katakana version altogether. This is because, when doing a search on the IDB database, we first normalize the text to hiragana. Now, the IDB database itself is able to look up kana variants, but I believe we were doing the hiragana conversion separately from that so that we could do deinflection using normalized hiragana. This patch moves the normalization to the point where we do deinflection so that we preserve the source kana form, and jpdict-idb can use that to more accurately set the match labels on the different headwords. As a result, cases like サボった now correctly show サボる as the match, while cases that use katakana in the deinflected part should still work.
birchill · Aug 10, 2022 · f7d9c4d · f7d9c4d
1 parent 6ba0597
commit f7d9c4d
Show file tree

Hide file tree

Showing 6 changed files with 89 additions and 60 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,8 @@ app.
 
 ## [Unreleased]
 
+- Prioritize katakana headwords when matching on katakana
+  ([#978](https://github.com/birchill/10ten-ja-reader/issues/978)).
 - Fixed display of currency conversion etc. when there are only name results
   ([#970](https://github.com/birchill/10ten-ja-reader/issues/970)).
 

diff --git a/src/background/deinflect.test.ts b/src/background/deinflect.test.ts
@@ -23,6 +23,26 @@ describe('deinflect', () => {
     });
   });
 
+  it('deinflects kana variations', () => {
+    const cases = [
+      ['走ります', '走る', [[DeinflectReason.Polite]], 2],
+      ['走りまス', '走る', [[DeinflectReason.Polite]], 2],
+      ['走りマス', '走る', [[DeinflectReason.Polite]], 2],
+      ['走リマス', '走る', [[DeinflectReason.Polite]], 2],
+      ['走リマす', '走る', [[DeinflectReason.Polite]], 2],
+      ['走った', '走る', [[DeinflectReason.Past]], 2],
+      ['走っタ', '走る', [[DeinflectReason.Past]], 2],
+      ['走ッタ', '走る', [[DeinflectReason.Past]], 2],
+      ['走ッた', '走る', [[DeinflectReason.Past]], 2],
+    ];
+
+    for (const [inflected, plain, reasons, type] of cases) {
+      const result = deinflect(inflected as string);
+      const match = result.find((candidate) => candidate.word == plain);
+      expect(match).toMatchObject({ reasons, type, word: plain });
+    }
+  });
+
   it('deinflects -masu stem forms', () => {
     const result = deinflect('食べ');
     const match = result.find((candidate) => candidate.word === '食べる');

diff --git a/src/background/deinflect.ts b/src/background/deinflect.ts
@@ -1,3 +1,5 @@
+import { kanaToHiragana } from '@birchill/normal-jp';
+
 export const enum DeinflectReason {
   PolitePastNegative,
   PoliteNegative,
@@ -718,63 +720,72 @@ export function deinflect(word: string): CandidateWord[] {
     const type = thisCandidate.type;
 
     for (const ruleGroup of ruleGroups) {
-      if (ruleGroup.fromLen <= word.length) {
-        const ending = word.substr(-ruleGroup.fromLen);
+      if (ruleGroup.fromLen > word.length) {
+        continue;
+      }
+
+      const ending = word.slice(-ruleGroup.fromLen);
+      const hiraganaEnding = kanaToHiragana(ending);
 
-        for (const rule of ruleGroup.rules) {
-          if (type & rule.type && ending === rule.from) {
-            const newWord =
-              word.substr(0, word.length - rule.from.length) + rule.to;
-            if (newWord.length <= 1) {
-              continue;
-            }
+      for (const rule of ruleGroup.rules) {
+        if (!(type & rule.type)) {
+          continue;
+        }
 
-            // If we already have a candidate for this word with the same
-            // to type(s), expand the possible reasons.
-            //
-            // If the to type(s) differ, then we'll add a separate candidate
-            // and just hope that when we go to match against dictionary words
-            // we'll filter out the mismatching one(s).
-            if (resultIndex[newWord]) {
-              const candidate = result[resultIndex[newWord]];
-              if (candidate.type === rule.type >> 8) {
-                candidate.reasons.unshift([rule.reason]);
-                continue;
-              }
-            }
-            resultIndex[newWord] = result.length;
+        if (ending !== rule.from && hiraganaEnding !== rule.from) {
+          continue;
+        }
 
-            // Deep clone multidimensional array
-            const reasons = [];
-            for (const array of thisCandidate.reasons) {
-              reasons.push(Array.from(array));
-            }
-            if (reasons.length) {
-              const firstReason = reasons[0];
-              // This is a bit hacky but the alternative is to add the
-              // full-form causative passive inflections to the deinflection
-              // dictionary and then try to merge the results.
-              if (
-                rule.reason === DeinflectReason.Causative &&
-                firstReason.length &&
-                firstReason[0] === DeinflectReason.PotentialOrPassive
-              ) {
-                firstReason.splice(0, 1, DeinflectReason.CausativePassive);
-              } else {
-                firstReason.unshift(rule.reason);
-              }
-            } else {
-              reasons.push([rule.reason]);
-            }
-            const candidate: CandidateWord = {
-              reasons,
-              type: rule.type >> 8,
-              word: newWord,
-            };
+        const newWord =
+          word.substring(0, word.length - rule.from.length) + rule.to;
+        if (newWord.length <= 1) {
+          continue;
+        }
 
-            result.push(candidate);
+        // If we already have a candidate for this word with the same
+        // to type(s), expand the possible reasons.
+        //
+        // If the to type(s) differ, then we'll add a separate candidate
+        // and just hope that when we go to match against dictionary words
+        // we'll filter out the mismatching one(s).
+        if (resultIndex[newWord]) {
+          const candidate = result[resultIndex[newWord]];
+          if (candidate.type === rule.type >> 8) {
+            candidate.reasons.unshift([rule.reason]);
+            continue;
           }
         }
+        resultIndex[newWord] = result.length;
+
+        // Deep clone multidimensional array
+        const reasons = [];
+        for (const array of thisCandidate.reasons) {
+          reasons.push(Array.from(array));
+        }
+        if (reasons.length) {
+          const firstReason = reasons[0];
+          // This is a bit hacky but the alternative is to add the
+          // full-form causative passive inflections to the deinflection
+          // dictionary and then try to merge the results.
+          if (
+            rule.reason === DeinflectReason.Causative &&
+            firstReason.length &&
+            firstReason[0] === DeinflectReason.PotentialOrPassive
+          ) {
+            firstReason.splice(0, 1, DeinflectReason.CausativePassive);
+          } else {
+            firstReason.unshift(rule.reason);
+          }
+        } else {
+          reasons.push([rule.reason]);
+        }
+        const candidate: CandidateWord = {
+          reasons,
+          type: rule.type >> 8,
+          word: newWord,
+        };
+
+        result.push(candidate);
       }
     }
   } while (++i < result.length);

diff --git a/src/background/jpdict.ts b/src/background/jpdict.ts
@@ -8,7 +8,6 @@ import {
   UpdateErrorState,
   UpdateState,
 } from '@birchill/jpdict-idb';
-import { kanaToHiragana } from '@birchill/normal-jp';
 import { browser } from 'webextension-polyfill-ts';
 
 import { ExtensionStorageError } from '../common/extension-storage-error';
@@ -329,8 +328,7 @@ export async function searchWords({
     dbStatus: 'updating' | 'unavailable' | undefined
   ]
 > {
-  let [word, inputLengths] = normalizeInput(input);
-  word = kanaToHiragana(word);
+  const [word, inputLengths] = normalizeInput(input);
 
   const maxResults =
     max > 0 ? Math.min(WORDS_MAX_ENTRIES, max) : WORDS_MAX_ENTRIES;

diff --git a/src/background/word-search.ts b/src/background/word-search.ts
@@ -93,7 +93,7 @@ export async function wordSearch({
         continue;
       }
 
-      // Now that we have filtered our set of matches to those we plan to keep
+      // Now that we have filtered our set of matches to those we plan to keep,
       // update our duplicates set.
       have = new Set([...have, ...wordResults.map((word) => word.id)]);
 
@@ -123,7 +123,7 @@ export async function wordSearch({
 
     // Shorten input, but don't split a ようおん (e.g. きゃ).
     const lengthToShorten = endsInYoon(input) ? 2 : 1;
-    input = input.substr(0, input.length - lengthToShorten);
+    input = input.substring(0, input.length - lengthToShorten);
   }
 
   if (!result.data.length) {

diff --git a/src/utils/normalize-input.ts b/src/utils/normalize-input.ts
@@ -10,9 +10,7 @@ export function normalizeInput(input: string): [string, number[]] {
   // Convert to full-width, normalize decomposed characters, expand combined
   // characters etc.
   const fullWidthInput = halfToFullWidthNum(input);
-  let [normalized, inputLengths] = toNormalized(
-    halfToFullWidthNum(fullWidthInput)
-  );
+  let [normalized, inputLengths] = toNormalized(fullWidthInput);
 
   // Strip out any zero-width non-joiners (as Google Docs sometimes likes to
   // stick them between every single character).
@@ -28,7 +26,7 @@ export function normalizeInput(input: string): [string, number[]] {
       while (inputLengths[outputIndex] < i) {
         outputIndex++;
       }
-      normalized = normalized.substr(0, outputIndex);
+      normalized = normalized.substring(0, outputIndex);
       inputLengths = inputLengths.slice(0, outputIndex ? outputIndex + 1 : 0);
       break;
     }