From f7d9c4d882e7bba76f13f4e1579c8393929ab94c Mon Sep 17 00:00:00 2001 From: Brian Birtles Date: Wed, 10 Aug 2022 10:40:28 +0900 Subject: [PATCH] fix: reflect source kana usage when displaying matches MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes #978. Specifically when looking up words like バカ売れ and ガタガタ we would prioritize the hiragana version, sometimes even hiding the katakana version altogether. This is because, when doing a search on the IDB database, we first normalize the text to hiragana. Now, the IDB database itself is able to look up kana variants but I believe we were doing the hiragana conversion separately to that so we can do deinflection using normalized hiragana. This patch moves the normalization to the point where we do deinflection so that we preserve the source kana form and jpdict-idb can use that to more accurately set the match labels on the different headwords. As a result, cases like サボった now correctly show サボる as the match while cases that use katakana in the deinflected part should still work. --- CHANGELOG.md | 2 + src/background/deinflect.test.ts | 20 ++++++ src/background/deinflect.ts | 113 +++++++++++++++++-------------- src/background/jpdict.ts | 4 +- src/background/word-search.ts | 4 +- src/utils/normalize-input.ts | 6 +- 6 files changed, 89 insertions(+), 60 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9c29785da9..a1e5f96f24 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,8 @@ app. ## [Unreleased] +- Prioritize katakana headwords when matching on katakana + ([#978](https://github.com/birchill/10ten-ja-reader/issues/978)). - Fixed display of currency conversion etc. when there are only name results ([#970](https://github.com/birchill/10ten-ja-reader/issues/970)). diff --git a/src/background/deinflect.test.ts b/src/background/deinflect.test.ts index 72ad292460..c6fc22a87d 100644 --- a/src/background/deinflect.test.ts +++ b/src/background/deinflect.test.ts @@ -23,6 +23,26 @@ describe('deinflect', () => { }); }); + it('deinflects kana variations', () => { + const cases = [ + ['走ります', '走る', [[DeinflectReason.Polite]], 2], + ['走りまス', '走る', [[DeinflectReason.Polite]], 2], + ['走りマス', '走る', [[DeinflectReason.Polite]], 2], + ['走リマス', '走る', [[DeinflectReason.Polite]], 2], + ['走リマす', '走る', [[DeinflectReason.Polite]], 2], + ['走った', '走る', [[DeinflectReason.Past]], 2], + ['走っタ', '走る', [[DeinflectReason.Past]], 2], + ['走ッタ', '走る', [[DeinflectReason.Past]], 2], + ['走ッた', '走る', [[DeinflectReason.Past]], 2], + ]; + + for (const [inflected, plain, reasons, type] of cases) { + const result = deinflect(inflected as string); + const match = result.find((candidate) => candidate.word == plain); + expect(match).toMatchObject({ reasons, type, word: plain }); + } + }); + it('deinflects -masu stem forms', () => { const result = deinflect('食べ'); const match = result.find((candidate) => candidate.word === '食べる'); diff --git a/src/background/deinflect.ts b/src/background/deinflect.ts index 5b3b7d4ac3..1030967638 100644 --- a/src/background/deinflect.ts +++ b/src/background/deinflect.ts @@ -1,3 +1,5 @@ +import { kanaToHiragana } from '@birchill/normal-jp'; + export const enum DeinflectReason { PolitePastNegative, PoliteNegative, @@ -718,63 +720,72 @@ export function deinflect(word: string): CandidateWord[] { const type = thisCandidate.type; for (const ruleGroup of ruleGroups) { - if (ruleGroup.fromLen <= word.length) { - const ending = word.substr(-ruleGroup.fromLen); + if (ruleGroup.fromLen > word.length) { + continue; + } + + const ending = word.slice(-ruleGroup.fromLen); + const hiraganaEnding = kanaToHiragana(ending); - for (const rule of ruleGroup.rules) { - if (type & rule.type && ending === rule.from) { - const newWord = - word.substr(0, word.length - rule.from.length) + rule.to; - if (newWord.length <= 1) { - continue; - } + for (const rule of ruleGroup.rules) { + if (!(type & rule.type)) { + continue; + } - // If we already have a candidate for this word with the same - // to type(s), expand the possible reasons. - // - // If the to type(s) differ, then we'll add a separate candidate - // and just hope that when we go to match against dictionary words - // we'll filter out the mismatching one(s). - if (resultIndex[newWord]) { - const candidate = result[resultIndex[newWord]]; - if (candidate.type === rule.type >> 8) { - candidate.reasons.unshift([rule.reason]); - continue; - } - } - resultIndex[newWord] = result.length; + if (ending !== rule.from && hiraganaEnding !== rule.from) { + continue; + } - // Deep clone multidimensional array - const reasons = []; - for (const array of thisCandidate.reasons) { - reasons.push(Array.from(array)); - } - if (reasons.length) { - const firstReason = reasons[0]; - // This is a bit hacky but the alternative is to add the - // full-form causative passive inflections to the deinflection - // dictionary and then try to merge the results. - if ( - rule.reason === DeinflectReason.Causative && - firstReason.length && - firstReason[0] === DeinflectReason.PotentialOrPassive - ) { - firstReason.splice(0, 1, DeinflectReason.CausativePassive); - } else { - firstReason.unshift(rule.reason); - } - } else { - reasons.push([rule.reason]); - } - const candidate: CandidateWord = { - reasons, - type: rule.type >> 8, - word: newWord, - }; + const newWord = + word.substring(0, word.length - rule.from.length) + rule.to; + if (newWord.length <= 1) { + continue; + } - result.push(candidate); + // If we already have a candidate for this word with the same + // to type(s), expand the possible reasons. + // + // If the to type(s) differ, then we'll add a separate candidate + // and just hope that when we go to match against dictionary words + // we'll filter out the mismatching one(s). + if (resultIndex[newWord]) { + const candidate = result[resultIndex[newWord]]; + if (candidate.type === rule.type >> 8) { + candidate.reasons.unshift([rule.reason]); + continue; } } + resultIndex[newWord] = result.length; + + // Deep clone multidimensional array + const reasons = []; + for (const array of thisCandidate.reasons) { + reasons.push(Array.from(array)); + } + if (reasons.length) { + const firstReason = reasons[0]; + // This is a bit hacky but the alternative is to add the + // full-form causative passive inflections to the deinflection + // dictionary and then try to merge the results. + if ( + rule.reason === DeinflectReason.Causative && + firstReason.length && + firstReason[0] === DeinflectReason.PotentialOrPassive + ) { + firstReason.splice(0, 1, DeinflectReason.CausativePassive); + } else { + firstReason.unshift(rule.reason); + } + } else { + reasons.push([rule.reason]); + } + const candidate: CandidateWord = { + reasons, + type: rule.type >> 8, + word: newWord, + }; + + result.push(candidate); } } } while (++i < result.length); diff --git a/src/background/jpdict.ts b/src/background/jpdict.ts index 788f6bd09b..79c7df1847 100644 --- a/src/background/jpdict.ts +++ b/src/background/jpdict.ts @@ -8,7 +8,6 @@ import { UpdateErrorState, UpdateState, } from '@birchill/jpdict-idb'; -import { kanaToHiragana } from '@birchill/normal-jp'; import { browser } from 'webextension-polyfill-ts'; import { ExtensionStorageError } from '../common/extension-storage-error'; @@ -329,8 +328,7 @@ export async function searchWords({ dbStatus: 'updating' | 'unavailable' | undefined ] > { - let [word, inputLengths] = normalizeInput(input); - word = kanaToHiragana(word); + const [word, inputLengths] = normalizeInput(input); const maxResults = max > 0 ? Math.min(WORDS_MAX_ENTRIES, max) : WORDS_MAX_ENTRIES; diff --git a/src/background/word-search.ts b/src/background/word-search.ts index 0d4a41b855..c76a243399 100644 --- a/src/background/word-search.ts +++ b/src/background/word-search.ts @@ -93,7 +93,7 @@ export async function wordSearch({ continue; } - // Now that we have filtered our set of matches to those we plan to keep + // Now that we have filtered our set of matches to those we plan to keep, // update our duplicates set. have = new Set([...have, ...wordResults.map((word) => word.id)]); @@ -123,7 +123,7 @@ export async function wordSearch({ // Shorten input, but don't split a ようおん (e.g. きゃ). const lengthToShorten = endsInYoon(input) ? 2 : 1; - input = input.substr(0, input.length - lengthToShorten); + input = input.substring(0, input.length - lengthToShorten); } if (!result.data.length) { diff --git a/src/utils/normalize-input.ts b/src/utils/normalize-input.ts index 55ad1a6f46..3ceac7be52 100644 --- a/src/utils/normalize-input.ts +++ b/src/utils/normalize-input.ts @@ -10,9 +10,7 @@ export function normalizeInput(input: string): [string, number[]] { // Convert to full-width, normalize decomposed characters, expand combined // characters etc. const fullWidthInput = halfToFullWidthNum(input); - let [normalized, inputLengths] = toNormalized( - halfToFullWidthNum(fullWidthInput) - ); + let [normalized, inputLengths] = toNormalized(fullWidthInput); // Strip out any zero-width non-joiners (as Google Docs sometimes likes to // stick them between every single character). @@ -28,7 +26,7 @@ export function normalizeInput(input: string): [string, number[]] { while (inputLengths[outputIndex] < i) { outputIndex++; } - normalized = normalized.substr(0, outputIndex); + normalized = normalized.substring(0, outputIndex); inputLengths = inputLengths.slice(0, outputIndex ? outputIndex + 1 : 0); break; }